llvm.org GIT mirror llvm / 6a62eec
X86: Use push-pop for materializing 8-bit immediates for minsize (take 2). This is the same as r255936, with added logic for avoiding clobbering of the red zone (PR26023). Differential Revision: http://reviews.llvm.org/D18246 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@264375 91177308-0d34-0410-b5e6-96231b3b80d8 Hans Wennborg 4 years ago
10 changed file(s) with 310 addition(s) and 106 deletion(s). Raw diff Collapse all Expand all
957957 !MF.shouldSplitStack()) { // Regular stack
958958 uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
959959 if (HasFP) MinSize += SlotSize;
960 X86FI->setUsesRedZone(MinSize > 0 || StackSize > 0);
960961 StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
961962 MFI->setStackSize(StackSize);
962963 }
156156 /// performance.
157157 bool OptForSize;
158158
159 /// If true, selector should try to optimize for minimum code size.
160 bool OptForMinSize;
161
159162 public:
160163 explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
161 : SelectionDAGISel(tm, OptLevel), OptForSize(false) {}
164 : SelectionDAGISel(tm, OptLevel), OptForSize(false),
165 OptForMinSize(false) {}
162166
163167 const char *getPassName() const override {
164168 return "X86 DAG->DAG Instruction Selection";
529533 }
530534
531535 void X86DAGToDAGISel::PreprocessISelDAG() {
532 // OptForSize is used in pattern predicates that isel is matching.
536 // OptFor[Min]Size are used in pattern predicates that isel is matching.
533537 OptForSize = MF->getFunction()->optForSize();
538 OptForMinSize = MF->getFunction()->optForMinSize();
539 assert((!OptForMinSize || OptForSize) && "OptForMinSize implies OptForSize");
534540
535541 for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
536542 E = CurDAG->allnodes_end(); I != E; ) {
249249 // Alias instruction mapping movr0 to xor.
250250 // FIXME: remove when we can teach regalloc that xor reg, reg is ok.
251251 let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1,
252 isPseudo = 1 in
252 isPseudo = 1, AddedComplexity = 20 in
253253 def MOV32r0 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
254254 [(set GR32:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>;
255255
262262 }
263263
264264 let Predicates = [OptForSize, NotSlowIncDec, Not64BitMode],
265 AddedComplexity = 1 in {
265 AddedComplexity = 15 in {
266266 // Pseudo instructions for materializing 1 and -1 using XOR+INC/DEC,
267267 // which only require 3 bytes compared to MOV32ri which requires 5.
268268 let Defs = [EFLAGS], isReMaterializable = 1, isPseudo = 1 in {
275275 // MOV16ri is 4 bytes, so the instructions above are smaller.
276276 def : Pat<(i16 1), (EXTRACT_SUBREG (MOV32r1), sub_16bit)>;
277277 def : Pat<(i16 -1), (EXTRACT_SUBREG (MOV32r_1), sub_16bit)>;
278 }
279
280 let isReMaterializable = 1, isPseudo = 1, AddedComplexity = 10 in {
281 // AddedComplexity higher than MOV64ri but lower than MOV32r0 and MOV32r1.
282 // FIXME: Add itinerary class and Schedule.
283 def MOV32ImmSExti8 : I<0, Pseudo, (outs GR32:$dst), (ins i32i8imm:$src), "",
284 [(set GR32:$dst, i32immSExt8:$src)]>,
285 Requires<[OptForMinSize, NotWin64WithoutFP]>;
286 def MOV64ImmSExti8 : I<0, Pseudo, (outs GR64:$dst), (ins i64i8imm:$src), "",
287 [(set GR64:$dst, i64immSExt8:$src)]>,
288 Requires<[OptForMinSize, NotWin64WithoutFP]>;
278289 }
279290
280291 // Materialize i64 constant where top 32-bits are zero. This could theoretically
2222 #include "llvm/CodeGen/MachineDominators.h"
2323 #include "llvm/CodeGen/MachineFrameInfo.h"
2424 #include "llvm/CodeGen/MachineInstrBuilder.h"
25 #include "llvm/CodeGen/MachineModuleInfo.h"
2526 #include "llvm/CodeGen/MachineRegisterInfo.h"
2627 #include "llvm/CodeGen/StackMaps.h"
2728 #include "llvm/IR/DerivedTypes.h"
53905391 return true;
53915392 }
53925393
/// Expand a MOV32ImmSExti8 / MOV64ImmSExti8 pseudo into real instructions.
///
/// A push of the immediate is inserted before the pseudo, and the pseudo itself
/// is rewritten into the matching pop. In 64-bit mode, when the function's
/// X86MachineFunctionInfo reports red-zone use, the pseudo is instead rewritten
/// to MOV32ri / MOV64ri and nothing is pushed.
///
/// \param MIB builder wrapping the pseudo; operand 0 is the destination
///        register and operand 1 is the immediate, which must be non-zero.
/// \returns true on every path.
5394 bool X86InstrInfo::ExpandMOVImmSExti8(MachineInstrBuilder &MIB) const {
5395 MachineBasicBlock &MBB = *MIB->getParent();
5396 DebugLoc DL = MIB->getDebugLoc();
5397 int64_t Imm = MIB->getOperand(1).getImm();
5398 assert(Imm != 0 && "Using push/pop for 0 is not efficient.");
// I designates the pseudo itself; instructions built "at I" land before it.
5399 MachineBasicBlock::iterator I = MIB.getInstr();
5400
// Number of bytes the push moves the stack pointer by; set per mode below and
// used only for the CFI adjustments.
5401 int StackAdjustment;
5402
5403 if (Subtarget.is64Bit()) {
5404 assert(MIB->getOpcode() == X86::MOV64ImmSExti8 ||
5405 MIB->getOpcode() == X86::MOV32ImmSExti8);
5406
5407 // Can't use push/pop lowering if the function might write to the red zone.
// getInfo is a template on the function-info type; the type argument cannot
// be deduced and has to be spelled out.
5408 X86MachineFunctionInfo *X86FI =
5409 MBB.getParent()->getInfo<X86MachineFunctionInfo>();
5410 if (X86FI->getUsesRedZone()) {
// Fall back to a plain move-immediate of the same width as the pseudo.
5411 MIB->setDesc(get(MIB->getOpcode() == X86::MOV32ImmSExti8 ? X86::MOV32ri
5412 : X86::MOV64ri));
5413 return true;
5414 }
5415
5416 // 64-bit mode doesn't have 32-bit push/pop, so use 64-bit operations and
5417 // widen the register if necessary.
5418 StackAdjustment = 8;
5419 BuildMI(MBB, I, DL, get(X86::PUSH64i8)).addImm(Imm);
5420 MIB->setDesc(get(X86::POP64r));
5421 MIB->getOperand(0)
5422 .setReg(getX86SubSuperRegister(MIB->getOperand(0).getReg(), 64));
5423 } else {
5424 assert(MIB->getOpcode() == X86::MOV32ImmSExti8);
5425 StackAdjustment = 4;
5426 BuildMI(MBB, I, DL, get(X86::PUSH32i8)).addImm(Imm);
5427 MIB->setDesc(get(X86::POP32r));
5428 }
5429
5430 // Build CFI if necessary.
5431 MachineFunction &MF = *MBB.getParent();
5432 const X86FrameLowering *TFL = Subtarget.getFrameLowering();
5433 bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
5434 bool NeedsDwarfCFI =
5435 !IsWin64Prologue &&
5436 (MF.getMMI().hasDebugInfo() || MF.getFunction()->needsUnwindTableEntry());
// CFA adjustments are only emitted when the function has no frame pointer.
5437 bool EmitCFI = !TFL->hasFP(MF) && NeedsDwarfCFI;
5438 if (EmitCFI) {
// +StackAdjustment is built at the pop (I), -StackAdjustment at the
// position following it, bracketing the pop with matching adjustments.
5439 TFL->BuildCFI(MBB, I, DL,
5440 MCCFIInstruction::createAdjustCfaOffset(nullptr, StackAdjustment));
5441 TFL->BuildCFI(MBB, std::next(I), DL,
5442 MCCFIInstruction::createAdjustCfaOffset(nullptr, -StackAdjustment));
5443 }
5444
5445 return true;
5446 }
5447
53935448 // LoadStackGuard has so far only been implemented for 64-bit MachO. Different
53945449 // code sequence is needed for other targets.
53955450 static void expandLoadStackGuard(MachineInstrBuilder &MIB,
54225477 return expandMOV32r1(MIB, *this, /*MinusOne=*/ false);
54235478 case X86::MOV32r_1:
54245479 return expandMOV32r1(MIB, *this, /*MinusOne=*/ true);
5480 case X86::MOV32ImmSExti8:
5481 case X86::MOV64ImmSExti8:
5482 return ExpandMOVImmSExti8(MIB);
54255483 case X86::SETB_C8r:
54265484 return Expand2AddrUndef(MIB, get(X86::SBB8rr));
54275485 case X86::SETB_C16r:
2222 #include "X86GenInstrInfo.inc"
2323
2424 namespace llvm {
25 class MachineInstrBuilder;
2526 class X86RegisterInfo;
2627 class X86Subtarget;
2728
563564 /// operand and follow operands form a reference to the stack frame.
564565 bool isFrameOperand(const MachineInstr *MI, unsigned int Op,
565566 int &FrameIndex) const;
567
568 /// Expand the MOVImmSExti8 pseudo-instructions.
569 bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB) const;
566570 };
567571
568572 } // End llvm namespace
864864 AssemblerPredicate<"Mode32Bit", "32-bit mode">;
865865 def IsWin64 : Predicate<"Subtarget->isTargetWin64()">;
866866 def NotWin64 : Predicate<"!Subtarget->isTargetWin64()">;
867 def NotWin64WithoutFP : Predicate<"!Subtarget->isTargetWin64() ||"
868 "Subtarget->getFrameLowering()->hasFP(*MF)">;
867869 def IsPS4 : Predicate<"Subtarget->isTargetPS4()">;
868870 def NotPS4 : Predicate<"!Subtarget->isTargetPS4()">;
869871 def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">;
877879 def IsStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">;
878880 def IsNotPIC : Predicate<"TM.getRelocationModel() != Reloc::PIC_">;
879881 def OptForSize : Predicate<"OptForSize">;
882 def OptForMinSize : Predicate<"OptForMinSize">;
880883 def OptForSpeed : Predicate<"!OptForSize">;
881884 def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">;
882885 def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">;
9595 /// copies.
9696 bool IsSplitCSR = false;
9797
98 /// True if this function uses the red zone.
99 bool UsesRedZone = false;
100
98101 private:
99102 /// ForwardedMustTailRegParms - A list of virtual and physical registers
100103 /// that must be forwarded to every musttail call.
166169
167170 bool isSplitCSR() const { return IsSplitCSR; }
168171 void setIsSplitCSR(bool s) { IsSplitCSR = s; }
172
173 bool getUsesRedZone() const { return UsesRedZone; }
174 void setUsesRedZone(bool V) { UsesRedZone = V; }
169175 };
170176
171177 } // End llvm namespace
+0
-100
test/CodeGen/X86/materialize-one.ll less more
None ; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECK32
1 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECK64
2
3 define i32 @one32() optsize {
4 entry:
5 ret i32 1
6
7 ; CHECK32-LABEL: one32
8 ; CHECK32: xorl %eax, %eax
9 ; CHECK32-NEXT: incl %eax
10 ; CHECK32-NEXT: ret
11
12 ; FIXME: Figure out the best approach in 64-bit mode.
13 ; CHECK64-LABEL: one32
14 ; CHECK64: movl $1, %eax
15 ; CHECK64-NEXT: retq
16 }
17
18 define i32 @minus_one32() optsize {
19 entry:
20 ret i32 -1
21
22 ; CHECK32-LABEL: minus_one32
23 ; CHECK32: xorl %eax, %eax
24 ; CHECK32-NEXT: decl %eax
25 ; CHECK32-NEXT: ret
26 }
27
28 define i16 @one16() optsize {
29 entry:
30 ret i16 1
31
32 ; CHECK32-LABEL: one16
33 ; CHECK32: xorl %eax, %eax
34 ; CHECK32-NEXT: incl %eax
35 ; CHECK32-NEXT: retl
36 }
37
38 define i16 @minus_one16() optsize {
39 entry:
40 ret i16 -1
41
42 ; CHECK32-LABEL: minus_one16
43 ; CHECK32: xorl %eax, %eax
44 ; CHECK32-NEXT: decl %eax
45 ; CHECK32-NEXT: retl
46 }
47
48 define i32 @test_rematerialization() optsize {
49 entry:
50 ; Materialize -1 (thiscall forces it into %ecx).
51 tail call x86_thiscallcc void @f(i32 -1)
52
53 ; Clobber all registers except %esp, leaving nowhere to store the -1 besides
54 ; spilling it to the stack.
55 tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
56
57 ; -1 should be re-materialized here instead of getting spilled above.
58 ret i32 -1
59
60 ; CHECK32-LABEL: test_rematerialization
61 ; CHECK32: xorl %ecx, %ecx
62 ; CHECK32-NEXT: decl %ecx
63 ; CHECK32: calll
64 ; CHECK32: xorl %eax, %eax
65 ; CHECK32-NEXT: decl %eax
66 ; CHECK32-NOT: %eax
67 ; CHECK32: retl
68 }
69
70 define i32 @test_rematerialization2(i32 %x) optsize {
71 entry:
72 ; Materialize -1 (thiscall forces it into %ecx).
73 tail call x86_thiscallcc void @f(i32 -1)
74
75 ; Clobber all registers except %esp, leaving nowhere to store the -1 besides
76 ; spilling it to the stack.
77 tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
78
79 ; Define eflags.
80 %a = icmp ne i32 %x, 123
81 %b = zext i1 %a to i32
82 ; Cause -1 to be rematerialized right in front of the cmov, which needs eflags.
83 ; It must therefore not use the xor-dec lowering.
84 %c = select i1 %a, i32 %b, i32 -1
85 ret i32 %c
86
87 ; CHECK32-LABEL: test_rematerialization2
88 ; CHECK32: xorl %ecx, %ecx
89 ; CHECK32-NEXT: decl %ecx
90 ; CHECK32: calll
91 ; CHECK32: cmpl
92 ; CHECK32: setne
93 ; CHECK32-NOT: xorl
94 ; CHECK32: movl $-1
95 ; CHECK32: cmov
96 ; CHECK32: retl
97 }
98
99 declare x86_thiscallcc void @f(i32)
0 ; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECK32
1 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECK64
2 ; RUN: llc -mtriple=x86_64-pc-win32 -mattr=+cmov %s -o - | FileCheck %s --check-prefix=CHECKWIN64
3
4 define i32 @one32_nooptsize() {
5 entry:
6 ret i32 1
7
8 ; When not optimizing for size, use mov.
9 ; CHECK32-LABEL: one32_nooptsize:
10 ; CHECK32: movl $1, %eax
11 ; CHECK32-NEXT: retl
12 ; CHECK64-LABEL: one32_nooptsize:
13 ; CHECK64: movl $1, %eax
14 ; CHECK64-NEXT: retq
15 }
16
17 define i32 @one32() optsize {
18 entry:
19 ret i32 1
20
21 ; CHECK32-LABEL: one32:
22 ; CHECK32: xorl %eax, %eax
23 ; CHECK32-NEXT: incl %eax
24 ; CHECK32-NEXT: retl
25
26 ; FIXME: Figure out the best approach in 64-bit mode.
27 ; CHECK64-LABEL: one32:
28 ; CHECK64: movl $1, %eax
29 ; CHECK64-NEXT: retq
30 }
31
32 define i32 @one32_minsize() minsize {
33 entry:
34 ret i32 1
35
36 ; On 32-bit, xor-inc is preferred over push-pop.
37 ; CHECK32-LABEL: one32_minsize:
38 ; CHECK32: xorl %eax, %eax
39 ; CHECK32-NEXT: incl %eax
40 ; CHECK32-NEXT: retl
41
42 ; On 64-bit we don't do xor-inc yet, so push-pop it is. Note that we have to
43 ; pop into a 64-bit register even when we just need 32 bits.
44 ; CHECK64-LABEL: one32_minsize:
45 ; CHECK64: pushq $1
46 ; CHECK64: .cfi_adjust_cfa_offset 8
47 ; CHECK64: popq %rax
48 ; CHECK64: .cfi_adjust_cfa_offset -8
49 ; CHECK64-NEXT: retq
50
51 ; On Win64 we can't adjust the stack unless there's a frame pointer.
52 ; CHECKWIN64-LABEL: one32_minsize:
53 ; CHECKWIN64: movl $1, %eax
54 ; CHECKWIN64-NEXT: retq
55 }
56
57 define i32 @pr26023() minsize {
58 entry:
59 %x = alloca [120 x i8]
60 %0 = getelementptr inbounds [120 x i8], [120 x i8]* %x, i64 0, i64 0
61 call void asm sideeffect "", "imr,~{memory},~{dirflag},~{fpsr},~{flags}"(i8* %0)
62 %arrayidx = getelementptr inbounds [120 x i8], [120 x i8]* %x, i64 0, i64 119
63 store volatile i8 -2, i8* %arrayidx
64 call void asm sideeffect "", "r,~{dirflag},~{fpsr},~{flags}"(i32 5)
65 %1 = load volatile i8, i8* %arrayidx
66 %conv = sext i8 %1 to i32
67 ret i32 %conv
68
69 ; The function writes to the redzone, so push/pop cannot be used.
70 ; CHECK64-LABEL: pr26023:
71 ; CHECK64: movl $5, %ecx
72 ; CHECK64: retq
73
74 ; 32-bit X86 doesn't have a redzone.
75 ; CHECK32-LABEL: pr26023:
76 ; CHECK32: pushl $5
77 ; CHECK32: popl %ecx
78 ; CHECK32: retl
79 }
80
81
82 define i64 @one64_minsize() minsize {
83 entry:
84 ret i64 1
85 ; On 64-bit we don't do xor-inc yet, so push-pop it is.
86 ; CHECK64-LABEL: one64_minsize:
87 ; CHECK64: pushq $1
88 ; CHECK64: .cfi_adjust_cfa_offset 8
89 ; CHECK64: popq %rax
90 ; CHECK64: .cfi_adjust_cfa_offset -8
91 ; CHECK64-NEXT: retq
92
93 ; On Win64 we can't adjust the stack unless there's a frame pointer.
94 ; CHECKWIN64-LABEL: one64_minsize:
95 ; CHECKWIN64: movl $1, %eax
96 ; CHECKWIN64-NEXT: retq
97 }
98
99 define i32 @minus_one32() optsize {
100 entry:
101 ret i32 -1
102
103 ; CHECK32-LABEL: minus_one32:
104 ; CHECK32: xorl %eax, %eax
105 ; CHECK32-NEXT: decl %eax
106 ; CHECK32-NEXT: retl
107 }
108
109 define i32 @minus_one32_minsize() minsize {
110 entry:
111 ret i32 -1
112
113 ; xor-dec is preferred over push-pop.
114 ; CHECK32-LABEL: minus_one32_minsize:
115 ; CHECK32: xorl %eax, %eax
116 ; CHECK32-NEXT: decl %eax
117 ; CHECK32-NEXT: retl
118 }
119
120 define i16 @one16() optsize {
121 entry:
122 ret i16 1
123
124 ; CHECK32-LABEL: one16:
125 ; CHECK32: xorl %eax, %eax
126 ; CHECK32-NEXT: incl %eax
127 ; CHECK32-NEXT: retl
128 }
129
130 define i16 @minus_one16() optsize {
131 entry:
132 ret i16 -1
133
134 ; CHECK32-LABEL: minus_one16:
135 ; CHECK32: xorl %eax, %eax
136 ; CHECK32-NEXT: decl %eax
137 ; CHECK32-NEXT: retl
138 }
139
140 define i32 @minus_five32() minsize {
141 entry:
142 ret i32 -5
143
144 ; CHECK32-LABEL: minus_five32:
145 ; CHECK32: pushl $-5
146 ; CHECK32: popl %eax
147 ; CHECK32: retl
148 }
149
150 define i64 @minus_five64() minsize {
151 entry:
152 ret i64 -5
153
154 ; CHECK64-LABEL: minus_five64:
155 ; CHECK64: pushq $-5
156 ; CHECK64: .cfi_adjust_cfa_offset 8
157 ; CHECK64: popq %rax
158 ; CHECK64: .cfi_adjust_cfa_offset -8
159 ; CHECK64: retq
160 }
161
162 define i32 @rematerialize_minus_one() optsize {
163 entry:
164 ; Materialize -1 (thiscall forces it into %ecx).
165 tail call x86_thiscallcc void @f(i32 -1)
166
167 ; Clobber all registers except %esp, leaving nowhere to store the -1 besides
168 ; spilling it to the stack.
169 tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
170
171 ; -1 should be re-materialized here instead of getting spilled above.
172 ret i32 -1
173
174 ; CHECK32-LABEL: rematerialize_minus_one
175 ; CHECK32: xorl %ecx, %ecx
176 ; CHECK32-NEXT: decl %ecx
177 ; CHECK32: calll
178 ; CHECK32: xorl %eax, %eax
179 ; CHECK32-NEXT: decl %eax
180 ; CHECK32-NOT: %eax
181 ; CHECK32: retl
182 }
183
184 define i32 @rematerialize_minus_one_eflags(i32 %x) optsize {
185 entry:
186 ; Materialize -1 (thiscall forces it into %ecx).
187 tail call x86_thiscallcc void @f(i32 -1)
188
189 ; Clobber all registers except %esp, leaving nowhere to store the -1 besides
190 ; spilling it to the stack.
191 tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
192
193 ; Define eflags.
194 %a = icmp ne i32 %x, 123
195 %b = zext i1 %a to i32
196 ; Cause -1 to be rematerialized right in front of the cmov, which needs eflags.
197 ; It must therefore not use the xor-dec lowering.
198 %c = select i1 %a, i32 %b, i32 -1
199 ret i32 %c
200
201 ; CHECK32-LABEL: rematerialize_minus_one_eflags
202 ; CHECK32: xorl %ecx, %ecx
203 ; CHECK32-NEXT: decl %ecx
204 ; CHECK32: calll
205 ; CHECK32: cmpl
206 ; CHECK32: setne
207 ; CHECK32-NOT: xorl
208 ; CHECK32: movl $-1
209 ; CHECK32: cmov
210 ; CHECK32: retl
211 }
212
213 declare x86_thiscallcc void @f(i32)
2828 define double @pow_wrapper_minsize(double %a) minsize {
2929 ; CHECK-LABEL: pow_wrapper_minsize:
3030 ; CHECK: # BB#0:
31 ; CHECK-NEXT: movl $15, %edi
32 ; CHECK-NEXT: jmp
31 ; CHECK-NEXT: pushq $15
32 ; CHECK: popq %rdi
33 ; CHECK: jmp
3334 %ret = tail call double @llvm.powi.f64(double %a, i32 15) nounwind ; [#uses=1]
3435 ret double %ret
3536 }