llvm.org GIT mirror llvm / 59d9986
[X86] Convert esp-relative movs of function arguments to pushes, step 2

This moves the transformation introduced in r223757 into a separate MI pass. This allows it to cover many more cases (not only cases where there must be a reserved call frame), and perform rudimentary call folding. It still doesn't have a heuristic, so it is enabled only for optsize/minsize, with stack alignment <= 8, where it ought to be a fairly clear win.

Differential Revision: http://reviews.llvm.org/D6789

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@227728 91177308-0d34-0410-b5e6-96231b3b80d8

Michael Kuperstein
17 changed file(s) with 8049 addition(s) and 7489 deletion(s).
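The size gate described in the commit message is simple enough to restate in isolation. Below is a minimal sketch, assuming the same LLVM 3.6-era APIs the new pass uses further down in this diff; the helper name worthTransformingForSize is hypothetical, and the authoritative check is X86CallFrameOptimization::shouldPerformTransformation in X86CallFrameOptimization.cpp below.

#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
#include "llvm/Target/TargetFrameLowering.h"

// Hypothetical helper mirroring the conditions in the commit message: the
// mov->push rewrite is done when there is no reserved call frame anyway, or
// when optimizing for size with required stack alignment <= 8.
static bool worthTransformingForSize(const llvm::MachineFunction &MF,
                                     const llvm::TargetFrameLowering &TFL) {
  // With variable-sized objects the call frame cannot be reserved, so pushes
  // are always at least as good as esp-relative movs.
  if (MF.getFrameInfo()->hasVarSizedObjects())
    return true;
  llvm::AttributeSet FnAttrs = MF.getFunction()->getAttributes();
  bool OptForSize =
      FnAttrs.hasAttribute(llvm::AttributeSet::FunctionIndex,
                           llvm::Attribute::OptimizeForSize) ||
      FnAttrs.hasAttribute(llvm::AttributeSet::FunctionIndex,
                           llvm::Attribute::MinSize);
  // Required stack alignment above 8 can turn the rewrite into a size loss.
  return OptForSize && TFL.getStackAlignment() <= 8;
}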
192192 return hasReservedCallFrame(MF) || hasFP(MF);
193193 }
194194
195 // needsFrameIndexResolution - Do we need to perform FI resolution for
196 // this function? Normally, this is required only when the function
197 // has any stack objects. However, targets may want to override this.
198 virtual bool needsFrameIndexResolution(const MachineFunction &MF) const;
199
195200 /// getFrameIndexOffset - Returns the displacement from the frame register to
196201 /// the stack frame of the specified index.
197202 virtual int getFrameIndexOffset(const MachineFunction &MF, int FI) const;
702702 /// register references and actual offsets.
703703 ///
704704 void PEI::replaceFrameIndices(MachineFunction &Fn) {
705 if (!Fn.getFrameInfo()->hasStackObjects()) return; // Nothing to do?
705 const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
706 if (!TFI.needsFrameIndexResolution(Fn)) return;
706707
707708 // Store SPAdj at exit of a basic block.
708709 SmallVector<int, 8> SPState;
767768 I = std::next(PrevI);
768769 continue;
769770 }
770
771 // If we are looking at a call sequence, we need to keep track of
772 // the SP adjustment made by each instruction in the sequence.
773 // This includes both the frame setup/destroy pseudos (handled above),
774 // as well as other instructions that have side effects w.r.t the SP.
775 if (InsideCallSequence)
776 SPAdj += TII.getSPAdjust(I);
777771
778772 MachineInstr *MI = I;
779773 bool DoIncr = true;
853847 break;
854848 }
855849
850 // If we are looking at a call sequence, we need to keep track of
851 // the SP adjustment made by each instruction in the sequence.
852 // This includes both the frame setup/destroy pseudos (handled above),
853 // as well as other instructions that have side effects w.r.t the SP.
854 // Note that this must come after eliminateFrameIndex, because
855 // if I itself referred to a frame index, we shouldn't count its own
856 // adjustment.
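// (Illustration: an instruction inside the call sequence that both adjusts
// the SP and refers to a frame index must have that index resolved using the
// SP adjustment as it stood *before* the instruction's own adjustment.)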
857 if (MI && InsideCallSequence)
858 SPAdj += TII.getSPAdjust(MI);
859
856860 if (DoIncr && I != BB->end()) ++I;
857861
858862 // Update register states.
4141 FrameReg = RI->getFrameRegister(MF);
4242 return getFrameIndexOffset(MF, FI);
4343 }
44
45 bool TargetFrameLowering::needsFrameIndexResolution(
46 const MachineFunction &MF) const {
47 return MF.getFrameInfo()->hasStackObjects();
48 }
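A target that wants PEI to visit functions without stack objects is expected to override this hook. A minimal sketch of the shape of the X86 override this commit relies on (the real body is in X86FrameLowering.cpp, which is not shown in this excerpt; the getter name getHasPushSequences is assumed from the setHasPushSequences(true) call in the new pass below):

bool X86FrameLowering::needsFrameIndexResolution(
    const MachineFunction &MF) const {
  // Even with no stack objects, PEI still needs to walk the function once
  // call sequences have been rewritten into pushes (sketch only).
  return MF.getFrameInfo()->hasStackObjects() ||
         MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();
}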
0 set(LLVM_TARGET_DEFINITIONS X86.td)
1
2 tablegen(LLVM X86GenRegisterInfo.inc -gen-register-info)
3 tablegen(LLVM X86GenDisassemblerTables.inc -gen-disassembler)
4 tablegen(LLVM X86GenInstrInfo.inc -gen-instr-info)
5 tablegen(LLVM X86GenAsmWriter.inc -gen-asm-writer)
6 tablegen(LLVM X86GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1)
7 tablegen(LLVM X86GenAsmMatcher.inc -gen-asm-matcher)
8 tablegen(LLVM X86GenDAGISel.inc -gen-dag-isel)
9 tablegen(LLVM X86GenFastISel.inc -gen-fast-isel)
10 tablegen(LLVM X86GenCallingConv.inc -gen-callingconv)
11 tablegen(LLVM X86GenSubtargetInfo.inc -gen-subtarget)
12 add_public_tablegen_target(X86CommonTableGen)
13
14 set(sources
15 X86AsmPrinter.cpp
16 X86CallFrameOptimization.cpp
17 X86FastISel.cpp
18 X86FloatingPoint.cpp
19 X86FrameLowering.cpp
20 X86ISelDAGToDAG.cpp
21 X86ISelLowering.cpp
22 X86InstrInfo.cpp
23 X86MCInstLower.cpp
24 X86MachineFunctionInfo.cpp
25 X86PadShortFunction.cpp
26 X86RegisterInfo.cpp
27 X86SelectionDAGInfo.cpp
28 X86Subtarget.cpp
29 X86TargetMachine.cpp
30 X86TargetObjectFile.cpp
31 X86TargetTransformInfo.cpp
32 X86VZeroUpper.cpp
33 X86FixupLEAs.cpp
34 )
35
36 if( CMAKE_CL_64 )
37 enable_language(ASM_MASM)
38 ADD_CUSTOM_COMMAND(
39 OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj
40 MAIN_DEPENDENCY X86CompilationCallback_Win64.asm
41 COMMAND ${CMAKE_ASM_MASM_COMPILER} /Fo ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj /c ${CMAKE_CURRENT_SOURCE_DIR}/X86CompilationCallback_Win64.asm
42 )
43 set(sources ${sources} ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj)
44 endif()
45
46 add_llvm_target(X86CodeGen ${sources})
47
48 add_subdirectory(AsmParser)
49 add_subdirectory(Disassembler)
50 add_subdirectory(InstPrinter)
51 add_subdirectory(MCTargetDesc)
52 add_subdirectory(TargetInfo)
53 add_subdirectory(Utils)
6363 /// to eliminate execution delays in some Atom processors.
6464 FunctionPass *createX86FixupLEAs();
6565
66 /// createX86CallFrameOptimization - Return a pass that optimizes
67 /// the code-size of x86 call sequences. This is done by replacing
68 /// esp-relative movs with pushes.
69 FunctionPass *createX86CallFrameOptimization();
70
6671 } // End llvm namespace
6772
6873 #endif
0 //===----- X86CallFrameOptimization.cpp - Optimize x86 call sequences -----===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines a pass that optimizes call sequences on x86.
10 // Currently, it converts movs of function parameters onto the stack into
11 // pushes. This is beneficial for two main reasons:
12 // 1) The push instruction encoding is much smaller than an esp-relative mov
13 // 2) It is possible to push memory arguments directly. So, if the
14 // transformation is performed pre-reg-alloc, it can help relieve
15 // register pressure.
16 //
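// For example (illustrative only), a 32-bit call sequence such as
//   movl $42, (%esp)
//   movl %eax, 4(%esp)
//   calll foo
// becomes
//   pushl %eax
//   pushl $42
//   calll foo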
17 //===----------------------------------------------------------------------===//
18
19 #include <algorithm>
20
21 #include "X86.h"
22 #include "X86InstrInfo.h"
23 #include "X86Subtarget.h"
24 #include "X86MachineFunctionInfo.h"
25 #include "llvm/ADT/Statistic.h"
26 #include "llvm/CodeGen/MachineFunctionPass.h"
27 #include "llvm/CodeGen/MachineInstrBuilder.h"
28 #include "llvm/CodeGen/MachineRegisterInfo.h"
29 #include "llvm/CodeGen/Passes.h"
30 #include "llvm/IR/Function.h"
31 #include "llvm/Support/Debug.h"
32 #include "llvm/Support/raw_ostream.h"
33 #include "llvm/Target/TargetInstrInfo.h"
34
35 using namespace llvm;
36
37 #define DEBUG_TYPE "x86-cf-opt"
38
39 static cl::opt<bool> NoX86CFOpt("no-x86-call-frame-opt",
40 cl::desc("Avoid optimizing x86 call frames for size"),
41 cl::init(false), cl::Hidden);
42
43 namespace {
44 class X86CallFrameOptimization : public MachineFunctionPass {
45 public:
46 X86CallFrameOptimization() : MachineFunctionPass(ID) {}
47
48 bool runOnMachineFunction(MachineFunction &MF) override;
49
50 private:
51 bool shouldPerformTransformation(MachineFunction &MF);
52
53 bool adjustCallSequence(MachineFunction &MF, MachineBasicBlock &MBB,
54 MachineBasicBlock::iterator I);
55
56 MachineInstr *canFoldIntoRegPush(MachineBasicBlock::iterator FrameSetup,
57 unsigned Reg);
58
59 const char *getPassName() const override {
60 return "X86 Optimize Call Frame";
61 }
62
63 const TargetInstrInfo *TII;
64 const TargetFrameLowering *TFL;
65 const MachineRegisterInfo *MRI;
66 static char ID;
67 };
68
69 char X86CallFrameOptimization::ID = 0;
70 }
71
72 FunctionPass *llvm::createX86CallFrameOptimization() {
73 return new X86CallFrameOptimization();
74 }
75
76 // This checks whether the transformation is legal and profitable
77 bool X86CallFrameOptimization::shouldPerformTransformation(MachineFunction &MF) {
78 if (NoX86CFOpt.getValue())
79 return false;
80
81 // We currently only support call sequences where *all* parameters
82 // are passed on the stack.
83 // No point in running this in 64-bit mode, since some arguments are
84 // passed in-register in all common calling conventions, so the pattern
85 // we're looking for will never match.
86 const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
87 if (STI.is64Bit())
88 return false;
89
90 // You would expect straight-line code between call-frame setup and
91 // call-frame destroy. You would be wrong. There are circumstances (e.g.
92 // CMOV_GR8 expansion of a select that feeds a function call!) where we can
93 // end up with the setup and the destroy in different basic blocks.
94 // This is bad, and breaks SP adjustment.
95 // So, check that all of the frames in the function are closed inside
96 // the same block, and, for good measure, that there are no nested frames.
97 int FrameSetupOpcode = TII->getCallFrameSetupOpcode();
98 int FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
99 for (MachineBasicBlock &BB : MF) {
100 bool InsideFrameSequence = false;
101 for (MachineInstr &MI : BB) {
102 if (MI.getOpcode() == FrameSetupOpcode) {
103 if (InsideFrameSequence)
104 return false;
105 InsideFrameSequence = true;
106 }
107 else if (MI.getOpcode() == FrameDestroyOpcode) {
108 if (!InsideFrameSequence)
109 return false;
110 InsideFrameSequence = false;
111 }
112 }
113
114 if (InsideFrameSequence)
115 return false;
116 }
117
118 // Now that we know the transformation is legal, check if it is
119 // profitable.
120 // TODO: Add a heuristic that actually looks at the function,
121 // and enable this for more cases.
122
123 // This transformation is always a win when we do not expect to have
124 // a reserved call frame. Under other circumstances, it may be either
125 // a win or a loss, and requires a heuristic.
126 // For now, enable it only for the relatively clear win cases.
127 bool CannotReserveFrame = MF.getFrameInfo()->hasVarSizedObjects();
128 if (CannotReserveFrame)
129 return true;
130
131 // For now, don't even try to evaluate the profitability when
132 // not optimizing for size.
133 AttributeSet FnAttrs = MF.getFunction()->getAttributes();
134 bool OptForSize =
135 FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
136 Attribute::OptimizeForSize) ||
137 FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
138
139 if (!OptForSize)
140 return false;
141
142 // Stack re-alignment can make this unprofitable even in terms of size.
143 // As mentioned above, a better heuristic is needed. For now, don't do this
144 // when the required alignment is above 8. (4 would be the safe choice, but
145 // some experimentation showed 8 is generally good).
146 if (TFL->getStackAlignment() > 8)
147 return false;
148
149 return true;
150 }
151
152 bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) {
153 TII = MF.getSubtarget().getInstrInfo();
154 TFL = MF.getSubtarget().getFrameLowering();
155 MRI = &MF.getRegInfo();
156
157 if (!shouldPerformTransformation(MF))
158 return false;
159
160 int FrameSetupOpcode = TII->getCallFrameSetupOpcode();
161
162 bool Changed = false;
163
164 for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB)
165 for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I)
166 if (I->getOpcode() == FrameSetupOpcode)
167 Changed |= adjustCallSequence(MF, *BB, I);
168
169 return Changed;
170 }
171
172 bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
173 MachineBasicBlock &MBB,
174 MachineBasicBlock::iterator I) {
175
176 // Check that this particular call sequence is amenable to the
177 // transformation.
178 const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>(
179 MF.getSubtarget().getRegisterInfo());
180 unsigned StackPtr = RegInfo.getStackRegister();
181 int FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
182
183 // We expect to enter this at the beginning of a call sequence
184 assert(I->getOpcode() == TII->getCallFrameSetupOpcode());
185 MachineBasicBlock::iterator FrameSetup = I++;
186
187
188 // For globals in PIC mode, we can have some LEAs here.
189 // Ignore them, they don't bother us.
190 // TODO: Extend this to something that covers more cases.
191 while (I->getOpcode() == X86::LEA32r)
192 ++I;
193
194 // We expect a copy instruction here.
195 // TODO: The copy instruction is a lowering artifact.
196 // We should also support a copy-less version, where the stack
197 // pointer is used directly.
198 if (!I->isCopy() || !I->getOperand(0).isReg())
199 return false;
200 MachineBasicBlock::iterator SPCopy = I++;
201 StackPtr = SPCopy->getOperand(0).getReg();
202
203 // Scan the call setup sequence for the pattern we're looking for.
204 // We only handle a simple case - a sequence of MOV32mi or MOV32mr
205 // instructions, that push a sequence of 32-bit values onto the stack, with
206 // no gaps between them.
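// (MovVector[k] ends up holding the MOV that stores at offset k*4 from the
// stack-pointer copy; the displacement is divided by 4 below.)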
207 SmallVector<MachineInstr *, 4> MovVector(4, nullptr);
208 unsigned int MaxAdjust = FrameSetup->getOperand(0).getImm() / 4;
209 if (MaxAdjust > 4)
210 MovVector.resize(MaxAdjust, nullptr);
211
212 do {
213 int Opcode = I->getOpcode();
214 if (Opcode != X86::MOV32mi && Opcode != X86::MOV32mr)
215 break;
216
217 // We only want movs of the form:
218 // movl imm/r32, k(%esp)
219 // If we run into something else, bail.
220 // Note that AddrBaseReg may, counter to its name, not be a register,
221 // but rather a frame index.
222 // TODO: Support the fi case. This should probably work now that we
223 // have the infrastructure to track the stack pointer within a call
224 // sequence.
225 if (!I->getOperand(X86::AddrBaseReg).isReg() ||
226 (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) ||
227 !I->getOperand(X86::AddrScaleAmt).isImm() ||
228 (I->getOperand(X86::AddrScaleAmt).getImm() != 1) ||
229 (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) ||
230 (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) ||
231 !I->getOperand(X86::AddrDisp).isImm())
232 return false;
233
234 int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm();
235 assert(StackDisp >= 0 && "Negative stack displacement when passing parameters");
236
237 // We really don't want to consider the unaligned case.
238 if (StackDisp % 4)
239 return false;
240 StackDisp /= 4;
241
242 assert((size_t)StackDisp < MovVector.size() &&
243 "Function call has more parameters than the stack is adjusted for.");
244
245 // If the same stack slot is being filled twice, something's fishy.
246 if (MovVector[StackDisp] != nullptr)
247 return false;
248 MovVector[StackDisp] = I;
249
250 ++I;
251 } while (I != MBB.end());
252
253 // We now expect the end of the sequence - a call and a stack adjust.
254 if (I == MBB.end())
255 return false;
256
257 // For PCrel calls, we expect an additional COPY of the basereg.
258 // If we find one, skip it.
259 if (I->isCopy()) {
260 if (I->getOperand(1).getReg() ==
261 MF.getInfo<X86MachineFunctionInfo>()->getGlobalBaseReg())
262 ++I;
263 else
264 return false;
265 }
266
267 if (!I->isCall())
268 return false;
269 MachineBasicBlock::iterator Call = I;
270 if ((++I)->getOpcode() != FrameDestroyOpcode)
271 return false;
272
273 // Now, go through the vector, and see that we don't have any gaps,
274 // but only a series of 32-bit MOVs.
275
276 int64_t ExpectedDist = 0;
277 auto MMI = MovVector.begin(), MME = MovVector.end();
278 for (; MMI != MME; ++MMI, ExpectedDist += 4)
279 if (*MMI == nullptr)
280 break;
281
282 // If the call had no parameters, do nothing
283 if (!ExpectedDist)
284 return false;
285
286 // We are either at the last parameter, or a gap.
287 // Make sure it's not a gap
288 for (; MMI != MME; ++MMI)
289 if (*MMI != nullptr)
290 return false;
291
292 // Ok, we can in fact do the transformation for this call.
293 // Do not remove the FrameSetup instruction, but adjust the parameters.
294 // PEI will end up finalizing the handling of this.
295 FrameSetup->getOperand(1).setImm(ExpectedDist);
296
297 DebugLoc DL = I->getDebugLoc();
298 // Now, iterate through the vector in reverse order, and replace the movs
299 // with pushes. MOVmi/MOVmr doesn't have any defs, so no need to
300 // replace uses.
301 for (int Idx = (ExpectedDist / 4) - 1; Idx >= 0; --Idx) {
302 MachineBasicBlock::iterator MOV = *MovVector[Idx];
303 MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands);
304 if (MOV->getOpcode() == X86::MOV32mi) {
305 unsigned PushOpcode = X86::PUSHi32;
306 // If the operand is a small (8-bit) immediate, we can use a
307 // PUSH instruction with a shorter encoding.
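// (A push of an imm8 encodes in 2 bytes, vs. 5 bytes for a push of imm32.)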
308 // Note that isImm() may fail even though this is a MOVmi, because
309 // the operand can also be a symbol.
310 if (PushOp.isImm()) {
311 int64_t Val = PushOp.getImm();
312 if (isInt<8>(Val))
313 PushOpcode = X86::PUSH32i8;
314 }
315 BuildMI(MBB, Call, DL, TII->get(PushOpcode)).addOperand(PushOp);
316 } else {
317 unsigned int Reg = PushOp.getReg();
318
319 // If PUSHrmm is not slow on this target, try to fold the source of the
320 // push into the instruction.
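// For example (illustrative), "movl 8(%edi), %ecx" feeding a push of %ecx
// can be folded into a single "pushl 8(%edi)".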
321 const X86Subtarget &ST = MF.getTarget().getSubtarget<X86Subtarget>();
322 bool SlowPUSHrmm = ST.isAtom() || ST.isSLM();
323
324 // Check that this is legal to fold. Right now, we're extremely
325 // conservative about that.
326 MachineInstr *DefMov = nullptr;
327 if (!SlowPUSHrmm && (DefMov = canFoldIntoRegPush(FrameSetup, Reg))) {
328 MachineInstr *Push = BuildMI(MBB, Call, DL, TII->get(X86::PUSH32rmm));
329
330 unsigned NumOps = DefMov->getDesc().getNumOperands();
331 for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
332 Push->addOperand(DefMov->getOperand(i));
333
334 DefMov->eraseFromParent();
335 } else {
336 BuildMI(MBB, Call, DL, TII->get(X86::PUSH32r)).addReg(Reg).getInstr();
337 }
338 }
339
340 MBB.erase(MOV);
341 }
342
343 // The stack-pointer copy is no longer used in the call sequences.
344 // There should not be any other users, but we can't commit to that, so:
345 if (MRI->use_empty(SPCopy->getOperand(0).getReg()))
346 SPCopy->eraseFromParent();
347
348 // Once we've done this, we need to make sure PEI doesn't assume a reserved
349 // frame.
350 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
351 FuncInfo->setHasPushSequences(true);
352
353 return true;
354 }
355
356 MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush(
357 MachineBasicBlock::iterator FrameSetup, unsigned Reg) {
358 // Do an extremely restricted form of load folding.
359 // ISel will often create patterns like:
360 // movl 4(%edi), %eax
361 // movl 8(%edi), %ecx
362 // movl 12(%edi), %edx
363 // movl %edx, 8(%esp)
364 // movl %ecx, 4(%esp)
365 // movl %eax, (%esp)
366 // call
367 // Get rid of those with prejudice.
368 if (!TargetRegisterInfo::isVirtualRegister(Reg))
369 return nullptr;
370
371 // Make sure this is the only use of Reg.
372 if (!MRI->hasOneNonDBGUse(Reg))
373 return nullptr;
374
375 MachineBasicBlock::iterator DefMI = MRI->getVRegDef(Reg);
376
377 // Make sure the def is a MOV from memory.
378 // If the def is in another block, give up.
379 if (DefMI->getOpcode() != X86::MOV32rm ||
380 DefMI->getParent() != FrameSetup->getParent())
381 return nullptr;
382
383 // Be careful with movs that load from a stack slot, since it may get
384 // resolved incorrectly.
385 // TODO: Again, we already have the infrastructure, so this should work.
386 if (!DefMI->getOperand(1).isReg())
387 return nullptr;
388
389 // Now, make sure everything else up until the ADJCALLSTACK is a sequence
390 // of MOVs. To be less conservative would require duplicating a lot of the
391 // logic from PeepholeOptimizer.
392 // FIXME: A possibly better approach would be to teach the PeepholeOptimizer
393 // to be smarter about folding into pushes.
394 for (auto I = DefMI; I != FrameSetup; ++I)
395 if (I->getOpcode() != X86::MOV32rm)
396 return nullptr;
397
398 return DefMI;
399 }
None //===-- X86FastISel.cpp - X86 FastISel implementation ---------------------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines the X86-specific support for the FastISel class. Much
10 // of the target-specific code is generated by tablegen in the file
11 // X86GenFastISel.inc, which is #included here.
12 //
13 //===----------------------------------------------------------------------===//
14
15 #include "X86.h"
16 #include "X86CallingConv.h"
17 #include "X86InstrBuilder.h"
18 #include "X86InstrInfo.h"
19 #include "X86MachineFunctionInfo.h"
20 #include "X86RegisterInfo.h"
21 #include "X86Subtarget.h"
22 #include "X86TargetMachine.h"
23 #include "llvm/Analysis/BranchProbabilityInfo.h"
24 #include "llvm/CodeGen/Analysis.h"
25 #include "llvm/CodeGen/FastISel.h"
26 #include "llvm/CodeGen/FunctionLoweringInfo.h"
27 #include "llvm/CodeGen/MachineConstantPool.h"
28 #include "llvm/CodeGen/MachineFrameInfo.h"
29 #include "llvm/CodeGen/MachineRegisterInfo.h"
30 #include "llvm/IR/CallSite.h"
31 #include "llvm/IR/CallingConv.h"
32 #include "llvm/IR/DerivedTypes.h"
33 #include "llvm/IR/GetElementPtrTypeIterator.h"
34 #include "llvm/IR/GlobalAlias.h"
35 #include "llvm/IR/GlobalVariable.h"
36 #include "llvm/IR/Instructions.h"
37 #include "llvm/IR/IntrinsicInst.h"
38 #include "llvm/IR/Operator.h"
39 #include "llvm/Support/ErrorHandling.h"
40 #include "llvm/Target/TargetOptions.h"
41 using namespace llvm;
42
43 namespace {
44
45 class X86FastISel final : public FastISel {
46 /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
47 /// make the right decision when generating code for different targets.
48 const X86Subtarget *Subtarget;
49
50 /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87
51 /// floating point ops.
52 /// When SSE is available, use it for f32 operations.
53 /// When SSE2 is available, use it for f64 operations.
54 bool X86ScalarSSEf64;
55 bool X86ScalarSSEf32;
56
57 public:
58 explicit X86FastISel(FunctionLoweringInfo &funcInfo,
59 const TargetLibraryInfo *libInfo)
60 : FastISel(funcInfo, libInfo) {
61 Subtarget = &TM.getSubtarget<X86Subtarget>();
62 X86ScalarSSEf64 = Subtarget->hasSSE2();
63 X86ScalarSSEf32 = Subtarget->hasSSE1();
64 }
65
66 bool fastSelectInstruction(const Instruction *I) override;
67
68 /// \brief The specified machine instr operand is a vreg, and that
69 /// vreg is being provided by the specified load instruction. If possible,
70 /// try to fold the load as an operand to the instruction, returning true if
71 /// possible.
72 bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
73 const LoadInst *LI) override;
74
75 bool fastLowerArguments() override;
76 bool fastLowerCall(CallLoweringInfo &CLI) override;
77 bool fastLowerIntrinsicCall(const IntrinsicInst *II) override;
78
79 #include "X86GenFastISel.inc"
80
81 private:
82 bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT, DebugLoc DL);
83
84 bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, MachineMemOperand *MMO,
85 unsigned &ResultReg);
86
87 bool X86FastEmitStore(EVT VT, const Value *Val, const X86AddressMode &AM,
88 MachineMemOperand *MMO = nullptr, bool Aligned = false);
89 bool X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
90 const X86AddressMode &AM,
91 MachineMemOperand *MMO = nullptr, bool Aligned = false);
92
93 bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT,
94 unsigned &ResultReg);
95
96 bool X86SelectAddress(const Value *V, X86AddressMode &AM);
97 bool X86SelectCallAddress(const Value *V, X86AddressMode &AM);
98
99 bool X86SelectLoad(const Instruction *I);
100
101 bool X86SelectStore(const Instruction *I);
102
103 bool X86SelectRet(const Instruction *I);
104
105 bool X86SelectCmp(const Instruction *I);
106
107 bool X86SelectZExt(const Instruction *I);
108
109 bool X86SelectBranch(const Instruction *I);
110
111 bool X86SelectShift(const Instruction *I);
112
113 bool X86SelectDivRem(const Instruction *I);
114
115 bool X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I);
116
117 bool X86FastEmitSSESelect(MVT RetVT, const Instruction *I);
118
119 bool X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I);
120
121 bool X86SelectSelect(const Instruction *I);
122
123 bool X86SelectTrunc(const Instruction *I);
124
125 bool X86SelectFPExt(const Instruction *I);
126 bool X86SelectFPTrunc(const Instruction *I);
127
128 const X86InstrInfo *getInstrInfo() const {
129 return getTargetMachine()->getSubtargetImpl()->getInstrInfo();
130 }
131 const X86TargetMachine *getTargetMachine() const {
132 return static_cast<const X86TargetMachine *>(&TM);
133 }
134
135 bool handleConstantAddresses(const Value *V, X86AddressMode &AM);
136
137 unsigned X86MaterializeInt(const ConstantInt *CI, MVT VT);
138 unsigned X86MaterializeFP(const ConstantFP *CFP, MVT VT);
139 unsigned X86MaterializeGV(const GlobalValue *GV, MVT VT);
140 unsigned fastMaterializeConstant(const Constant *C) override;
141
142 unsigned fastMaterializeAlloca(const AllocaInst *C) override;
143
144 unsigned fastMaterializeFloatZero(const ConstantFP *CF) override;
145
146 /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
147 /// computed in an SSE register, not on the X87 floating point stack.
148 bool isScalarFPTypeInSSEReg(EVT VT) const {
149 return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
150 (VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1
151 }
152
153 bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false);
154
155 bool IsMemcpySmall(uint64_t Len);
156
157 bool TryEmitSmallMemcpy(X86AddressMode DestAM,
158 X86AddressMode SrcAM, uint64_t Len);
159
160 bool foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
161 const Value *Cond);
162 };
163
164 } // end anonymous namespace.
165
166 static std::pair<X86::CondCode, bool>
167 getX86ConditionCode(CmpInst::Predicate Predicate) {
168 X86::CondCode CC = X86::COND_INVALID;
169 bool NeedSwap = false;
170 switch (Predicate) {
171 default: break;
172 // Floating-point Predicates
173 case CmpInst::FCMP_UEQ: CC = X86::COND_E; break;
174 case CmpInst::FCMP_OLT: NeedSwap = true; // fall-through
175 case CmpInst::FCMP_OGT: CC = X86::COND_A; break;
176 case CmpInst::FCMP_OLE: NeedSwap = true; // fall-through
177 case CmpInst::FCMP_OGE: CC = X86::COND_AE; break;
178 case CmpInst::FCMP_UGT: NeedSwap = true; // fall-through
179 case CmpInst::FCMP_ULT: CC = X86::COND_B; break;
180 case CmpInst::FCMP_UGE: NeedSwap = true; // fall-through
181 case CmpInst::FCMP_ULE: CC = X86::COND_BE; break;
182 case CmpInst::FCMP_ONE: CC = X86::COND_NE; break;
183 case CmpInst::FCMP_UNO: CC = X86::COND_P; break;
184 case CmpInst::FCMP_ORD: CC = X86::COND_NP; break;
185 case CmpInst::FCMP_OEQ: // fall-through
186 case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break;
187
188 // Integer Predicates
189 case CmpInst::ICMP_EQ: CC = X86::COND_E; break;
190 case CmpInst::ICMP_NE: CC = X86::COND_NE; break;
191 case CmpInst::ICMP_UGT: CC = X86::COND_A; break;
192 case CmpInst::ICMP_UGE: CC = X86::COND_AE; break;
193 case CmpInst::ICMP_ULT: CC = X86::COND_B; break;
194 case CmpInst::ICMP_ULE: CC = X86::COND_BE; break;
195 case CmpInst::ICMP_SGT: CC = X86::COND_G; break;
196 case CmpInst::ICMP_SGE: CC = X86::COND_GE; break;
197 case CmpInst::ICMP_SLT: CC = X86::COND_L; break;
198 case CmpInst::ICMP_SLE: CC = X86::COND_LE; break;
199 }
200
201 return std::make_pair(CC, NeedSwap);
202 }
203
204 static std::pair<unsigned, bool>
205 getX86SSEConditionCode(CmpInst::Predicate Predicate) {
206 unsigned CC;
207 bool NeedSwap = false;
208
209 // SSE Condition code mapping:
210 // 0 - EQ
211 // 1 - LT
212 // 2 - LE
213 // 3 - UNORD
214 // 4 - NEQ
215 // 5 - NLT
216 // 6 - NLE
217 // 7 - ORD
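// For example, FCMP_OLT maps directly to CC = 1, while FCMP_OGT sets
// NeedSwap and then also uses CC = 1.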
218 switch (Predicate) {
219 default: llvm_unreachable("Unexpected predicate");
220 case CmpInst::FCMP_OEQ: CC = 0; break;
221 case CmpInst::FCMP_OGT: NeedSwap = true; // fall-through
222 case CmpInst::FCMP_OLT: CC = 1; break;
223 case CmpInst::FCMP_OGE: NeedSwap = true; // fall-through
224 case CmpInst::FCMP_OLE: CC = 2; break;
225 case CmpInst::FCMP_UNO: CC = 3; break;
226 case CmpInst::FCMP_UNE: CC = 4; break;
227 case CmpInst::FCMP_ULE: NeedSwap = true; // fall-through
228 case CmpInst::FCMP_UGE: CC = 5; break;
229 case CmpInst::FCMP_ULT: NeedSwap = true; // fall-through
230 case CmpInst::FCMP_UGT: CC = 6; break;
231 case CmpInst::FCMP_ORD: CC = 7; break;
232 case CmpInst::FCMP_UEQ:
233 case CmpInst::FCMP_ONE: CC = 8; break;
234 }
235
236 return std::make_pair(CC, NeedSwap);
237 }
238
239 /// \brief Check if it is possible to fold the condition from the XALU intrinsic
240 /// into the user. The condition code will only be updated on success.
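/// A typical pattern this is meant to catch (illustrative):
///   %res = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
///   %obit = extractvalue { i32, i1 } %res, 1
///   br i1 %obit, label %overflow, label %cont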
241 bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
242 const Value *Cond) {
243 if (!isa<ExtractValueInst>(Cond))
244 return false;
245
246 const auto *EV = cast<ExtractValueInst>(Cond);
247 if (!isa<IntrinsicInst>(EV->getAggregateOperand()))
248 return false;
249
250 const auto *II = cast<IntrinsicInst>(EV->getAggregateOperand());
251 MVT RetVT;
252 const Function *Callee = II->getCalledFunction();
253 Type *RetTy =
254 cast<StructType>(Callee->getReturnType())->getTypeAtIndex(0U);
255 if (!isTypeLegal(RetTy, RetVT))
256 return false;
257
258 if (RetVT != MVT::i32 && RetVT != MVT::i64)
259 return false;
260
261 X86::CondCode TmpCC;
262 switch (II->getIntrinsicID()) {
263 default: return false;
264 case Intrinsic::sadd_with_overflow:
265 case Intrinsic::ssub_with_overflow:
266 case Intrinsic::smul_with_overflow:
267 case Intrinsic::umul_with_overflow: TmpCC = X86::COND_O; break;
268 case Intrinsic::uadd_with_overflow:
269 case Intrinsic::usub_with_overflow: TmpCC = X86::COND_B; break;
270 }
271
272 // Check if both instructions are in the same basic block.
273 if (II->getParent() != I->getParent())
274 return false;
275
276 // Make sure nothing is in the way
277 BasicBlock::const_iterator Start = I;
278 BasicBlock::const_iterator End = II;
279 for (auto Itr = std::prev(Start); Itr != End; --Itr) {
280 // We only expect extractvalue instructions between the intrinsic and the
281 // instruction to be selected.
282 if (!isa<ExtractValueInst>(Itr))
283 return false;
284
285 // Check that the extractvalue operand comes from the intrinsic.
286 const auto *EVI = cast<ExtractValueInst>(Itr);
287 if (EVI->getAggregateOperand() != II)
288 return false;
289 }
290
291 CC = TmpCC;
292 return true;
293 }
294
295 bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
296 EVT evt = TLI.getValueType(Ty, /*HandleUnknown=*/true);
297 if (evt == MVT::Other || !evt.isSimple())
298 // Unhandled type. Halt "fast" selection and bail.
299 return false;
300
301 VT = evt.getSimpleVT();
302 // For now, require SSE/SSE2 for performing floating-point operations,
303 // since x87 requires additional work.
304 if (VT == MVT::f64 && !X86ScalarSSEf64)
305 return false;
306 if (VT == MVT::f32 && !X86ScalarSSEf32)
307 return false;
308 // Similarly, no f80 support yet.
309 if (VT == MVT::f80)
310 return false;
311 // We only handle legal types. For example, on x86-32 the instruction
312 // selector contains all of the 64-bit instructions from x86-64,
313 // under the assumption that i64 won't be used if the target doesn't
314 // support it.
315 return (AllowI1 && VT == MVT::i1) || TLI.isTypeLegal(VT);
316 }
317
318 #include "X86GenCallingConv.inc"
319
320 /// X86FastEmitLoad - Emit a machine instruction to load a value of type VT.
321 /// The address is either pre-computed, i.e. Ptr, or a GlobalAddress, i.e. GV.
322 /// Return true and the result register by reference if it is possible.
323 bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM,
324 MachineMemOperand *MMO, unsigned &ResultReg) {
325 // Get opcode and regclass of the output for the given load instruction.
326 unsigned Opc = 0;
327 const TargetRegisterClass *RC = nullptr;
328 switch (VT.getSimpleVT().SimpleTy) {
329 default: return false;
330 case MVT::i1:
331 case MVT::i8:
332 Opc = X86::MOV8rm;
333 RC = &X86::GR8RegClass;
334 break;
335 case MVT::i16:
336 Opc = X86::MOV16rm;
337 RC = &X86::GR16RegClass;
338 break;
339 case MVT::i32:
340 Opc = X86::MOV32rm;
341 RC = &X86::GR32RegClass;
342 break;
343 case MVT::i64:
344 // Must be in x86-64 mode.
345 Opc = X86::MOV64rm;
346 RC = &X86::GR64RegClass;
347 break;
348 case MVT::f32:
349 if (X86ScalarSSEf32) {
350 Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm;
351 RC = &X86::FR32RegClass;
352 } else {
353 Opc = X86::LD_Fp32m;
354 RC = &X86::RFP32RegClass;
355 }
356 break;
357 case MVT::f64:
358 if (X86ScalarSSEf64) {
359 Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm;
360 RC = &X86::FR64RegClass;
361 } else {
362 Opc = X86::LD_Fp64m;
363 RC = &X86::RFP64RegClass;
364 }
365 break;
366 case MVT::f80:
367 // No f80 support yet.
368 return false;
369 }
370
371 ResultReg = createResultReg(RC);
372 MachineInstrBuilder MIB =
373 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
374 addFullAddress(MIB, AM);
375 if (MMO)
376 MIB->addMemOperand(*FuncInfo.MF, MMO);
377 return true;
378 }
379
380 /// X86FastEmitStore - Emit a machine instruction to store a value Val of
381 /// type VT. The address is either pre-computed, consisting of a base ptr, Ptr,
382 /// and a displacement offset, or a GlobalAddress,
383 /// i.e. V. Return true if it is possible.
384 bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
385 const X86AddressMode &AM,
386 MachineMemOperand *MMO, bool Aligned) {
387 // Get opcode and regclass of the output for the given store instruction.
388 unsigned Opc = 0;
389 switch (VT.getSimpleVT().SimpleTy) {
390 case MVT::f80: // No f80 support yet.
391 default: return false;
392 case MVT::i1: {
393 // Mask out all but lowest bit.
394 unsigned AndResult = createResultReg(&X86::GR8RegClass);
395 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
396 TII.get(X86::AND8ri), AndResult)
397 .addReg(ValReg, getKillRegState(ValIsKill)).addImm(1);
398 ValReg = AndResult;
399 }
400 // FALLTHROUGH, handling i1 as i8.
401 case MVT::i8: Opc = X86::MOV8mr; break;
402 case MVT::i16: Opc = X86::MOV16mr; break;
403 case MVT::i32: Opc = X86::MOV32mr; break;
404 case MVT::i64: Opc = X86::MOV64mr; break; // Must be in x86-64 mode.
405 case MVT::f32:
406 Opc = X86ScalarSSEf32 ?
407 (Subtarget->hasAVX() ? X86::VMOVSSmr : X86::MOVSSmr) : X86::ST_Fp32m;
408 break;
409 case MVT::f64:
410 Opc = X86ScalarSSEf64 ?
411 (Subtarget->hasAVX() ? X86::VMOVSDmr : X86::MOVSDmr) : X86::ST_Fp64m;
412 break;
413 case MVT::v4f32:
414 if (Aligned)
415 Opc = Subtarget->hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
416 else
417 Opc = Subtarget->hasAVX() ? X86::VMOVUPSmr : X86::MOVUPSmr;
418 break;
419 case MVT::v2f64:
420 if (Aligned)
421 Opc = Subtarget->hasAVX() ? X86::VMOVAPDmr : X86::MOVAPDmr;
422 else
423 Opc = Subtarget->hasAVX() ? X86::VMOVUPDmr : X86::MOVUPDmr;
424 break;
425 case MVT::v4i32:
426 case MVT::v2i64:
427 case MVT::v8i16:
428 case MVT::v16i8:
429 if (Aligned)
430 Opc = Subtarget->hasAVX() ? X86::VMOVDQAmr : X86::MOVDQAmr;
431 else
432 Opc = Subtarget->hasAVX() ? X86::VMOVDQUmr : X86::MOVDQUmr;
433 break;
434 }
435
436 MachineInstrBuilder MIB =
437 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc));
438 addFullAddress(MIB, AM).addReg(ValReg, getKillRegState(ValIsKill));
439 if (MMO)
440 MIB->addMemOperand(*FuncInfo.MF, MMO);
441
442 return true;
443 }
444
445 bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val,
446 const X86AddressMode &AM,
447 MachineMemOperand *MMO, bool Aligned) {
448 // Handle 'null' like i32/i64 0.
449 if (isa<ConstantPointerNull>(Val))
450 Val = Constant::getNullValue(DL.getIntPtrType(Val->getContext()));
451
452 // If this is a store of a simple constant, fold the constant into the store.
453 if (const ConstantInt *CI = dyn_cast<ConstantInt>(Val)) {
454 unsigned Opc = 0;
455 bool Signed = true;
456 switch (VT.getSimpleVT().SimpleTy) {
457 default: break;
458 case MVT::i1: Signed = false; // FALLTHROUGH to handle as i8.
459 case MVT::i8: Opc = X86::MOV8mi; break;
460 case MVT::i16: Opc = X86::MOV16mi; break;
461 case MVT::i32: Opc = X86::MOV32mi; break;
462 case MVT::i64:
463 // Must be a 32-bit sign extended value.
464 if (isInt<32>(CI->getSExtValue()))
465 Opc = X86::MOV64mi32;
466 break;
467 }
468
469 if (Opc) {
470 MachineInstrBuilder MIB =
471 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc));
472 addFullAddress(MIB, AM).addImm(Signed ? (uint64_t) CI->getSExtValue()
473 : CI->getZExtValue());
474 if (MMO)
475 MIB->addMemOperand(*FuncInfo.MF, MMO);
476 return true;
477 }
478 }
479
480 unsigned ValReg = getRegForValue(Val);
481 if (ValReg == 0)
482 return false;
483
484 bool ValKill = hasTrivialKill(Val);
485 return X86FastEmitStore(VT, ValReg, ValKill, AM, MMO, Aligned);
486 }
487
488 /// X86FastEmitExtend - Emit a machine instruction to extend a value Src of
489 /// type SrcVT to type DstVT using the specified extension opcode Opc (e.g.
490 /// ISD::SIGN_EXTEND).
491 bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT,
492 unsigned Src, EVT SrcVT,
493 unsigned &ResultReg) {
494 unsigned RR = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc,
495 Src, /*TODO: Kill=*/false);
496 if (RR == 0)
497 return false;
498
499 ResultReg = RR;
500 return true;
501 }
502
503 bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) {
504 // Handle constant address.
505 if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
506 // Can't handle alternate code models yet.
507 if (TM.getCodeModel() != CodeModel::Small)
508 return false;
509
510 // Can't handle TLS yet.
511 if (GV->isThreadLocal())
512 return false;
513
514 // RIP-relative addresses can't have additional register operands, so if
515 // we've already folded stuff into the addressing mode, just force the
516 // global value into its own register, which we can use as the basereg.
517 if (!Subtarget->isPICStyleRIPRel() ||
518 (AM.Base.Reg == 0 && AM.IndexReg == 0)) {
519 // Okay, we've committed to selecting this global. Set up the address.
520 AM.GV = GV;
521
522 // Allow the subtarget to classify the global.
523 unsigned char GVFlags = Subtarget->ClassifyGlobalReference(GV, TM);
524
525 // If this reference is relative to the pic base, set it now.
526 if (isGlobalRelativeToPICBase(GVFlags)) {
527 // FIXME: How do we know Base.Reg is free??
528 AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
529 }
530
531 // Unless the ABI requires an extra load, return a direct reference to
532 // the global.
533 if (!isGlobalStubReference(GVFlags)) {
534 if (Subtarget->isPICStyleRIPRel()) {
535 // Use rip-relative addressing if we can. Above we verified that the
536 // base and index registers are unused.
537 assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
538 AM.Base.Reg = X86::RIP;
539 }
540 AM.GVOpFlags = GVFlags;
541 return true;
542 }
543
544 // Ok, we need to do a load from a stub. If we've already loaded from
545 // this stub, reuse the loaded pointer, otherwise emit the load now.
546 DenseMap<const Value *, unsigned>::iterator I = LocalValueMap.find(V);
547 unsigned LoadReg;
548 if (I != LocalValueMap.end() && I->second != 0) {
549 LoadReg = I->second;
550 } else {
551 // Issue load from stub.
552 unsigned Opc = 0;
553 const TargetRegisterClass *RC = nullptr;
554 X86AddressMode StubAM;
555 StubAM.Base.Reg = AM.Base.Reg;
556 StubAM.GV = GV;
557 StubAM.GVOpFlags = GVFlags;
558
559 // Prepare for inserting code in the local-value area.
560 SavePoint SaveInsertPt = enterLocalValueArea();
561
562 if (TLI.getPointerTy() == MVT::i64) {
563 Opc = X86::MOV64rm;
564 RC = &X86::GR64RegClass;
565
566 if (Subtarget->isPICStyleRIPRel())
567 StubAM.Base.Reg = X86::RIP;
568 } else {
569 Opc = X86::MOV32rm;
570 RC = &X86::GR32RegClass;
571 }
572
573 LoadReg = createResultReg(RC);
574 MachineInstrBuilder LoadMI =
575 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), LoadReg);
576 addFullAddress(LoadMI, StubAM);
577
578 // Ok, back to normal mode.
579 leaveLocalValueArea(SaveInsertPt);
580
581 // Prevent loading GV stub multiple times in same MBB.
582 LocalValueMap[V] = LoadReg;
583 }
584
585 // Now construct the final address. Note that the Disp, Scale,
586 // and Index values may already be set here.
587 AM.Base.Reg = LoadReg;
588 AM.GV = nullptr;
589 return true;
590 }
591 }
592
593 // If all else fails, try to materialize the value in a register.
594 if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
595 if (AM.Base.Reg == 0) {
596 AM.Base.Reg = getRegForValue(V);
597 return AM.Base.Reg != 0;
598 }
599 if (AM.IndexReg == 0) {
600 assert(AM.Scale == 1 && "Scale with no index!");
601 AM.IndexReg = getRegForValue(V);
602 return AM.IndexReg != 0;
603 }
604 }
605
606 return false;
607 }
608
609 /// X86SelectAddress - Attempt to fill in an address from the given value.
610 ///
611 bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
612 SmallVector<const Value *, 32> GEPs;
613 redo_gep:
614 const User *U = nullptr;
615 unsigned Opcode = Instruction::UserOp1;
616 if (const Instruction *I = dyn_cast<Instruction>(V)) {
617 // Don't walk into other basic blocks; it's possible we haven't
618 // visited them yet, so the instructions may not yet be assigned
619 // virtual registers.
620 if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(V)) ||
621 FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
622 Opcode = I->getOpcode();
623 U = I;
624 }
625 } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
626 Opcode = C->getOpcode();
627 U = C;
628 }
629
630 if (PointerType *Ty = dyn_cast<PointerType>(V->getType()))
631 if (Ty->getAddressSpace() > 255)
632 // Fast instruction selection doesn't support the special
633 // address spaces.
634 return false;
635
636 switch (Opcode) {
637 default: break;
638 case Instruction::BitCast:
639 // Look past bitcasts.
640 return X86SelectAddress(U->getOperand(0), AM);
641
642 case Instruction::IntToPtr:
643 // Look past no-op inttoptrs.
644 if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
645 return X86SelectAddress(U->getOperand(0), AM);
646 break;
647
648 case Instruction::PtrToInt:
649 // Look past no-op ptrtoints.
650 if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
651 return X86SelectAddress(U->getOperand(0), AM);
652 break;
653
654 case Instruction::Alloca: {
655 // Do static allocas.
656 const AllocaInst *A = cast<AllocaInst>(V);
657 DenseMap<const AllocaInst *, int>::iterator SI =
658 FuncInfo.StaticAllocaMap.find(A);
659 if (SI != FuncInfo.StaticAllocaMap.end()) {
660 AM.BaseType = X86AddressMode::FrameIndexBase;
661 AM.Base.FrameIndex = SI->second;
662 return true;
663 }
664 break;
665 }
666
667 case Instruction::Add: {
668 // Adds of constants are common and easy enough.
669 if (const ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) {
670 uint64_t Disp = (int32_t)AM.Disp + (uint64_t)CI->getSExtValue();
671 // They have to fit in the 32-bit signed displacement field though.
672 if (isInt<32>(Disp)) {
673 AM.Disp = (uint32_t)Disp;
674 return X86SelectAddress(U->getOperand(0), AM);
675 }
676 }
677 break;
678 }
679
680 case Instruction::GetElementPtr: {
681 X86AddressMode SavedAM = AM;
682
683 // Pattern-match simple GEPs.
684 uint64_t Disp = (int32_t)AM.Disp;
685 unsigned IndexReg = AM.IndexReg;
686 unsigned Scale = AM.Scale;
687 gep_type_iterator GTI = gep_type_begin(U);
688 // Iterate through the indices, folding what we can. Constants can be
689 // folded, and one dynamic index can be handled, if the scale is supported.
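// (For example, a GEP into an array of i32 with one variable index can be
// folded as index*4 plus a constant displacement; a second variable index
// is not supported.)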
690 for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end();
691 i != e; ++i, ++GTI) {
692 const Value *Op = *i;
693 if (StructType *STy = dyn_cast<StructType>(*GTI)) {
694 const StructLayout *SL = DL.getStructLayout(STy);
695 Disp += SL->getElementOffset(cast<ConstantInt>(Op)->getZExtValue());
696 continue;
697 }
698
699 // An array/variable index is always of the form i*S where S is the
700 // constant scale size. See if we can push the scale into immediates.
701 uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
702 for (;;) {
703 if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
704 // Constant-offset addressing.
705 Disp += CI->getSExtValue() * S;
706 break;
707 }
708 if (canFoldAddIntoGEP(U, Op)) {
709 // A compatible add with a constant operand. Fold the constant.
710 ConstantInt *CI =
711 cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
712 Disp += CI->getSExtValue() * S;
713 // Iterate on the other operand.
714 Op = cast<AddOperator>(Op)->getOperand(0);
715 continue;
716 }
717 if (IndexReg == 0 &&
718 (!AM.GV || !Subtarget->isPICStyleRIPRel()) &&
719 (S == 1 || S == 2 || S == 4 || S == 8)) {
720 // Scaled-index addressing.
721 Scale = S;
722 IndexReg = getRegForGEPIndex(Op).first;
723 if (IndexReg == 0)
724 return false;
725 break;
726 }
727 // Unsupported.
728 goto unsupported_gep;
729 }
730 }
731
732 // Check for displacement overflow.
733 if (!isInt<32>(Disp))
734 break;
735
736 AM.IndexReg = IndexReg;
737 AM.Scale = Scale;
738 AM.Disp = (uint32_t)Disp;
739 GEPs.push_back(V);
740
741 if (const GetElementPtrInst *GEP =
742 dyn_cast<GetElementPtrInst>(U->getOperand(0))) {
743 // Ok, the GEP indices were covered by constant-offset and scaled-index
744 // addressing. Update the address state and move on to examining the base.
745 V = GEP;
746 goto redo_gep;
747 } else if (X86SelectAddress(U->getOperand(0), AM)) {
748 return true;
749 }
750
751 // If we couldn't merge the gep value into this addr mode, revert back to
752 // our address and just match the value instead of completely failing.
753 AM = SavedAM;
754
755 for (SmallVectorImpl<const Value *>::reverse_iterator
756 I = GEPs.rbegin(), E = GEPs.rend(); I != E; ++I)
757 if (handleConstantAddresses(*I, AM))
758 return true;
759
760 return false;
761 unsupported_gep:
762 // Ok, the GEP indices weren't all covered.
763 break;
764 }
765 }
766
767 return handleConstantAddresses(V, AM);
768 }
769
770 /// X86SelectCallAddress - Attempt to fill in an address from the given value.
771 ///
772 bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
773 const User *U = nullptr;
774 unsigned Opcode = Instruction::UserOp1;
775 const Instruction *I = dyn_cast<Instruction>(V);
776 // Record if the value is defined in the same basic block.
777 //
778 // This information is crucial to know whether or not folding an
779 // operand is valid.
780 // Indeed, FastISel generates or reuses a virtual register for all
781 // operands of all instructions it selects. Obviously, the definition and
782 // its uses must use the same virtual register otherwise the produced
783 // code is incorrect.
784 // Before instruction selection, FunctionLoweringInfo::set sets the virtual
785 // registers for values that are alive across basic blocks. This ensures
786 // that the values are set consistently across basic blocks, even
787 // if different instruction selection mechanisms are used (e.g., a mix of
788 // SDISel and FastISel).
789 // For values local to a basic block, the instruction selection process
790 // generates these virtual registers with whatever method is appropriate
791 // for its needs. In particular, FastISel and SDISel do not share the way
792 // local virtual registers are set.
793 // Therefore, it is impossible (or at least unsafe) to share values
794 // between basic blocks unless they use the same instruction selection
795 // method, which is not guaranteed for X86.
796 // Moreover, things like hasOneUse could not be used accurately if we
797 // allowed references to values across basic blocks when they are not
798 // alive across basic blocks initially.
799 bool InMBB = true;
800 if (I) {
801 Opcode = I->getOpcode();
802 U = I;
803 InMBB = I->getParent() == FuncInfo.MBB->getBasicBlock();
804 } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
805 Opcode = C->getOpcode();
806 U = C;
807 }
808
809 switch (Opcode) {
810 default: break;
811 case Instruction::BitCast:
812 // Look past bitcasts if its operand is in the same BB.
813 if (InMBB)
814 return X86SelectCallAddress(U->getOperand(0), AM);
815 break;
816
817 case Instruction::IntToPtr:
818 // Look past no-op inttoptrs if its operand is in the same BB.
819 if (InMBB &&
820 TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
821 return X86SelectCallAddress(U->getOperand(0), AM);
822 break;
823
824 case Instruction::PtrToInt:
825 // Look past no-op ptrtoints if its operand is in the same BB.
826 if (InMBB &&
827 TLI.getValueType(U->getType()) == TLI.getPointerTy())
828 return X86SelectCallAddress(U->getOperand(0), AM);
829 break;
830 }
831
832 // Handle constant address.
833 if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
834 // Can't handle alternate code models yet.
835 if (TM.getCodeModel() != CodeModel::Small)
836 return false;
837
838 // RIP-relative addresses can't have additional register operands.
839 if (Subtarget->isPICStyleRIPRel() &&
840 (AM.Base.Reg != 0 || AM.IndexReg != 0))
841 return false;
842
843 // Can't handle DLL Import.
844 if (GV->hasDLLImportStorageClass())
845 return false;
846
847 // Can't handle TLS.
848 if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
849 if (GVar->isThreadLocal())
850 return false;
851
852 // Okay, we've committed to selecting this global. Set up the basic address.
853 AM.GV = GV;
854
855 // No ABI requires an extra load for anything other than DLLImport, which
856 // we rejected above. Return a direct reference to the global.
857 if (Subtarget->isPICStyleRIPRel()) {
858 // Use rip-relative addressing if we can. Above we verified that the
859 // base and index registers are unused.
860 assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
861 AM.Base.Reg = X86::RIP;
862 } else if (Subtarget->isPICStyleStubPIC()) {
863 AM.GVOpFlags = X86II::MO_PIC_BASE_OFFSET;
864 } else if (Subtarget->isPICStyleGOT()) {
865 AM.GVOpFlags = X86II::MO_GOTOFF;
866 }
867
868 return true;
869 }
870
871 // If all else fails, try to materialize the value in a register.
872 if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
873 if (AM.Base.Reg == 0) {
874 AM.Base.Reg = getRegForValue(V);
875 return AM.Base.Reg != 0;
876 }
877 if (AM.IndexReg == 0) {
878 assert(AM.Scale == 1 && "Scale with no index!");
879 AM.IndexReg = getRegForValue(V);
880 return AM.IndexReg != 0;
881 }
882 }
883
884 return false;
885 }
886
887
888 /// X86SelectStore - Select and emit code to implement store instructions.
889 bool X86FastISel::X86SelectStore(const Instruction *I) {
890 // Atomic stores need special handling.
891 const StoreInst *S = cast<StoreInst>(I);
892
893 if (S->isAtomic())
894 return false;
895
896 const Value *Val = S->getValueOperand();
897 const Value *Ptr = S->getPointerOperand();
898
899 MVT VT;
900 if (!isTypeLegal(Val->getType(), VT, /*AllowI1=*/true))
901 return false;
902
903 unsigned Alignment = S->getAlignment();
904 unsigned ABIAlignment = DL.getABITypeAlignment(Val->getType());
905 if (Alignment == 0) // Ensure that codegen never sees alignment 0
906 Alignment = ABIAlignment;
907 bool Aligned = Alignment >= ABIAlignment;
908
909 X86AddressMode AM;
910 if (!X86SelectAddress(Ptr, AM))
911 return false;
912
913 return X86FastEmitStore(VT, Val, AM, createMachineMemOperandFor(I), Aligned);
914 }
915
916 /// X86SelectRet - Select and emit code to implement ret instructions.
917 bool X86FastISel::X86SelectRet(const Instruction *I) {
918 const ReturnInst *Ret = cast<ReturnInst>(I);
919 const Function &F = *I->getParent()->getParent();
920 const X86MachineFunctionInfo *X86MFInfo =
921 FuncInfo.MF->getInfo<X86MachineFunctionInfo>();
922
923 if (!FuncInfo.CanLowerReturn)
924 return false;
925
926 CallingConv::ID CC = F.getCallingConv();
927 if (CC != CallingConv::C &&
928 CC != CallingConv::Fast &&
929 CC != CallingConv::X86_FastCall &&
930 CC != CallingConv::X86_64_SysV)
931 return false;
932
933 if (Subtarget->isCallingConvWin64(CC))
934 return false;
935
936 // Don't handle popping bytes on return for now.
937 if (X86MFInfo->getBytesToPopOnReturn() != 0)
938 return false;
939
940 // fastcc with -tailcallopt is intended to provide a guaranteed
941 // tail call optimization. Fastisel doesn't know how to do that.
942 if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt)
943 return false;
944
945 // Let SDISel handle vararg functions.
946 if (F.isVarArg())
947 return false;
948
949 // Build a list of return value registers.
950 SmallVector<unsigned, 4> RetRegs;
951
952 if (Ret->getNumOperands() > 0) {
953 SmallVector<ISD::OutputArg, 4> Outs;
954 GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI);
955
956 // Analyze operands of the call, assigning locations to each operand.
957 SmallVector<CCValAssign, 16> ValLocs;
958 CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, I->getContext());
959 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
960
961 const Value *RV = Ret->getOperand(0);
962 unsigned Reg = getRegForValue(RV);
963 if (Reg == 0)
964 return false;
965
966 // Only handle a single return value for now.
967 if (ValLocs.size() != 1)
968 return false;
969
970 CCValAssign &VA = ValLocs[0];
971
972 // Don't bother handling odd stuff for now.
973 if (VA.getLocInfo() != CCValAssign::Full)
974 return false;
975 // Only handle register returns for now.
976 if (!VA.isRegLoc())
977 return false;
978
979 // The calling-convention tables for x87 returns don't tell
980 // the whole story.
981 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
982 return false;
983
984 unsigned SrcReg = Reg + VA.getValNo();
985 EVT SrcVT = TLI.getValueType(RV->getType());
986 EVT DstVT = VA.getValVT();
987 // Special handling for extended integers.
988 if (SrcVT != DstVT) {
989 if (SrcVT != MVT::i1 && SrcVT != MVT::i8 && SrcVT != MVT::i16)
990 return false;
991
992 if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt())
993 return false;
994
995 assert(DstVT == MVT::i32 && "X86 should always ext to i32");
996
997 if (SrcVT == MVT::i1) {
998 if (Outs[0].Flags.isSExt())
999 return false;
1000 SrcReg = fastEmitZExtFromI1(MVT::i8, SrcReg, /*TODO: Kill=*/false);
1001 SrcVT = MVT::i8;
1002 }
1003 unsigned Op = Outs[0].Flags.isZExt() ? ISD::ZERO_EXTEND :
1004 ISD::SIGN_EXTEND;
1005 SrcReg = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op,
1006 SrcReg, /*TODO: Kill=*/false);
1007 }
1008
1009 // Make the copy.
1010 unsigned DstReg = VA.getLocReg();
1011 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
1012 // Avoid a cross-class copy. This is very unlikely.
1013 if (!SrcRC->contains(DstReg))
1014 return false;
1015 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
1016 TII.get(TargetOpcode::COPY), DstReg).addReg(SrcReg);
1017
1018 // Add register to return instruction.
1019 RetRegs.push_back(VA.getLocReg());
1020 }
1021
1022 // The x86-64 ABI for returning structs by value requires that we copy
1023 // the sret argument into %rax for the return. We saved the argument into
1024 // a virtual register in the entry block, so now we copy the value out
1025 // and into %rax. We also do the same with %eax for Win32.
1026 if (F.hasStructRetAttr() &&
1027 (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
1028 unsigned Reg = X86MFInfo->getSRetReturnReg();
1029 assert(Reg &&
1030 "SRetReturnReg should have been set in LowerFormalArguments()!");
1031 unsigned RetReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
1032 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
1033 TII.get(TargetOpcode::COPY), RetReg).addReg(Reg);
1034 RetRegs.push_back(RetReg);
1035 }
1036
1037 // Now emit the RET.
1038 MachineInstrBuilder MIB =
1039 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
1040 TII.get(Subtarget->is64Bit() ? X86::RETQ : X86::RETL));
1041 for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
1042 MIB.addReg(RetRegs[i], RegState::Implicit);
1043 return true;
1044 }
1045
1046 /// X86SelectLoad - Select and emit code to implement load instructions.
1047 ///
1048 bool X86FastISel::X86SelectLoad(const Instruction *I) {
1049 const LoadInst *LI = cast<LoadInst>(I);
1050
1051 // Atomic loads need special handling.
1052 if (LI->isAtomic())
1053 return false;
1054
1055 MVT VT;
1056 if (!isTypeLegal(LI->getType(), VT, /*AllowI1=*/true))
1057 return false;
1058
1059 const Value *Ptr = LI->getPointerOperand();
1060
1061 X86AddressMode AM;
1062 if (!X86SelectAddress(Ptr, AM))
1063 return false;
1064
1065 unsigned ResultReg = 0;
1066 if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg))
1067 return false;
1068
1069 updateValueMap(I, ResultReg);
1070 return true;
1071 }
1072
1073 static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) {
1074 bool HasAVX = Subtarget->hasAVX();
1075 bool X86ScalarSSEf32 = Subtarget->hasSSE1();
1076 bool X86ScalarSSEf64 = Subtarget->hasSSE2();
1077
1078 switch (VT.getSimpleVT().SimpleTy) {
1079 default: return 0;
1080 case MVT::i8: return X86::CMP8rr;
1081 case MVT::i16: return X86::CMP16rr;
1082 case MVT::i32: return X86::CMP32rr;
1083 case MVT::i64: return X86::CMP64rr;
1084 case MVT::f32:
1085 return X86ScalarSSEf32 ? (HasAVX ? X86::VUCOMISSrr : X86::UCOMISSrr) : 0;
1086 case MVT::f64:
1087 return X86ScalarSSEf64 ? (HasAVX ? X86::VUCOMISDrr : X86::UCOMISDrr) : 0;
1088 }
1089 }
1090
1091 /// X86ChooseCmpImmediateOpcode - If the RHS of the comparison is a constant
1092 /// integer, return an opcode that folds it as an immediate (e.g. CMP32ri);
1093 /// otherwise return 0.
1094 static unsigned X86ChooseCmpImmediateOpcode(EVT VT, const ConstantInt *RHSC) {
1095 switch (VT.getSimpleVT().SimpleTy) {
1096 // Otherwise, we can't fold the immediate into this comparison.
1097 default: return 0;
1098 case MVT::i8: return X86::CMP8ri;
1099 case MVT::i16: return X86::CMP16ri;
1100 case MVT::i32: return X86::CMP32ri;
1101 case MVT::i64:
1102 // 64-bit comparisons are only valid if the immediate fits in a 32-bit sext
1103 // field.
1104 if ((int)RHSC->getSExtValue() == RHSC->getSExtValue())
1105 return X86::CMP64ri32;
1106 return 0;
1107 }
1108 }
1109
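The i64 case above only folds the immediate when it is representable as a sign-extended 32-bit value, because CMP64ri32 sign-extends its immediate operand. A minimal standalone sketch of that check (hypothetical helper, not part of the patch):

#include <cstdint>

// True if Imm round-trips through a 32-bit truncate and sign-extend,
// i.e. it could be encoded as the immediate of a CMP64ri32.
static bool fitsInSExt32(int64_t Imm) {
  return static_cast<int64_t>(static_cast<int32_t>(Imm)) == Imm;
}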
1110 bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1,
1111 EVT VT, DebugLoc CurDbgLoc) {
1112 unsigned Op0Reg = getRegForValue(Op0);
1113 if (Op0Reg == 0) return false;
1114
1115 // Handle 'null' like i32/i64 0.
1116 if (isa<ConstantPointerNull>(Op1))
1117 Op1 = Constant::getNullValue(DL.getIntPtrType(Op0->getContext()));
1118
1119 // We have two options: compare with register or immediate. If the RHS of
1120 // the compare is an immediate that we can fold into this compare, use
1121 // CMPri, otherwise use CMPrr.
1122 if (const ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
1123 if (unsigned CompareImmOpc = X86ChooseCmpImmediateOpcode(VT, Op1C)) {
1124 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareImmOpc))
1125 .addReg(Op0Reg)
1126 .addImm(Op1C->getSExtValue());
1127 return true;
1128 }
1129 }
1130
1131 unsigned CompareOpc = X86ChooseCmpOpcode(VT, Subtarget);
1132 if (CompareOpc == 0) return false;
1133
1134 unsigned Op1Reg = getRegForValue(Op1);
1135 if (Op1Reg == 0) return false;
1136 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareOpc))
1137 .addReg(Op0Reg)
1138 .addReg(Op1Reg);
1139
1140 return true;
1141 }
1142
1143 bool X86FastISel::X86SelectCmp(const Instruction *I) {
1144 const CmpInst *CI = cast<CmpInst>(I);
1145
1146 MVT VT;
1147 if (!isTypeLegal(I->getOperand(0)->getType(), VT))
1148 return false;
1149
1150 // Try to optimize or fold the cmp.
1151 CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
1152 unsigned ResultReg = 0;
1153 switch (Predicate) {
1154 default: break;
1155 case CmpInst::FCMP_FALSE: {
1156 ResultReg = createResultReg(&X86::GR32RegClass);
1157 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32r0),
1158 ResultReg);
1159 ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultReg, /*Kill=*/true,
1160 X86::sub_8bit);
1161 if (!ResultReg)
1162 return false;
1163 break;
1164 }
1165 case CmpInst::FCMP_TRUE: {
1166 ResultReg = createResultReg(&X86::GR8RegClass);
1167 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri),
1168 ResultReg).addImm(1);
1169 break;
1170 }
1171 }
1172
1173 if (ResultReg) {
1174 updateValueMap(I, ResultReg);
1175 return true;
1176 }
1177
1178 const Value *LHS = CI->getOperand(0);
1179 const Value *RHS = CI->getOperand(1);
1180
1181 // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0.
1182 // We don't have to materialize a zero constant for this case and can just use
1183 // %x again on the RHS.
1184 if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
1185 const auto *RHSC = dyn_cast<ConstantFP>(RHS);
1186 if (RHSC && RHSC->isNullValue())
1187 RHS = LHS;
1188 }
1189
1190 // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
1191 static unsigned SETFOpcTable[2][3] = {
1192 { X86::SETEr, X86::SETNPr, X86::AND8rr },
1193 { X86::SETNEr, X86::SETPr, X86::OR8rr }
1194 };
1195 unsigned *SETFOpc = nullptr;
1196 switch (Predicate) {
1197 default: break;
1198 case CmpInst::FCMP_OEQ: SETFOpc = &SETFOpcTable[0][0]; break;
1199 case CmpInst::FCMP_UNE: SETFOpc = &SETFOpcTable[1][0]; break;
1200 }
1201
1202 ResultReg = createResultReg(&X86::GR8RegClass);
1203 if (SETFOpc) {
1204 if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc()))
1205 return false;
1206
1207 unsigned FlagReg1 = createResultReg(&X86::GR8RegClass);
1208 unsigned FlagReg2 = createResultReg(&X86::GR8RegClass);
1209 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]),
1210 FlagReg1);
1211 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]),
1212 FlagReg2);
1213 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[2]),
1214 ResultReg).addReg(FlagReg1).addReg(FlagReg2);
1215 updateValueMap(I, ResultReg);
1216 return true;
1217 }
1218
1219 X86::CondCode CC;
1220 bool SwapArgs;
1221 std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate);
1222 assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
1223 unsigned Opc = X86::getSETFromCond(CC);
1224
1225 if (SwapArgs)
1226 std::swap(LHS, RHS);
1227
1228 // Emit a compare of LHS/RHS.
1229 if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc()))
1230 return false;
1231
1232 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
1233 updateValueMap(I, ResultReg);
1234 return true;
1235 }
1236
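For reference, UCOMISS/UCOMISD set ZF when the operands are equal or unordered and PF when they are unordered, which is why FCMP_OEQ needs SETE combined with SETNP and FCMP_UNE needs SETNE combined with SETP. A small model of that flag logic (illustrative only, assuming the usual x86 flag semantics):

#include <cmath>

// Models the ucomisd flag results and the two-setcc combines above.
static bool fcmpOEQ(double A, double B) {
  bool PF = std::isnan(A) || std::isnan(B); // unordered
  bool ZF = PF || (A == B);                 // equal or unordered
  return ZF && !PF;                         // SETE & SETNP -> AND8rr
}

static bool fcmpUNE(double A, double B) {
  bool PF = std::isnan(A) || std::isnan(B);
  bool ZF = PF || (A == B);
  return !ZF || PF;                         // SETNE | SETP -> OR8rr
}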
1237 bool X86FastISel::X86SelectZExt(const Instruction *I) {
1238 EVT DstVT = TLI.getValueType(I->getType());
1239 if (!TLI.isTypeLegal(DstVT))
1240 return false;
1241
1242 unsigned ResultReg = getRegForValue(I->getOperand(0));
1243 if (ResultReg == 0)
1244 return false;
1245
1246 // Handle zero-extension from i1 to i8, which is common.
1247 MVT SrcVT = TLI.getSimpleValueType(I->getOperand(0)->getType());
1248 if (SrcVT.SimpleTy == MVT::i1) {
1249 // Set the high bits to zero.
1250 ResultReg = fastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false);
1251 SrcVT = MVT::i8;
1252
1253 if (ResultReg == 0)
1254 return false;
1255 }
1256
1257 if (DstVT == MVT::i64) {
1258 // Handle extension to 64-bits via sub-register shenanigans.
1259 unsigned MovInst;
1260
1261 switch (SrcVT.SimpleTy) {
1262 case MVT::i8: MovInst = X86::MOVZX32rr8; break;
1263 case MVT::i16: MovInst = X86::MOVZX32rr16; break;
1264 case MVT::i32: MovInst = X86::MOV32rr; break;
1265 default: llvm_unreachable("Unexpected zext to i64 source type");
1266 }
1267
1268 unsigned Result32 = createResultReg(&X86::GR32RegClass);
1269 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MovInst), Result32)
1270 .addReg(ResultReg);
1271
1272 ResultReg = createResultReg(&X86::GR64RegClass);
1273 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::SUBREG_TO_REG),
1274 ResultReg)
1275 .addImm(0).addReg(Result32).addImm(X86::sub_32bit);
1276 } else if (DstVT != MVT::i8) {
1277 ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND,
1278 ResultReg, /*Kill=*/true);
1279 if (ResultReg == 0)
1280 return false;
1281 }
1282
1283 updateValueMap(I, ResultReg);
1284 return true;
1285 }
1286
1287 bool X86FastISel::X86SelectBranch(const Instruction *I) {
1288 // Unconditional branches are selected by tablegen-generated code.
1289 // Handle a conditional branch.
1290 const BranchInst *BI = cast<BranchInst>(I);
1291 MachineBasicBlock *TrueMBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
1292 MachineBasicBlock *FalseMBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
1293
1294 // Fold the common case of a conditional branch with a comparison
1295 // in the same block (values defined on other blocks may not have
1296 // initialized registers).
1297 X86::CondCode CC;
1298 if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
1299 if (CI->hasOneUse() && CI->getParent() == I->getParent()) {
1300 EVT VT = TLI.getValueType(CI->getOperand(0)->getType());
1301
1302 // Try to optimize or fold the cmp.
1303 CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
1304 switch (Predicate) {
1305 default: break;
1306 case CmpInst::FCMP_FALSE: fastEmitBranch(FalseMBB, DbgLoc); return true;
1307 case CmpInst::FCMP_TRUE: fastEmitBranch(TrueMBB, DbgLoc); return true;
1308 }
1309
1310 const Value *CmpLHS = CI->getOperand(0);
1311 const Value *CmpRHS = CI->getOperand(1);
1312
1313 // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x,
1314 // 0.0.
1315 // We don't have to materialize a zero constant for this case and can just
1316 // use %x again on the RHS.
1317 if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
1318 const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS);
1319 if (CmpRHSC && CmpRHSC->isNullValue())
1320 CmpRHS = CmpLHS;
1321 }
1322
1323 // Try to take advantage of fallthrough opportunities.
1324 if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
1325 std::swap(TrueMBB, FalseMBB);
1326 Predicate = CmpInst::getInversePredicate(Predicate);
1327 }
1328
1329 // FCMP_OEQ and FCMP_UNE cannot be expressed with a single flag/condition
1330 // code check. Instead two branch instructions are required to check all
1331 // the flags. First we change the predicate to a supported condition code,
1332 // which will be the first branch. Later on we will emit the second
1333 // branch.
1334 bool NeedExtraBranch = false;
1335 switch (Predicate) {
1336 default: break;
1337 case CmpInst::FCMP_OEQ:
1338 std::swap(TrueMBB, FalseMBB); // fall-through
1339 case CmpInst::FCMP_UNE:
1340 NeedExtraBranch = true;
1341 Predicate = CmpInst::FCMP_ONE;
1342 break;
1343 }
1344
1345 bool SwapArgs;
1346 unsigned BranchOpc;
1347 std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate);
1348 assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
1349
1350 BranchOpc = X86::GetCondBranchFromCond(CC);
1351 if (SwapArgs)
1352 std::swap(CmpLHS, CmpRHS);
1353
1354 // Emit a compare of the LHS and RHS, setting the flags.
1355 if (!X86FastEmitCompare(CmpLHS, CmpRHS, VT, CI->getDebugLoc()))
1356 return false;
1357
1358 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc))
1359 .addMBB(TrueMBB);
1360
1361 // X86 requires a second branch to handle UNE (and OEQ, which is mapped
1362 // to UNE above).
1363 if (NeedExtraBranch) {
1364 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JP_1))
1365 .addMBB(TrueMBB);
1366 }
1367
1368 // Obtain the branch weight and add the TrueBB to the successor list.
1369 uint32_t BranchWeight = 0;
1370 if (FuncInfo.BPI)
1371 BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
1372 TrueMBB->getBasicBlock());
1373 FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight);
1374
1375 // Emits an unconditional branch to the FalseBB, obtains the branch
1376 // weight, and adds it to the successor list.
1377 fastEmitBranch(FalseMBB, DbgLoc);
1378
1379 return true;
1380 }
1381 } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
1382 // Handle things like "%cond = trunc i32 %X to i1 / br i1 %cond", which
1383 // typically happen for _Bool and C++ bools.
1384 MVT SourceVT;
1385 if (TI->hasOneUse() && TI->getParent() == I->getParent() &&
1386 isTypeLegal(TI->getOperand(0)->getType(), SourceVT)) {
1387 unsigned TestOpc = 0;
1388 switch (SourceVT.SimpleTy) {
1389 default: break;
1390 case MVT::i8: TestOpc = X86::TEST8ri; break;
1391 case MVT::i16: TestOpc = X86::TEST16ri; break;
1392 case MVT::i32: TestOpc = X86::TEST32ri; break;
1393 case MVT::i64: TestOpc = X86::TEST64ri32; break;
1394 }
1395 if (TestOpc) {
1396 unsigned OpReg = getRegForValue(TI->getOperand(0));
1397 if (OpReg == 0) return false;
1398 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TestOpc))
1399 .addReg(OpReg).addImm(1);
1400
1401 unsigned JmpOpc = X86::JNE_1;
1402 if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
1403 std::swap(TrueMBB, FalseMBB);
1404 JmpOpc = X86::JE_1;
1405 }
1406
1407 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(JmpOpc))
1408 .addMBB(TrueMBB);
1409 fastEmitBranch(FalseMBB, DbgLoc);
1410 uint32_t BranchWeight = 0;
1411 if (FuncInfo.BPI)
1412 BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
1413 TrueMBB->getBasicBlock());
1414 FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight);
1415 return true;
1416 }
1417 }
1418 } else if (foldX86XALUIntrinsic(CC, BI, BI->getCondition())) {
1419 // Fake request the condition, otherwise the intrinsic might be completely
1420 // optimized away.
1421 unsigned TmpReg = getRegForValue(BI->getCondition());
1422 if (TmpReg == 0)
1423 return false;
1424
1425 unsigned BranchOpc = X86::GetCondBranchFromCond(CC);
1426
1427 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc))
1428 .addMBB(TrueMBB);
1429 fastEmitBranch(FalseMBB, DbgLoc);
1430 uint32_t BranchWeight = 0;
1431 if (FuncInfo.BPI)
1432 BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
1433 TrueMBB->getBasicBlock());
1434 FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight);
1435 return true;
1436 }
1437
1438 // Otherwise do a clumsy setcc and re-test it.
1439 // Note that i1 essentially gets ANY_EXTEND'ed to i8 where it isn't used
1440 // in an explicit cast, so make sure to handle that correctly.
1441 unsigned OpReg = getRegForValue(BI->getCondition());
1442 if (OpReg == 0) return false;
1443
1444 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
1445 .addReg(OpReg).addImm(1);
1446 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_1))
1447 .addMBB(TrueMBB);
1448 fastEmitBranch(FalseMBB, DbgLoc);
1449 uint32_t BranchWeight = 0;
1450 if (FuncInfo.BPI)
1451 BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
1452 TrueMBB->getBasicBlock());
1453 FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight);
1454 return true;
1455 }
1456
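The TEST-against-1 pattern used at the end of the branch lowering (and again in the select lowering below) exists because only the low bit of an i1 value held in an 8-bit register is defined; the upper bits may be garbage. A trivial model of that contract (hypothetical helper name):

#include <cstdint>

// Only bit 0 of an i1-in-i8 register is meaningful; "test $1, %reg / jne"
// inspects exactly that bit and ignores the rest.
static bool branchTaken(uint8_t CondByte) {
  return (CondByte & 1) != 0;
}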
1457 bool X86FastISel::X86SelectShift(const Instruction *I) {
1458 unsigned CReg = 0, OpReg = 0;
1459 const TargetRegisterClass *RC = nullptr;
1460 if (I->getType()->isIntegerTy(8)) {
1461 CReg = X86::CL;
1462 RC = &X86::GR8RegClass;
1463 switch (I->getOpcode()) {
1464 case Instruction::LShr: OpReg = X86::SHR8rCL; break;
1465 case Instruction::AShr: OpReg = X86::SAR8rCL; break;
1466 case Instruction::Shl: OpReg = X86::SHL8rCL; break;
1467 default: return false;
1468 }
1469 } else if (I->getType()->isIntegerTy(16)) {
1470 CReg = X86::CX;
1471 RC = &X86::GR16RegClass;
1472 switch (I->getOpcode()) {
1473 case Instruction::LShr: OpReg = X86::SHR16rCL; break;
1474 case Instruction::AShr: OpReg = X86::SAR16rCL; break;
1475 case Instruction::Shl: OpReg = X86::SHL16rCL; break;
1476 default: return false;
1477 }
1478 } else if (I->getType()->isIntegerTy(32)) {
1479 CReg = X86::ECX;
1480 RC = &X86::GR32RegClass;
1481 switch (I->getOpcode()) {
1482 case Instruction::LShr: OpReg = X86::SHR32rCL; break;
1483 case Instruction::AShr: OpReg = X86::SAR32rCL; break;
1484 case Instruction::Shl: OpReg = X86::SHL32rCL; break;
1485 default: return false;
1486 }
1487 } else if (I->getType()->isIntegerTy(64)) {
1488 CReg = X86::RCX;
1489 RC = &X86::GR64RegClass;
1490 switch (I->getOpcode()) {
1491 case Instruction::LShr: OpReg = X86::SHR64rCL; break;
1492 case Instruction::AShr: OpReg = X86::SAR64rCL; break;
1493 case Instruction::Shl: OpReg = X86::SHL64rCL; break;
1494 default: return false;
1495 }
1496 } else {
1497 return false;
1498 }
1499
1500 MVT VT;
1501 if (!isTypeLegal(I->getType(), VT))
1502 return false;
1503
1504 unsigned Op0Reg = getRegForValue(I->getOperand(0));
1505 if (Op0Reg == 0) return false;
1506
1507 unsigned Op1Reg = getRegForValue(I->getOperand(1));
1508 if (Op1Reg == 0) return false;
1509 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY),
1510 CReg).addReg(Op1Reg);
1511
1512 // The shift instruction uses X86::CL. If we defined a super-register
1513 // of X86::CL, emit a subreg KILL to precisely describe what we're doing here.
1514 if (CReg != X86::CL)
1515 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
1516 TII.get(TargetOpcode::KILL), X86::CL)
1517 .addReg(CReg, RegState::Kill);
1518
1519 unsigned ResultReg = createResultReg(RC);
1520 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpReg), ResultReg)
1521 .addReg(Op0Reg);
1522 updateValueMap(I, ResultReg);
1523 return true;
1524 }
1525
1526 bool X86FastISel::X86SelectDivRem(const Instruction *I) {
1527 const static unsigned NumTypes = 4; // i8, i16, i32, i64
1528 const static unsigned NumOps = 4; // SDiv, SRem, UDiv, URem
1529 const static bool S = true; // IsSigned
1530 const static bool U = false; // !IsSigned
1531 const static unsigned Copy = TargetOpcode::COPY;
1532 // For the X86 DIV/IDIV instruction, in most cases the dividend
1533 // (numerator) must be in a specific register pair highreg:lowreg,
1534 // producing the quotient in lowreg and the remainder in highreg.
1535 // For most data types, to set up the instruction, the dividend is
1536 // copied into lowreg, and lowreg is sign-extended or zero-extended
1537 // into highreg. The exception is i8, where the dividend is defined
1538 // as a single register rather than a register pair, and we
1539 // therefore directly sign-extend or zero-extend the dividend into
1540 // lowreg, instead of copying, and ignore the highreg.
1541 const static struct DivRemEntry {
1542 // The following portion depends only on the data type.
1543 const TargetRegisterClass *RC;
1544 unsigned LowInReg; // low part of the register pair
1545 unsigned HighInReg; // high part of the register pair
1546 // The following portion depends on both the data type and the operation.
1547 struct DivRemResult {
1548 unsigned OpDivRem; // The specific DIV/IDIV opcode to use.
1549 unsigned OpSignExtend; // Opcode for sign-extending lowreg into
1550 // highreg, or copying a zero into highreg.
1551 unsigned OpCopy; // Opcode for copying dividend into lowreg, or
1552 // zero/sign-extending into lowreg for i8.
1553 unsigned DivRemResultReg; // Register containing the desired result.
1554 bool IsOpSigned; // Whether to use signed or unsigned form.
1555 } ResultTable[NumOps];
1556 } OpTable[NumTypes] = {
1557 { &X86::GR8RegClass, X86::AX, 0, {
1558 { X86::IDIV8r, 0, X86::MOVSX16rr8, X86::AL, S }, // SDiv
1559 { X86::IDIV8r, 0, X86::MOVSX16rr8, X86::AH, S }, // SRem
1560 { X86::DIV8r, 0, X86::MOVZX16rr8, X86::AL, U }, // UDiv
1561 { X86::DIV8r, 0, X86::MOVZX16rr8, X86::AH, U }, // URem
1562 }
1563 }, // i8
1564 { &X86::GR16RegClass, X86::AX, X86::DX, {
1565 { X86::IDIV16r, X86::CWD, Copy, X86::AX, S }, // SDiv
1566 { X86::IDIV16r, X86::CWD, Copy, X86::DX, S }, // SRem
1567 { X86::DIV16r, X86::MOV32r0, Copy, X86::AX, U }, // UDiv
1568 { X86::DIV16r, X86::MOV32r0, Copy, X86::DX, U }, // URem
1569 }
1570 }, // i16
1571 { &X86::GR32RegClass, X86::EAX, X86::EDX, {
1572 { X86::IDIV32r, X86::CDQ, Copy, X86::EAX, S }, // SDiv
1573 { X86::IDIV32r, X86::CDQ, Copy, X86::EDX, S }, // SRem
1574 { X86::DIV32r, X86::MOV32r0, Copy, X86::EAX, U }, // UDiv
1575 { X86::DIV32r, X86::MOV32r0, Copy, X86::EDX, U }, // URem
1576 }
1577 }, // i32
1578 { &X86::GR64RegClass, X86::RAX, X86::RDX, {
1579 { X86::IDIV64r, X86::CQO, Copy, X86::RAX, S }, // SDiv
1580 { X86::IDIV64r, X86::CQO, Copy, X86::RDX, S }, // SRem
1581 { X86::DIV64r, X86::MOV32r0, Copy, X86::RAX, U }, // UDiv
1582 { X86::DIV64r, X86::MOV32r0, Copy, X86::RDX, U }, // URem
1583 }
1584 }, // i64
1585 };
1586
1587 MVT VT;
1588 if (!isTypeLegal(I->getType(), VT))
1589 return false;
1590
1591 unsigned TypeIndex, OpIndex;
1592 switch (VT.SimpleTy) {
1593 default: return false;
1594 case MVT::i8: TypeIndex = 0; break;
1595 case MVT::i16: TypeIndex = 1; break;
1596 case MVT::i32: TypeIndex = 2; break;
1597 case MVT::i64: TypeIndex = 3;
1598 if (!Subtarget->is64Bit())
1599 return false;
1600 break;
1601 }
1602
1603 switch (I->getOpcode()) {
1604 default: llvm_unreachable("Unexpected div/rem opcode");
1605 case Instruction::SDiv: OpIndex = 0; break;
1606 case Instruction::SRem: OpIndex = 1; break;
1607 case Instruction::UDiv: OpIndex = 2; break;
1608 case Instruction::URem: OpIndex = 3; break;
1609 }
1610
1611 const DivRemEntry &TypeEntry = OpTable[TypeIndex];
1612 const DivRemEntry::DivRemResult &OpEntry = TypeEntry.ResultTable[OpIndex];
1613 unsigned Op0Reg = getRegForValue(I->getOperand(0));
1614 if (Op0Reg == 0)
1615 return false;
1616 unsigned Op1Reg = getRegForValue(I->getOperand(1));
1617 if (Op1Reg == 0)
1618 return false;
1619
1620 // Move op0 into low-order input register.
1621 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
1622 TII.get(OpEntry.OpCopy), TypeEntry.LowInReg).addReg(Op0Reg);
1623 // Zero-extend or sign-extend into high-order input register.
1624 if (OpEntry.OpSignExtend) {
1625 if (OpEntry.IsOpSigned)
1626 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
1627 TII.get(OpEntry.OpSignExtend));
1628 else {
1629 unsigned Zero32 = createResultReg(&X86::GR32RegClass);
1630 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
1631 TII.get(X86::MOV32r0), Zero32);
1632
1633 // Copy the zero into the appropriate sub/super/identical physical
1634 // register. Unfortunately the operations needed are not uniform enough
1635 // to fit neatly into the table above.
1636 if (VT.SimpleTy == MVT::i16) {
1637 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
1638 TII.get(Copy), TypeEntry.HighInReg)
1639 .addReg(Zero32, 0, X86::sub_16bit);
1640 } else if (VT.SimpleTy == MVT::i32) {
1641 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
1642 TII.get(Copy), TypeEntry.HighInReg)
1643 .addReg(Zero32);
1644 } else if (VT.SimpleTy == MVT::i64) {
1645 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
1646 TII.get(TargetOpcode::SUBREG_TO_REG), TypeEntry.HighInReg)
1647 .addImm(0).addReg(Zero32).addImm(X86::sub_32bit);
1648 }
1649 }
1650 }
1651 // Generate the DIV/IDIV instruction.
1652 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
1653 TII.get(OpEntry.OpDivRem)).addReg(Op1Reg);
1654 // For i8 remainder, we can't reference AH directly, as we'll end
1655 // up with bogus copies like %R9B = COPY %AH. Reference AX
1656 // instead to prevent AH references in a REX instruction.
1657 //
1658 // The current assumption of the fast register allocator is that isel
1659 // won't generate explicit references to the GPR8_NOREX registers. If
1660 // the allocator and/or the backend get enhanced to be more robust in
1661 // that regard, this can be, and should be, removed.
1662 unsigned ResultReg = 0;
1663 if ((I->getOpcode() == Instruction::SRem ||
1664 I->getOpcode() == Instruction::URem) &&
1665 OpEntry.DivRemResultReg == X86::AH && Subtarget->is64Bit()) {
1666 unsigned SourceSuperReg = createResultReg(&X86::GR16RegClass);
1667 unsigned ResultSuperReg = createResultReg(&X86::GR16RegClass);
1668 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
1669 TII.get(Copy), SourceSuperReg).addReg(X86::AX);
1670
1671 // Shift AX right by 8 bits instead of using AH.
1672 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SHR16ri),
1673 ResultSuperReg).addReg(SourceSuperReg).addImm(8);
1674
1675 // Now reference the 8-bit subreg of the result.
1676 ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultSuperReg,
1677 /*Kill=*/true, X86::sub_8bit);
1678 }
1679 // Copy the result out of the physreg if we haven't already.
1680 if (!ResultReg) {
1681 ResultReg = createResultReg(TypeEntry.RC);
1682 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Copy), ResultReg)
1683 .addReg(OpEntry.DivRemResultReg);
1684 }
1685 updateValueMap(I, ResultReg);
1686
1687 return true;
1688 }
1689
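On x86 an 8-bit divide leaves the quotient in AL and the remainder in AH, and the code above avoids naming AH (which cannot be encoded together with a REX prefix) by shifting AX right by 8 instead. A standalone model of that register layout, under the simplifying assumption of an 8-bit dividend and a non-zero divisor (hypothetical helper name):

#include <cstdint>

// After an 8-bit divide, AX holds remainder:quotient (AH:AL). Extracting the
// remainder as (AX >> 8) gives the same value as reading AH, which is the
// SHR16ri trick used above.
static uint8_t remainderViaShift(uint8_t Dividend, uint8_t Divisor) {
  uint16_t AX = static_cast<uint16_t>((Dividend % Divisor) << 8) |
                static_cast<uint16_t>(Dividend / Divisor);
  return static_cast<uint8_t>(AX >> 8); // same value as AH
}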
1690 /// \brief Emit a conditional move instruction (if they are supported) to lower
1691 /// the select.
1692 bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
1693 // Check if the subtarget supports these instructions.
1694 if (!Subtarget->hasCMov())
1695 return false;
1696
1697 // FIXME: Add support for i8.
1698 if (RetVT < MVT::i16 || RetVT > MVT::i64)
1699 return false;
1700
1701 const Value *Cond = I->getOperand(0);
1702 const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
1703 bool NeedTest = true;
1704 X86::CondCode CC = X86::COND_NE;
1705
1706 // Optimize conditions coming from a compare if both instructions are in the
1707 // same basic block (values defined in other basic blocks may not have
1708 // initialized registers).
1709 const auto *CI = dyn_cast<CmpInst>(Cond);
1710 if (CI && (CI->getParent() == I->getParent())) {
1711 CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
1712
1713 // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
1714 static unsigned SETFOpcTable[2][3] = {
1715 { X86::SETNPr, X86::SETEr , X86::TEST8rr },
1716 { X86::SETPr, X86::SETNEr, X86::OR8rr }
1717 };
1718 unsigned *SETFOpc = nullptr;
1719 switch (Predicate) {
1720 default: break;
1721 case CmpInst::FCMP_OEQ:
1722 SETFOpc = &SETFOpcTable[0][0];
1723 Predicate = CmpInst::ICMP_NE;
1724 break;
1725 case CmpInst::FCMP_UNE:
1726 SETFOpc = &SETFOpcTable[1][0];
1727 Predicate = CmpInst::ICMP_NE;
1728 break;
1729 }
1730
1731 bool NeedSwap;
1732 std::tie(CC, NeedSwap) = getX86ConditionCode(Predicate);
1733 assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
1734
1735 const Value *CmpLHS = CI->getOperand(0);
1736 const Value *CmpRHS = CI->getOperand(1);
1737 if (NeedSwap)
1738 std::swap(CmpLHS, CmpRHS);
1739
1740 EVT CmpVT = TLI.getValueType(CmpLHS->getType());
1741 // Emit a compare of the LHS and RHS, setting the flags.
1742 if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc()))
1743 return false;
1744
1745 if (SETFOpc) {
1746 unsigned FlagReg1 = createResultReg(&X86::GR8RegClass);
1747 unsigned FlagReg2 = createResultReg(&X86::GR8RegClass);
1748 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]),
1749 FlagReg1);
1750 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]),
1751 FlagReg2);
1752 auto const &II = TII.get(SETFOpc[2]);
1753 if (II.getNumDefs()) {
1754 unsigned TmpReg = createResultReg(&X86::GR8RegClass);
1755 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, TmpReg)
1756 .addReg(FlagReg2).addReg(FlagReg1);
1757 } else {
1758 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
1759 .addReg(FlagReg2).addReg(FlagReg1);
1760 }
1761 }
1762 NeedTest = false;
1763 } else if (foldX86XALUIntrinsic(CC, I, Cond)) {
1764 // Fake request the condition, otherwise the intrinsic might be completely
1765 // optimized away.
1766 unsigned TmpReg = getRegForValue(Cond);
1767 if (TmpReg == 0)
1768 return false;
1769
1770 NeedTest = false;
1771 }
1772
1773 if (NeedTest) {
1774 // Selects operate on i1; however, CondReg is 8 bits wide and may contain
1775 // garbage. Only the least significant bit is guaranteed to be accurate,
1776 // so reading more than the lsb may yield a non-zero value even when the
1777 // lsb is zero. Therefore, truncate the condition register to i1 for
1778 // the select by performing a TEST against 1.
1779 unsigned CondReg = getRegForValue(Cond);
1780 if (CondReg == 0)
1781 return false;
1782 bool CondIsKill = hasTrivialKill(Cond);
1783
1784 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
1785 .addReg(CondReg, getKillRegState(CondIsKill)).addImm(1);
1786 }
1787
1788 const Value *LHS = I->getOperand(1);
1789 const Value *RHS = I->getOperand(2);
1790
1791 unsigned RHSReg = getRegForValue(RHS);
1792 bool RHSIsKill = hasTrivialKill(RHS);
1793
1794 unsigned LHSReg = getRegForValue(LHS);
1795 bool LHSIsKill = hasTrivialKill(LHS);
1796
1797 if (!LHSReg || !RHSReg)
1798 return false;
1799
1800 unsigned Opc = X86::getCMovFromCond(CC, RC->getSize());
1801 unsigned ResultReg = fastEmitInst_rr(Opc, RC, RHSReg, RHSIsKill,
1802 LHSReg, LHSIsKill);
1803 updateValueMap(I, ResultReg);
1804 return true;
1805 }
1806
1807 /// \brief Emit SSE instructions to lower the select.
1808 ///
1809 /// Try to use SSE1/SSE2 instructions to simulate a select without branches.
1810 /// This lowers fp selects into a CMP/AND/ANDN/OR sequence when the necessary
1811 /// SSE instructions are available.
1812 bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
1813 // Optimize conditions coming from a compare if both instructions are in the
1814 // same basic block (values defined in other basic blocks may not have
1815 // initialized registers).
1816 const auto *CI = dyn_cast<CmpInst>(I->getOperand(0));
1817 if (!CI || (CI->getParent() != I->getParent()))
1818 return false;
1819
1820 if (I->getType() != CI->getOperand(0)->getType() ||
1821 !((Subtarget->hasSSE1() && RetVT == MVT::f32) ||
1822 (Subtarget->hasSSE2() && RetVT == MVT::f64)))
1823 return false;
1824
1825 const Value *CmpLHS = CI->getOperand(0);
1826 const Value *CmpRHS = CI->getOperand(1);
1827 CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
1828
1829 // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0.
1830 // We don't have to materialize a zero constant for this case and can just use
1831 // %x again on the RHS.
1832 if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
1833 const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS);
1834 if (CmpRHSC && CmpRHSC->isNullValue())
1835 CmpRHS = CmpLHS;
1836 }
1837
1838 unsigned CC;
1839 bool NeedSwap;
1840 std::tie(CC, NeedSwap) = getX86SSEConditionCode(Predicate);
1841 if (CC > 7)
1842 return false;
1843
1844 if (NeedSwap)
1845 std::swap(CmpLHS, CmpRHS);
1846
1847 static unsigned OpcTable[2][2][4] = {
1848 { { X86::CMPSSrr, X86::FsANDPSrr, X86::FsANDNPSrr, X86::FsORPSrr },
1849 { X86::VCMPSSrr, X86::VFsANDPSrr, X86::VFsANDNPSrr, X86::VFsORPSrr } },
1850 { { X86::CMPSDrr, X86::FsANDPDrr, X86::FsANDNPDrr, X86::FsORPDrr },
1851 { X86::VCMPSDrr, X86::VFsANDPDrr, X86::VFsANDNPDrr, X86::VFsORPDrr } }
1852 };
1853
1854 bool HasAVX = Subtarget->hasAVX();
1855 unsigned *Opc = nullptr;
1856 switch (RetVT.SimpleTy) {
1857 default: return false;
1858 case MVT::f32: Opc = &OpcTable[0][HasAVX][0]; break;
1859 case MVT::f64: Opc = &OpcTable[1][HasAVX][0]; break;
1860 }
1861
1862 const Value *LHS = I->getOperand(1);
1863 const Value *RHS = I->getOperand(2);
1864
1865 unsigned LHSReg = getRegForValue(LHS);
1866 bool LHSIsKill = hasTrivialKill(LHS);
1867
1868 unsigned RHSReg = getRegForValue(RHS);
1869 bool RHSIsKill = hasTrivialKill(RHS);
1870
1871 unsigned CmpLHSReg = getRegForValue(CmpLHS);
1872 bool CmpLHSIsKill = hasTrivialKill(CmpLHS);
1873
1874 unsigned CmpRHSReg = getRegForValue(CmpRHS);
1875 bool CmpRHSIsKill = hasTrivialKill(CmpRHS);
1876
1877 if (!LHSReg || !RHSReg || !CmpLHSReg || !CmpRHSReg)
1878 return false;
1879
1880 const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
1881 unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill,
1882 CmpRHSReg, CmpRHSIsKill, CC);
1883 unsigned AndReg = fastEmitInst_rr(Opc[1], RC, CmpReg, /*IsKill=*/false,
1884 LHSReg, LHSIsKill);
1885 unsigned AndNReg = fastEmitInst_rr(Opc[2], RC, CmpReg, /*IsKill=*/true,
1886 RHSReg, RHSIsKill);
1887 unsigned ResultReg = fastEmitInst_rr(Opc[3], RC, AndNReg, /*IsKill=*/true,
1888 AndReg, /*IsKill=*/true);
1889 updateValueMap(I, ResultReg);
1890 return true;
1891 }
1892
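The CMP/AND/ANDN/OR sequence implements a branchless select on the all-ones/all-zeros mask produced by CMPSS/CMPSD: result = (mask & lhs) | (~mask & rhs). A scalar sketch of the same bitwise trick, using memcpy for the bit casts (illustrative only):

#include <cstdint>
#include <cstring>

// Branchless double select via an all-ones/all-zeros mask, mirroring the
// CMPSD / ANDPD / ANDNPD / ORPD sequence emitted above.
static double bitwiseSelect(bool CondTrue, double TrueVal, double FalseVal) {
  uint64_t Mask = CondTrue ? ~0ULL : 0ULL;   // stands in for the CMPSD result
  uint64_t T, F;
  std::memcpy(&T, &TrueVal, sizeof(T));
  std::memcpy(&F, &FalseVal, sizeof(F));
  uint64_t Bits = (Mask & T) | (~Mask & F);  // ANDPD / ANDNPD / ORPD
  double Result;
  std::memcpy(&Result, &Bits, sizeof(Result));
  return Result;
}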
1893 bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
1894 // These are pseudo CMOV instructions and will be later expanded into control-
1895 // flow.
1896 unsigned Opc;
1897 switch (RetVT.SimpleTy) {
1898 default: return false;
1899 case MVT::i8: Opc = X86::CMOV_GR8; break;
1900 case MVT::i16: Opc = X86::CMOV_GR16; break;
1901 case MVT::i32: Opc = X86::CMOV_GR32; break;
1902 case MVT::f32: Opc = X86::CMOV_FR32; break;
1903 case MVT::f64: Opc = X86::CMOV_FR64; break;
1904 }
1905
1906 const Value *Cond = I->getOperand(0);
1907 X86::CondCode CC = X86::COND_NE;
1908
1909 // Optimize conditions coming from a compare if both instructions are in the
1910 // same basic block (values defined in other basic blocks may not have
1911 // initialized registers).
1912 const auto *CI = dyn_cast<CmpInst>(Cond);
1913 if (CI && (CI->getParent() == I->getParent())) {
1914 bool NeedSwap;
1915 std::tie(CC, NeedSwap) = getX86ConditionCode(CI->getPredicate());
1916 if (CC > X86::LAST_VALID_COND)
1917 return false;
1918
1919 const Value *CmpLHS = CI->getOperand(0);
1920 const Value *CmpRHS = CI->getOperand(1);
1921
1922 if (NeedSwap)
1923 std::swap(CmpLHS, CmpRHS);
1924
1925 EVT CmpVT = TLI.getValueType(CmpLHS->getType());
1926 if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc()))
1927 return false;
1928 } else {
1929 unsigned CondReg = getRegForValue(Cond);
1930 if (CondReg == 0)
1931 return false;
1932 bool CondIsKill = hasTrivialKill(Cond);
1933 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
1934 .addReg(CondReg, getKillRegState(CondIsKill)).addImm(1);
1935 }
1936
1937 const Value *LHS = I->getOperand(1);
1938 const Value *RHS = I->getOperand(2);
1939
1940 unsigned LHSReg = getRegForValue(LHS);
1941 bool LHSIsKill = hasTrivialKill(LHS);
1942
1943 unsigned RHSReg = getRegForValue(RHS);
1944 bool RHSIsKill = hasTrivialKill(RHS);
1945
1946 if (!LHSReg || !RHSReg)
1947 return false;
1948
1949 const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
1950
1951 unsigned ResultReg =
1952 fastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill, LHSReg, LHSIsKill, CC);
1953 updateValueMap(I, ResultReg);
1954 return true;
1955 }
1956
1957 bool X86FastISel::X86SelectSelect(const Instruction *I) {
1958 MVT RetVT;
1959 if (!isTypeLegal(I->getType(), RetVT))
1960 return false;
1961
1962 // Check if we can fold the select.
1963 if (const auto *CI = dyn_cast<CmpInst>(I->getOperand(0))) {
1964 CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
1965 const Value *Opnd = nullptr;
1966 switch (Predicate) {
1967 default: break;
1968 case CmpInst::FCMP_FALSE: Opnd = I->getOperand(2); break;
1969 case CmpInst::FCMP_TRUE: Opnd = I->getOperand(1); break;
1970 }
1971 // No need for a select anymore - this is an unconditional move.
1972 if (Opnd) {
1973 unsigned OpReg = getRegForValue(Opnd);
1974 if (OpReg == 0)
1975 return false;
1976 bool OpIsKill = hasTrivialKill(Opnd);
1977 const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
1978 unsigned ResultReg = createResultReg(RC);
1979 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
1980 TII.get(TargetOpcode::COPY), ResultReg)
1981 .addReg(OpReg, getKillRegState(OpIsKill));
1982 updateValueMap(I, ResultReg);
1983 return true;
1984 }
1985 }
1986
1987 // First try to use real conditional move instructions.
1988 if (X86FastEmitCMoveSelect(RetVT, I))
1989 return true;
1990
1991 // Try to use a sequence of SSE instructions to simulate a conditional move.
1992 if (X86FastEmitSSESelect(RetVT, I))
1993 return true;
1994
1995 // Fall-back to pseudo conditional move instructions, which will be later
1996 // converted to control-flow.
1997 if (X86FastEmitPseudoSelect(RetVT, I))
1998 return true;
1999
2000 return false;
2001 }
2002
2003 bool X86FastISel::X86SelectFPExt(const Instruction *I) {
2004 // fpext from float to double.
2005 if (X86ScalarSSEf64 &&
2006 I->getType()->isDoubleTy()) {
2007 const Value *V = I->getOperand(0);
2008 if (V->getType()->isFloatTy()) {
2009 unsigned OpReg = getRegForValue(V);
2010 if (OpReg == 0) return false;
2011 unsigned ResultReg = createResultReg(&X86::FR64RegClass);
2012 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
2013 TII.get(X86::CVTSS2SDrr), ResultReg)
2014 .addReg(OpReg);
2015 updateValueMap(I, ResultReg);
2016 return true;
2017 }
2018 }
2019
2020 return false;
2021 }
2022
2023 bool X86FastISel::X86SelectFPTrunc(const Instruction *I) {
2024 if (X86ScalarSSEf64) {
2025 if (I->getType()->isFloatTy()) {
2026 const Value *V = I->getOperand(0);
2027 if (V->getType()->isDoubleTy()) {
2028 unsigned OpReg = getRegForValue(V);
2029 if (OpReg == 0) return false;
2030 unsigned ResultReg = createResultReg(&X86::FR32RegClass);
2031 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
2032 TII.get(X86::CVTSD2SSrr), ResultReg)
2033 .addReg(OpReg);
2034 updateValueMap(I, ResultReg);
2035 return true;
2036 }
2037 }
2038 }
2039
2040 return false;
2041 }
2042
2043 bool X86FastISel::X86SelectTrunc(const Instruction *I) {
2044 EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
2045 EVT DstVT = TLI.getValueType(I->getType());
2046
2047 // This code only handles truncation to byte.
2048 if (DstVT != MVT::i8 && DstVT != MVT::i1)
2049 return false;
2050 if (!TLI.isTypeLegal(SrcVT))
2051 return false;
2052
2053 unsigned InputReg = getRegForValue(I->getOperand(0));
2054 if (!InputReg)
2055 // Unhandled operand. Halt "fast" selection and bail.
2056 return false;
2057
2058 if (SrcVT == MVT::i8) {
2059 // Truncate from i8 to i1; no code needed.
2060 updateValueMap(I, InputReg);
2061 return true;
2062 }
2063
2064 if (!Subtarget->is64Bit()) {
2065 // If we're on x86-32; we can't extract an i8 from a general register.
2066 // First issue a copy to GR16_ABCD or GR32_ABCD.
2067 const TargetRegisterClass *CopyRC =
2068 (SrcVT == MVT::i16) ? &X86::GR16_ABCDRegClass : &X86::GR32_ABCDRegClass;
2069 unsigned CopyReg = createResultReg(CopyRC);
2070 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
2071 TII.get(TargetOpcode::COPY), CopyReg).addReg(InputReg);
2072 InputReg = CopyReg;
2073 }
2074
2075 // Issue an extract_subreg.
2076 unsigned ResultReg = fastEmitInst_extractsubreg(MVT::i8,
2077 InputReg, /*Kill=*/true,
2078 X86::sub_8bit);
2079 if (!ResultReg)
2080 return false;
2081
2082 updateValueMap(I, ResultReg);
2083 return true;
2084 }
2085
2086 bool X86FastISel::IsMemcpySmall(uint64_t Len) {
2087 return Len <= (Subtarget->is64Bit() ? 32 : 16);
2088 }
2089
2090 bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM,
2091 X86AddressMode SrcAM, uint64_t Len) {
2092
2093 // Make sure we don't bloat code by inlining very large memcpy's.
2094 if (!IsMemcpySmall(Len))
2095 return false;
2096
2097 bool i64Legal = Subtarget->is64Bit();
2098
2099 // We don't care about alignment here since we just emit integer accesses.
2100 while (Len) {
2101 MVT VT;
2102 if (Len >= 8 && i64Legal)
2103 VT = MVT::i64;
2104 else if (Len >= 4)
2105 VT = MVT::i32;
2106 else if (Len >= 2)
2107 VT = MVT::i16;
2108 else
2109 VT = MVT::i8;
2110
2111 unsigned Reg;
2112 bool RV = X86FastEmitLoad(VT, SrcAM, nullptr, Reg);
2113 RV &= X86FastEmitStore(VT, Reg, /*Kill=*/true, DestAM);
2114 assert(RV && "Failed to emit load or store??");
2115
2116 unsigned Size = VT.getSizeInBits()/8;
2117 Len -= Size;
2118 DestAM.Disp += Size;
2119 SrcAM.Disp += Size;
2120 }
2121
2122 return true;
2123 }
2124
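TryEmitSmallMemcpy expands the copy widest-chunk-first (8, 4, 2, then 1 bytes, with 8-byte chunks only on x86-64). A plain C++ sketch of the same chunking loop, independent of the FastISel machinery (hypothetical helper name):

#include <cstdint>
#include <cstring>

// Copies Len bytes using the same widest-first chunking as the inline
// memcpy expansion above; each memcpy call stands in for one load/store pair.
static void chunkedCopy(uint8_t *Dst, const uint8_t *Src, uint64_t Len,
                        bool Is64Bit) {
  while (Len) {
    unsigned Size;
    if (Len >= 8 && Is64Bit)
      Size = 8;
    else if (Len >= 4)
      Size = 4;
    else if (Len >= 2)
      Size = 2;
    else
      Size = 1;
    std::memcpy(Dst, Src, Size);
    Dst += Size;
    Src += Size;
    Len -= Size;
  }
}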
2125 bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
2126 // FIXME: Handle more intrinsics.
2127 switch (II->getIntrinsicID()) {
2128 default: return false;
2129 case Intrinsic::frameaddress: {
2130 Type *RetTy = II->getCalledFunction()->getReturnType();
2131
2132 MVT VT;
2133 if (!isTypeLegal(RetTy, VT))
2134 return false;
2135
2136 unsigned Opc;
2137 const TargetRegisterClass *RC = nullptr;
2138
2139 switch (VT.SimpleTy) {
2140 default: llvm_unreachable("Invalid result type for frameaddress.");
2141 case MVT::i32: Opc = X86::MOV32rm; RC = &X86::GR32RegClass; break;
2142 case MVT::i64: Opc = X86::MOV64rm; RC = &X86::GR64RegClass; break;
2143 }
2144
2145 // This needs to be set before we call getPtrSizedFrameRegister, otherwise
2146 // we get the wrong frame register.
2147 MachineFrameInfo *MFI = FuncInfo.MF->getFrameInfo();
2148 MFI->setFrameAddressIsTaken(true);
2149
2150 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
2151 TM.getSubtargetImpl()->getRegisterInfo());
2152 unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(*(FuncInfo.MF));
2153 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
2154 (FrameReg == X86::EBP && VT == MVT::i32)) &&
2155 "Invalid Frame Register!");
2156
2157 // Always make a copy of the frame register to a vreg first, so that we
2158 // never directly reference the frame register (the TwoAddressInstruction-
2159 // Pass doesn't like that).
2160 unsigned SrcReg = createResultReg(RC);
2161 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
2162 TII.get(TargetOpcode::COPY), SrcReg).addReg(FrameReg);
2163
2164 // Now recursively load from the frame address.
2165 // movq (%rbp), %rax
2166 // movq (%rax), %rax
2167 // movq (%rax), %rax
2168 // ...
2169 unsigned DestReg;
2170 unsigned Depth = cast<ConstantInt>(II->getOperand(0))->getZExtValue();
2171 while (Depth--) {
2172 DestReg = createResultReg(RC);
2173 addDirectMem(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
2174 TII.get(Opc), DestReg), SrcReg);
2175 SrcReg = DestReg;
2176 }
2177
2178 updateValueMap(II, SrcReg);
2179 return true;
2180 }
2181 case Intrinsic::memcpy: {
2182 const MemCpyInst *MCI = cast<MemCpyInst>(II);
2183 // Don't handle volatile or variable length memcpys.
2184 if (MCI->isVolatile())
2185 return false;
2186
2187 if (isa<ConstantInt>(MCI->getLength())) {
2188 // Small memcpy's are common enough that we want to do them
2189 // without a call if possible.
2190 uint64_t Len = cast<ConstantInt>(MCI->getLength())->getZExtValue();
2191 if (IsMemcpySmall(Len)) {
2192 X86AddressMode DestAM, SrcAM;
2193 if (!X86SelectAddress(MCI->getRawDest(), DestAM) ||
2194 !X86SelectAddress(MCI->getRawSource(), SrcAM))
2195 return false;
2196 TryEmitSmallMemcpy(DestAM, SrcAM, Len);
2197 return true;
2198 }
2199 }
2200
2201 unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
2202 if (!MCI->getLength()->getType()->isIntegerTy(SizeWidth))
2203 return false;
2204
2205 if (MCI->getSourceAddressSpace() > 255 || MCI->getDestAddressSpace() > 255)
2206 return false;
2207
2208 return lowerCallTo(II, "memcpy", II->getNumArgOperands() - 2);
2209 }
2210 case Intrinsic::memset: {
2211 const MemSetInst *MSI = cast<MemSetInst>(II);
2212
2213 if (MSI->isVolatile())
2214 return false;
2215
2216 unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
2217 if (!MSI->getLength()->getType()->isIntegerTy(SizeWidth))
2218 return false;
2219
2220 if (MSI->getDestAddressSpace() > 255)
2221 return false;
2222
2223 return lowerCallTo(II, "memset", II->getNumArgOperands() - 2);
2224 }
2225 case Intrinsic::stackprotector: {
2226 // Emit code to store the stack guard onto the stack.
2227 EVT PtrTy = TLI.getPointerTy();
2228
2229 const Value *Op1 = II->getArgOperand(0); // The guard's value.
2230 const AllocaInst *Slot = cast<AllocaInst>(II->getArgOperand(1));
2231
2232 MFI.setStackProtectorIndex(FuncInfo.StaticAllocaMap[Slot]);
2233
2234 // Grab the frame index.
2235 X86AddressMode AM;
2236 if (!X86SelectAddress(Slot, AM)) return false;
2237 if (!X86FastEmitStore(PtrTy, Op1, AM)) return false;
2238 return true;
2239 }
2240 case Intrinsic::dbg_declare: {
2241 const DbgDeclareInst *DI = cast<DbgDeclareInst>(II);
2242 X86AddressMode AM;
2243 assert(DI->getAddress() && "Null address should be checked earlier!");
2244 if (!X86SelectAddress(DI->getAddress(), AM))
2245 return false;
2246 const MCInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE);
2247 // FIXME may need to add RegState::Debug to any registers produced,
2248 // although ESP/EBP should be the only ones at the moment.
2249 addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II), AM)
2250 .addImm(0)
2251 .addMetadata(DI->getVariable())
2252 .addMetadata(DI->getExpression());
2253 return true;
2254 }
2255 case Intrinsic::trap: {
2256 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TRAP));
2257 return true;
2258 }
2259 case Intrinsic::sqrt: {
2260 if (!Subtarget->hasSSE1())
2261 return false;
2262
2263 Type *RetTy = II->getCalledFunction()->getReturnType();
2264
2265 MVT VT;
2266 if (!isTypeLegal(RetTy, VT))
2267 return false;
2268
2269 // Unfortunately we can't use fastEmit_r, because the AVX version of FSQRT
2270 // is not generated by FastISel yet.
2271 // FIXME: Update this code once tablegen can handle it.
2272 static const unsigned SqrtOpc[2][2] = {
2273 {X86::SQRTSSr, X86::VSQRTSSr},
2274 {X86::SQRTSDr, X86::VSQRTSDr}
2275 };
2276 bool HasAVX = Subtarget->hasAVX();
2277 unsigned Opc;
2278 const TargetRegisterClass *RC;
2279 switch (VT.SimpleTy) {
2280 default: return false;
2281 case MVT::f32: Opc = SqrtOpc[0][HasAVX]; RC = &X86::FR32RegClass; break;
2282 case MVT::f64: Opc = SqrtOpc[1][HasAVX]; RC = &X86::FR64RegClass; break;
2283 }
2284
2285 const Value *SrcVal = II->getArgOperand(0);
2286 unsigned SrcReg = getRegForValue(SrcVal);
2287
2288 if (SrcReg == 0)
2289 return false;
2290
2291 unsigned ImplicitDefReg = 0;
2292 if (HasAVX) {
2293 ImplicitDefReg = createResultReg(RC);
2294 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
2295 TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
2296 }
2297
2298 unsigned ResultReg = createResultReg(RC);
2299 MachineInstrBuilder MIB;
2300 MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
2301 ResultReg);
2302
2303 if (ImplicitDefReg)
2304 MIB.addReg(ImplicitDefReg);
2305
2306 MIB.addReg(SrcReg);
2307
2308 updateValueMap(II, ResultReg);
2309 return true;
2310 }
2311 case Intrinsic::sadd_with_overflow:
2312 case Intrinsic::uadd_with_overflow:
2313 case Intrinsic::ssub_with_overflow:
2314 case Intrinsic::usub_with_overflow:
2315 case Intrinsic::smul_with_overflow:
2316 case Intrinsic::umul_with_overflow: {
2317 // This implements the basic lowering of the xalu with overflow intrinsics
2318 // into add/sub/mul followed by either seto or setb.
2319 const Function *Callee = II->getCalledFunction();
2320 auto *Ty = cast<StructType>(Callee->getReturnType());
2321 Type *RetTy = Ty->getTypeAtIndex(0U);
2322 Type *CondTy = Ty->getTypeAtIndex(1);
2323
2324 MVT VT;
2325 if (!isTypeLegal(RetTy, VT))
2326 return false;
2327
2328 if (VT < MVT::i8 || VT > MVT::i64)
2329 return false;
2330
2331 const Value *LHS = II->getArgOperand(0);
2332 const Value *RHS = II->getArgOperand(1);
2333
2334 // Canonicalize immediate to the RHS.
2335 if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) &&
2336 isCommutativeIntrinsic(II))
2337 std::swap(LHS, RHS);
2338
2339 bool UseIncDec = false;
2340 if (isa<ConstantInt>(RHS) && cast<ConstantInt>(RHS)->isOne())
2341 UseIncDec = true;
2342
2343 unsigned BaseOpc, CondOpc;
2344 switch (II->getIntrinsicID()) {
2345 default: llvm_unreachable("Unexpected intrinsic!");
2346 case Intrinsic::sadd_with_overflow:
2347 BaseOpc = UseIncDec ? unsigned(X86ISD::INC) : unsigned(ISD::ADD);
2348 CondOpc = X86::SETOr;
2349 break;
2350 case Intrinsic::uadd_with_overflow:
2351 BaseOpc = ISD::ADD; CondOpc = X86::SETBr; break;
2352 case Intrinsic::ssub_with_overflow:
2353 BaseOpc = UseIncDec ? unsigned(X86ISD::DEC) : unsigned(ISD::SUB);
2354 CondOpc = X86::SETOr;
2355 break;
2356 case Intrinsic::usub_with_overflow:
2357 BaseOpc = ISD::SUB; CondOpc = X86::SETBr; break;
2358 case Intrinsic::smul_with_overflow:
2359 BaseOpc = X86ISD::SMUL; CondOpc = X86::SETOr; break;
2360 case Intrinsic::umul_with_overflow:
2361 BaseOpc = X86ISD::UMUL; CondOpc = X86::SETOr; break;
2362 }
2363
2364 unsigned LHSReg = getRegForValue(LHS);
2365 if (LHSReg == 0)
2366 return false;
2367 bool LHSIsKill = hasTrivialKill(LHS);
2368
2369 unsigned ResultReg = 0;
2370 // Check if we have an immediate version.
2371 if (const auto *CI = dyn_cast<ConstantInt>(RHS)) {
2372 static const unsigned Opc[2][4] = {
2373 { X86::INC8r, X86::INC16r, X86::INC32r, X86::INC64r },
2374 { X86::DEC8r, X86::DEC16r, X86::DEC32r, X86::DEC64r }
2375 };
2376
2377 if (BaseOpc == X86ISD::INC || BaseOpc == X86ISD::DEC) {
2378 ResultReg = createResultReg(TLI.getRegClassFor(VT));
2379 bool IsDec = BaseOpc == X86ISD::DEC;
2380 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
2381 TII.get(Opc[IsDec][VT.SimpleTy-MVT::i8]), ResultReg)
2382 .addReg(LHSReg, getKillRegState(LHSIsKill));
2383 } else
2384 ResultReg = fastEmit_ri(VT, VT, BaseOpc, LHSReg, LHSIsKill,
2385 CI->getZExtValue());
2386 }
2387
2388 unsigned RHSReg;
2389 bool RHSIsKill;
2390 if (!ResultReg) {
2391 RHSReg = getRegForValue(RHS);
2392 if (RHSReg == 0)
2393 return false;
2394 RHSIsKill = hasTrivialKill(RHS);
2395 ResultReg = fastEmit_rr(VT, VT, BaseOpc, LHSReg, LHSIsKill, RHSReg,
2396 RHSIsKill);
2397 }
2398
2399 // FastISel doesn't have a pattern for all X86::MUL*r and X86::IMUL*r. Emit
2400 // it manually.
2401 if (BaseOpc == X86ISD::UMUL && !ResultReg) {
2402 static const unsigned MULOpc[] =
2403 { X86::MUL8r, X86::MUL16r, X86::MUL32r, X86::MUL64r };
2404 static const unsigned Reg[] = { X86::AL, X86::AX, X86::EAX, X86::RAX };
2405 // First copy the first operand into RAX, which is an implicit input to
2406 // the X86::MUL*r instruction.
2407 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
2408 TII.get(TargetOpcode::COPY), Reg[VT.SimpleTy-MVT::i8])
2409 .addReg(LHSReg, getKillRegState(LHSIsKill));
2410 ResultReg = fastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8],
2411 TLI.getRegClassFor(VT), RHSReg, RHSIsKill);
2412 } else if (BaseOpc == X86ISD::SMUL && !ResultReg) {
2413 static const unsigned MULOpc[] =
2414 { X86::IMUL8r, X86::IMUL16rr, X86::IMUL32rr, X86::IMUL64rr };
2415 if (VT == MVT::i8) {
2416 // Copy the first operand into AL, which is an implicit input to the
2417 // X86::IMUL8r instruction.
2418 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
2419 TII.get(TargetOpcode::COPY), X86::AL)
2420 .addReg(LHSReg, getKillRegState(LHSIsKill));
2421 ResultReg = fastEmitInst_r(MULOpc[0], TLI.getRegClassFor(VT), RHSReg,
2422 RHSIsKill);
2423 } else
2424 ResultReg = fastEmitInst_rr(MULOpc[VT.SimpleTy-MVT::i8],
2425 TLI.getRegClassFor(VT), LHSReg, LHSIsKill,
2426 RHSReg, RHSIsKill);
2427 }
2428
2429 if (!ResultReg)
2430 return false;
2431
2432 unsigned ResultReg2 = FuncInfo.CreateRegs(CondTy);
2433 assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers.");
2434 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CondOpc),
2435 ResultReg2);
2436
2437 updateValueMap(II, ResultReg, 2);
2438 return true;
2439 }
2440 case Intrinsic::x86_sse_cvttss2si:
2441 case Intrinsic::x86_sse_cvttss2si64:
2442 case Intrinsic::x86_sse2_cvttsd2si:
2443 case Intrinsic::x86_sse2_cvttsd2si64: {
2444 bool IsInputDouble;
2445 switch (II->getIntrinsicID()) {
2446 default: llvm_unreachable("Unexpected intrinsic.");
2447 case Intrinsic::x86_sse_cvttss2si:
2448 case Intrinsic::x86_sse_cvttss2si64:
2449 if (!Subtarget->hasSSE1())
2450 return false;
2451 IsInputDouble = false;
2452 break;
2453 case Intrinsic::x86_sse2_cvttsd2si:
2454 case Intrinsic::x86_sse2_cvttsd2si64:
2455 if (!Subtarget->hasSSE2())
2456 return false;
2457 IsInputDouble = true;
2458 break;
2459 }
2460
2461 Type *RetTy = II->getCalledFunction()->getReturnType();
2462 MVT VT;
2463 if (!isTypeLegal(RetTy, VT))
2464 return false;
2465
2466 static const unsigned CvtOpc[2][2][2] = {
2467 { { X86::CVTTSS2SIrr, X86::VCVTTSS2SIrr },
2468 { X86::CVTTSS2SI64rr, X86::VCVTTSS2SI64rr } },
2469 { { X86::CVTTSD2SIrr, X86::VCVTTSD2SIrr },
2470 { X86::CVTTSD2SI64rr, X86::VCVTTSD2SI64rr } }
2471 };
2472 bool HasAVX = Subtarget->hasAVX();
2473 unsigned Opc;
2474 switch (VT.SimpleTy) {
2475 default: llvm_unreachable("Unexpected result type.");
2476 case MVT::i32: Opc = CvtOpc[IsInputDouble][0][HasAVX]; break;
2477 case MVT::i64: Opc = CvtOpc[IsInputDouble][1][HasAVX]; break;
2478 }
2479
2480 // Check if we can fold insertelement instructions into the convert.
2481 const Value *Op = II->getArgOperand(0);
2482 while (auto *IE = dyn_cast<InsertElementInst>(Op)) {
2483 const Value *Index = IE->getOperand(2);
2484 if (!isa<ConstantInt>(Index))
2485 break;
2486 unsigned Idx = cast<ConstantInt>(Index)->getZExtValue();
2487
2488 if (Idx == 0) {
2489 Op = IE->getOperand(1);
2490 break;
2491 }
2492 Op = IE->getOperand(0);
2493 }
2494
2495 unsigned Reg = getRegForValue(Op);
2496 if (Reg == 0)
2497 return false;
2498
2499 unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
2500 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
2501 .addReg(Reg);
2502
2503 updateValueMap(II, ResultReg);
2504 return true;
2505 }
2506 }
2507 }
2508
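The with-overflow intrinsics above lower to a plain ADD/SUB/MUL followed by SETO (signed overflow) or SETB (unsigned carry). A minimal sketch of the unsigned-add case in portable C++ (illustrative only; the patch itself emits the corresponding machine instructions):

#include <cstdint>

// Mirrors uadd.with.overflow: the wrapped sum plus a carry flag, which the
// lowering above materializes as ADD followed by SETB.
static bool uaddWithOverflow(uint32_t A, uint32_t B, uint32_t &Sum) {
  Sum = A + B;     // wraps on overflow, like the ADD instruction
  return Sum < A;  // equivalent to reading CF via SETB
}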
2509 bool X86FastISel::fastLowerArguments() {
2510 if (!FuncInfo.CanLowerReturn)
2511 return false;
2512
2513 const Function *F = FuncInfo.Fn;
2514 if (F->isVarArg())
2515 return false;
2516
2517 CallingConv::ID CC = F->getCallingConv();
2518 if (CC != CallingConv::C)
2519 return false;
2520
2521 if (Subtarget->isCallingConvWin64(CC))
2522 return false;
2523
2524 if (!Subtarget->is64Bit())
2525 return false;
2526
2527 // Only handle simple cases, i.e. up to 6 i32/i64 scalar arguments.
2528 unsigned GPRCnt = 0;
2529 unsigned FPRCnt = 0;
2530 unsigned Idx = 0;
2531 for (auto const &Arg : F->args()) {
2532 // The first argument is at index 1.
2533 ++Idx;
2534 if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) ||
2535 F->getAttributes().hasAttribute(Idx, Attribute::InReg) ||
2536 F->getAttributes().hasAttribute(Idx, Attribute::StructRet) ||
2537 F->getAttributes().hasAttribute(Idx, Attribute::Nest))
2538 return false;
2539
2540 Type *ArgTy = Arg.getType();
2541 if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy())
2542 return false;
2543
2544 EVT ArgVT = TLI.getValueType(ArgTy);
2545 if (!ArgVT.isSimple()) return false;
2546 switch (ArgVT.getSimpleVT().SimpleTy) {
2547 default: return false;
2548 case MVT::i32:
2549 case MVT::i64:
2550 ++GPRCnt;
2551 break;
2552 case MVT::f32:
2553 case MVT::f64:
2554 if (!Subtarget->hasSSE1())
2555 return false;
2556 ++FPRCnt;
2557 break;
2558 }
2559
2560 if (GPRCnt > 6)
2561 return false;
2562
2563 if (FPRCnt > 8)
2564 return false;
2565 }
2566
2567 static const MCPhysReg GPR32ArgRegs[] = {
2568 X86::EDI, X86::ESI, X86::EDX, X86::ECX, X86::R8D, X86::R9D
2569 };
2570 static const MCPhysReg GPR64ArgRegs[] = {
2571 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8 , X86::R9
2572 };
2573 static const MCPhysReg XMMArgRegs[] = {
2574 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2575 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2576 };
2577
2578 unsigned GPRIdx = 0;
2579 unsigned FPRIdx = 0;
2580 for (auto const &Arg : F->args()) {
2581 MVT VT = TLI.getSimpleValueType(Arg.getType());
2582 const TargetRegisterClass *RC = TLI.getRegClassFor(VT);
2583 unsigned SrcReg;
2584 switch (VT.SimpleTy) {
2585 default: llvm_unreachable("Unexpected value type.");
2586 case MVT::i32: SrcReg = GPR32ArgRegs[GPRIdx++]; break;
2587 case MVT::i64: SrcReg = GPR64ArgRegs[GPRIdx++]; break;
2588 case MVT::f32: // fall-through
2589 case MVT::f64: SrcReg = XMMArgRegs[FPRIdx++]; break;
2590 }
2591 unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
2592 // FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
2593 // Without this, EmitLiveInCopies may eliminate the livein if its only
2594 // use is a bitcast (which isn't turned into an instruction).
2595 unsigned ResultReg = createResultReg(RC);
2596 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
2597 TII.get(TargetOpcode::COPY), ResultReg)
2598 .addReg(DstReg, getKillRegState(true));
2599 updateValueMap(&Arg, ResultReg);
2600 }
2601 return true;
2602 }
2603
2604 static unsigned computeBytesPoppedByCallee(const X86Subtarget *Subtarget,
2605 CallingConv::ID CC,
2606 ImmutableCallSite *CS) {
2607 if (Subtarget->is64Bit())
2608 return 0;
2609 if (Subtarget->getTargetTriple().isOSMSVCRT())
2610 return 0;
2611 if (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2612 CC == CallingConv::HiPE)
2613 return 0;
2614 if (CS && !CS->paramHasAttr(1, Attribute::StructRet))
2615 return 0;
2616 if (CS && CS->paramHasAttr(1, Attribute::InReg))
2617 return 0;
2618 return 4;
2619 }
2620
2621 bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
2622 auto &OutVals = CLI.OutVals;
2623 auto &OutFlags = CLI.OutFlags;
2624 auto &OutRegs = CLI.OutRegs;
2625 auto &Ins = CLI.Ins;
2626 auto &InRegs = CLI.InRegs;
2627 CallingConv::ID CC = CLI.CallConv;
2628 bool &IsTailCall = CLI.IsTailCall;
2629 bool IsVarArg = CLI.IsVarArg;
2630 const Value *Callee = CLI.Callee;
2631 const char *SymName = CLI.SymName;
2632
2633 bool Is64Bit = Subtarget->is64Bit();
2634 bool IsWin64 = Subtarget->isCallingConvWin64(CC);
2635
2636 // Handle only C, fastcc, and webkit_js calling conventions for now.
2637 switch (CC) {
2638 default: return false;
2639 case CallingConv::C:
2640 case CallingConv::Fast:
2641 case CallingConv::WebKit_JS:
2642 case CallingConv::X86_FastCall:
2643 case CallingConv::X86_64_Win64:
2644 case CallingConv::X86_64_SysV:
2645 break;
2646 }
2647
2648 // Allow SelectionDAG isel to handle tail calls.
2649 if (IsTailCall)
2650 return false;
2651
2652 // fastcc with -tailcallopt is intended to provide a guaranteed
2653 // tail call optimization. Fastisel doesn't know how to do that.
2654 if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt)
2655 return false;
2656
2657 // Don't know how to handle Win64 varargs yet. Nothing special is needed for
2658 // x86-32, and the non-Windows x86-64 case is handled further below.
2659 if (IsVarArg && IsWin64)
2660 return false;
2661
2662 // Don't know about inalloca yet.
2663 if (CLI.CS && CLI.CS->hasInAllocaArgument())
2664 return false;
2665
2666 // Fast-isel doesn't know about callee-pop yet.
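 // (That is, conventions such as stdcall where the callee pops its own stack
 // arguments on return.)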
2667 if (X86::isCalleePop(CC, Subtarget->is64Bit(), IsVarArg,
2668 TM.Options.GuaranteedTailCallOpt))
2669 return false;
2670
2671 SmallVector<MVT, 16> OutVTs;
2672 SmallVector<unsigned, 16> ArgRegs;
2673
2674 // If this is a constant i1/i8/i16 argument, promote to i32 to avoid an extra
2675 // instruction. This is safe because it is common to all FastISel supported
2676 // calling conventions on x86.
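 // For example, passing the constant 'i8 1' becomes passing 'i32 1'
 // (sign- or zero-extended according to the argument's ext attribute), so no
 // separate extend instruction has to be emitted.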
2677 for (int i = 0, e = OutVals.size(); i != e; ++i) {
2678 Value *&Val = OutVals[i];
2679 ISD::ArgFlagsTy Flags = OutFlags[i];
2680 if (auto *CI = dyn_cast<ConstantInt>(Val)) {
2681 if (CI->getBitWidth() < 32) {
2682 if (Flags.isSExt())
2683 Val = ConstantExpr::getSExt(CI, Type::getInt32Ty(CI->getContext()));
2684 else
2685 Val = ConstantExpr::getZExt(CI, Type::getInt32Ty(CI->getContext()));
2686 }
2687 }
2688
2689 // Passing bools around ends up doing a trunc to i1 and passing it.
2690 // Codegen this as an argument + "and 1".
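 // For example, for '%b = trunc i8 %x to i1' whose only use is this call,
 // reuse the register already holding %x and emit an AND with 1 instead of
 // lowering the trunc itself.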
2691 MVT VT;
2692 auto *TI = dyn_cast<TruncInst>(Val);
2693 unsigned ResultReg;
2694 if (TI && TI->getType()->isIntegerTy(1) && CLI.CS &&
2695 (TI->getParent() == CLI.CS->getInstruction()->getParent()) &&
2696 TI->hasOneUse()) {
2697 Value *PrevVal = TI->getOperand(0);
2698 ResultReg = getRegForValue(PrevVal);
2699
2700 if (!ResultReg)
2701 return false;
2702
2703 if (!isTypeLegal(PrevVal->getType(), VT))
2704 return false;
2705
2706 ResultReg =
2707 fastEmit_ri(VT, VT, ISD::AND, ResultReg, hasTrivialKill(PrevVal), 1);
2708 } else {
2709 if (!isTypeLegal(Val->getType(), VT))
2710 return false;
2711 ResultReg = getRegForValue(Val);
2712 }
2713
2714 if (!ResultReg)
2715 return false;
2716
2717 ArgRegs.push_back(ResultReg);
2718 OutVTs.push_back(VT);
2719 }
2720
2721 // Analyze operands of the call, assigning locations to each operand.
2722 SmallVector<CCValAssign, 16> ArgLocs;
2723 CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, ArgLocs, CLI.RetTy->getContext());
2724
2725 // Allocate shadow area for Win64
2726 if (IsWin64)
2727 CCInfo.AllocateStack(32, 8);
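 // (Win64 reserves a fixed 32-byte home area for the four register
 // parameters; allocating it first keeps the offsets of any stack-passed
 // arguments correct.)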
2728
2729 CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86);
2730
2731 // Get a count of how many bytes are to be pushed on the stack.
2732 unsigned NumBytes = CCInfo.getNextStackOffset();
2733
2734 // Issue CALLSEQ_START
2735 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
2736 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
2737 .addImm(NumBytes);
2738
2739 // Walk the register/memloc assignments, inserting copies/loads.
2740 const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
2741 TM.getSubtargetImpl()->getRegisterInfo());
2742 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2743 CCValAssign const &VA = ArgLocs[i];
2744 const Value *ArgVal = OutVals[VA.getValNo()];
2745 MVT ArgVT = OutVTs[VA.getValNo()];
2746
2747 if (ArgVT == MVT::x86mmx)
2748 return false;
2749
2750 unsigned ArgReg = ArgRegs[VA.getValNo()];
2751
2752 // Promote the value if needed.
2753 switch (VA.getLocInfo()) {
2754 case CCValAssign::Full: break;
2755 case CCValAssign::SExt: {
2756 assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
2757 "Unexpected extend");
2758 bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg,
2759 ArgVT, ArgReg);
2760 assert(Emitted && "Failed to emit a sext!"); (void)Emitted;
2761 ArgVT = VA.getLocVT();
2762 break;
2763 }
2764 case CCValAssign::ZExt: {
2765 assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
2766 "Unexpected extend");
2767 bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg,
2768 ArgVT, ArgReg);
2769 assert(Emitted && "Failed to emit a zext!"); (void)Emitted;
2770 ArgVT = VA.getLocVT();
2771 break;
2772 }
2773 case CCValAssign::AExt: {
2774 assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
2775 "Unexpected extend");
2776 bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(), ArgReg,
2777 ArgVT, ArgReg);
2778 if (!Emitted)
2779 Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg,
2780 ArgVT, ArgReg);
2781 if (!Emitted)
2782 Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg,
2783 ArgVT, ArgReg);
2784
2785 assert(Emitted && "Failed to emit an aext!"); (void)Emitted;
2786 ArgVT = VA.getLocVT();
2787 break;
2788 }
2789 case CCValAssign::BCvt: {
2790 ArgReg = fastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, ArgReg,
2791 /*TODO: Kill=*/false);
2792 assert(ArgReg && "Failed to emit a bitcast!");
2793 ArgVT = VA.getLocVT();
2794 break;
2795 }
2796 case CCValAssign::VExt:
2797 // VExt has not been implemented, so this should be impossible to reach
2798 // for now. However, fall back to SelectionDAG isel once it is implemented.
2799 return false;
2800 case CCValAssign::AExtUpper:
2801 case CCValAssign::SExtUpper:
2802 case CCValAssign::ZExtUpper:
2803 case CCValAssign::FPExt:
2804 llvm_unreachable("Unexpected loc info!");
2805 case CCValAssign::Indirect:
2806 // FIXME: Indirect doesn't need extending, but fast-isel doesn't fully
2807 // support this.
2808 return false;
2809 }
2810
2811 if (VA.isRegLoc()) {
2812 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
2813 TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(ArgReg);
2814 OutRegs.push_back(VA.getLocReg());
2815 } else {
2816 assert(VA.isMemLoc());
2817
2818 // Don't emit stores for undef values.
2819 if (isa<UndefValue>(ArgVal))
2820 continue;
2821
2822 unsigned LocMemOffset = VA.getLocMemOffset();
2823 X86AddressMode AM;
2824 AM.Base.Reg = RegInfo->getStackRegister();
2825 AM.Disp = LocMemOffset;
2826 ISD::ArgFlagsTy Flags = OutFlags[VA.getValNo()];
2827 unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType());
2828 MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
2829 MachinePointerInfo::getStack(LocMemOffset), MachineMemOperand::MOStore,
2830 ArgVT.getStoreSize(), Alignment);
2831 if (Flags.isByVal()) {
2832 X86AddressMode SrcAM;
2833 SrcAM.Base.Reg = ArgReg;
2834 if (!TryEmitSmallMemcpy(AM, SrcAM, Flags.getByValSize()))
2835 return false;
2836 } else if (isa<ConstantInt>(ArgVal) || isa<ConstantFP>(ArgVal)) {
2837 // If this is a really simple value, emit this with the Value* version
2838 // of X86FastEmitStore. If it isn't simple, we don't want to do this,
2839 // as it can cause us to reevaluate the argument.
2840 if (!X86FastEmitStore(ArgVT, ArgVal, AM, MMO))
2841 return false;
2842 } else {
2843 bool ValIsKill = hasTrivialKill(ArgVal);
2844 if (!X86FastEmitStore(ArgVT, ArgReg, ValIsKill, AM, MMO))
2845 return false;
2846 }
2847 }
2848 }
2849
2850 // ELF / PIC requires the GOT pointer to be in EBX before making a function
2851 // call through the PLT.
2852 if (Subtarget->isPICStyleGOT()) {
2853 unsigned Base = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
2854 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
2855 TII.get(TargetOpcode::COPY), X86::EBX).addReg(Base);
2856 }
2857
2858 if (Is64Bit && IsVarArg && !IsWin64) {
2859 // From AMD64 ABI document:
2860 // For calls that may call functions that use varargs or stdargs
2861 // (prototype-less calls or calls to functions containing ellipsis (...) in
2862 // the declaration) %al is used as a hidden argument to specify the number
2863 // of SSE registers used. The contents of %al do not need to match exactly
2864 // the number of registers, but must be an upper bound on the number of SSE
2865 // registers used and must be in the range 0 - 8 inclusive.
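 // For example, a variadic call passing a single double in XMM0 can set %al
 // to 1; since only an upper bound is required, any value from 1 through 8
 // would also be ABI-conformant in that case.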
2866
2867 // Count the number of XMM registers allocated.
2868 static const MCPhysReg XMMArgRegs[] = {
2869 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2870 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2871 };
2872 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
2873 assert((Subtarget->hasSSE1() || !NumXMMRegs)
2874 && "SSE registers cannot be used when SSE is disabled");
2875 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri),
2876 X86::AL).addImm(NumXMMRegs);
2877 }
2878
2879 // Materialize callee address in a register. FIXME: GV address can be
2880 // handled with a CALLpcrel32 instead.
2881 X86AddressMode CalleeAM;
2882 if (!X86SelectCallAddress(Callee, CalleeAM))
2883 return false;
2884
2885 unsigned CalleeOp = 0;
2886 const GlobalValue *GV = nullptr;
2887 if (CalleeAM.GV != nullptr) {
2888 GV = CalleeAM.GV;
2889 } else if (CalleeAM.Base.Reg != 0) {
2890 CalleeOp = CalleeAM.Base.Reg;
2891 } else
2892 return false;
2893
2894 // Issue the call.
2895 MachineInstrBuilder MIB;
2896 if (CalleeOp) {
2897 // Register-indirect call.
2898 unsigned CallOpc = Is64Bit ? X86::CALL64r : X86::CALL32r;
2899 MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc))
2900 .addReg(CalleeOp);
2901 } else {
2902 // Direct call.
2903 assert(GV && "Not a direct call");
2904 unsigned CallOpc = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32;
2905
2906 // See if we need any target-specific flags on the GV operand.
2907 unsigned char OpFlags = 0;
2908
2909 // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
2910 // external symbols must go through the PLT in PIC mode. If the symbol
2911 // has hidden or protected visibility, or if it is static or local, then
2912 // we don't need to use the PLT - we can directly call it.
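 // (With MO_PLT the callee is emitted as 'callee@PLT', routing the call
 // through the procedure linkage table.)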
2913 if (Subtarget->isTargetELF() &&
2914 TM.getRelocationModel() == Reloc::PIC_ &&
2915 GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
2916 OpFlags = X86II::MO_PLT;
2917 } else if (Subtarget->isPICStyleStubAny() &&
2918 (GV->isDeclaration() || GV->isWeakForLinker()) &&
2919 (!Subtarget->getTargetTriple().isMacOSX() ||
2920 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
2921 // PC-relative references to external symbols should go through $stub,
2922 // unless we're building with the Leopard linker or later, which
2923 // automatically synthesizes these stubs.
2924 OpFlags = X86II::MO_DARWIN_STUB;
2925 }
2926
2927 MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc));
2928 if (SymName)
2929 MIB.addExternalSymbol(SymName, OpFlags);
2930 else
2931 MIB.addGlobalAddress(GV, 0, OpFlags);
2932 }
2933
2934 // Add a register mask operand representing the call-preserved registers.
2935 // Proper defs for return values will be added by setPhysRegsDeadExcept().
2936 MIB.addRegMask(TRI.getCallPreservedMask(CC));
2937
2938 // Add an implicit use of the GOT pointer in EBX.
2939 if (Subtarget->isPICStyleGOT())
2940 MIB.addReg(X86::EBX, RegState::Implicit);
2941
2942 if (Is64Bit && IsVarArg && !IsWin64)
2943 MIB.addReg(X86::AL, RegState::Implicit);
2944
2945 // Add implicit physical register uses to the call.
2946 for (auto Reg : OutRegs)
2947 MIB.addReg(Reg, RegState::Implicit);
2948
2949 // Issue CALLSEQ_END
2950 unsigned NumBytesForCalleeToPop =
2951 computeBytesPoppedByCallee(Subtarget, CC, CLI.CS);
2952 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
2953 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp))
2954 .addImm(NumBytes).addImm(NumBytesForCalleeToPop);
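 // The first immediate is the number of bytes the caller pushed; the second
 // is the number of bytes the callee itself pops on return (nonzero only in
 // the 32-bit sret case computed above).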
2955
2956 // Now handle call return values.
2957 SmallVector<CCValAssign, 16> RVLocs;
2958 CCState CCRetInfo(CC, IsVarArg, *FuncInfo.MF, RVLocs,
2959 CLI.RetTy->getContext());
2960 CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86);
2961
2962 // Copy all of the result registers out of their specified physreg.
2963 unsigned ResultReg = FuncInfo.CreateRegs(CLI.RetTy);
2964 for (unsigned i = 0; i != RVLocs.size(); ++i) {
2965 CCValAssign &VA = RVLocs[i];
2966 EVT CopyVT = VA.getValVT();
2967 unsigned CopyReg = ResultReg + i;
2968
2969 // If this is x86-64, and we disabled SSE, we can't return FP values
2970 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
2971 ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
2972 report_fatal_error("SSE register return with SSE disabled");
2973 }
2974
2975 // If we prefer to use the value in xmm registers, copy it out as f80 and
2976 // use a truncate to move it from fp stack reg to xmm reg.
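 // (FP0/FP1 are the pseudo registers modeling x87 stack-top returns used by
 // the 32-bit calling conventions.)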
2977 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2978 isScalarFPTypeInSSEReg(VA.getValVT())) {
2979 CopyVT = MVT::f80;
2980 CopyReg = createResultReg(&X86::RFP80RegClass);
2981 }
2982
2983 // Copy out the result.
2984 BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
2985 TII.get(TargetOpcode::COPY), CopyReg).addReg(VA.getLocReg());
2986 InRegs.push_back(VA.getLocReg());
2987
2988 // Round the f80 to the right size, which also moves it to the appropriate
2989 // xmm register. This is accomplished by storing the f80 value in memory
2990 // and then loading it back.
2991 if (CopyVT != VA.getValVT()) {
2992 EVT ResVT = VA.getValVT();
2993 unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64;
2994 unsigned MemSize = ResVT.getSizeInBits()/8;
2995 int FI = MFI.CreateStackObject(MemSize, MemSize, false);
2996 addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
2997 TII.get(Opc)), FI)
2998 .addReg(CopyReg);
2999 Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm;
3000 addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
3001 TII.get(Opc), ResultReg + i), FI);
3002 }
3003 }
3004
3005 CLI.ResultReg = ResultReg;
3006 CLI.NumResultRegs = RVLocs.size();
3007 CLI.Call = MIB;
3008
3009 return true;
3010 }
3011
3012 bool
3013 X86FastISel::fastSelectInstruction(const Instruction *I) {
3014 switch (I->getOpcode()) {
3015 default: break;
3016 case Instruction::Load:
3017 return X86SelectLoad(I);
3018 case Instruction::Store:
3019 return X86SelectStore(I);
3020 case Instruction::Ret:
3021 return X86SelectRet(I);
3022 case Instruction::ICmp:
3023 case Instruction::FCmp:
3024 return X86SelectCmp(I);
3025 case Instruction::ZExt:
3026 return X86SelectZExt(I);
3027 case Instruction::Br:
3028 return X86SelectBranch(I);
3029 case Instruction::LShr:
3030 case Instruction::AShr:
3031 case Instruction::Shl:
3032 return X86SelectShift(I);
3033 case Instruction::SDiv:
3034 case Instruction::UDiv:
3035 case Instruction::SRem:
3036 case Instruction::URem:
3037 return X86SelectDivRem(I);
3038 case Instruction::Select:
3039 return X86SelectSelect(I);
3040 case Instruction::Trunc:
3041 return X86SelectTrunc(I);
3042 case Instruction::FPExt:
3043 return X86SelectFPExt(I);
3044 case Instruction::FPTrunc:
3045 return X86SelectFPTrunc(I);
3046 case Instruction::IntToPtr: // Deliberate fall-through.
3047 case Instruction::PtrToInt: {
3048 EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
3049 EVT DstVT = TLI.getValueType(I->getType());
3050 if (DstVT.bitsGT(SrcVT))
3051 return X86SelectZExt(I);
3052 if (DstVT.bitsLT(SrcVT))
3053 return X86SelectTrunc(I);
3054 unsigned Reg = getRegForValue(I->getOperand(0));
3055 if (Reg == 0) return false;
3056 updateValueMap(I, Reg);
3057 return true;
3058 }
3059 }
3060
3061 return false;
3062 }
3063
3064 unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {
3065 if (VT > MVT::i64)
3066 return 0;
3067
3068 uint64_t Imm = CI->getZExtValue();
3069 if (Imm == 0) {
3070 unsigned SrcReg = fastEmitInst_(X86::MOV32r0, &X86::GR32RegClass);
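 // MOV32r0 is the 'xor reg, reg' zero idiom, so zero constants are
 // materialized without an immediate operand.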
3071 switch (VT.SimpleTy) {
3072 default: llvm_unreachable("Unexpected value type");
3073 case MVT::i1:
3074