llvm.org GIT mirror llvm / 4c3428b
Elide argument copies during instruction selection

Summary:
Avoids tons of prologue boilerplate when arguments are passed in memory and left in memory. This can happen in a debug build or in a release build when an argument alloca is escaped. This will dramatically affect the code size of x86 debug builds, because X86 fast isel doesn't handle arguments passed in memory at all. It only handles the x86_64 case of up to 6 basic register parameters.

This is implemented by analyzing the entry block before ISel to identify copy elision candidates. A copy elision candidate is an argument that is used to fully initialize an alloca before any other possibly escaping uses of that alloca. If an argument is a copy elision candidate, we set a flag on the InputArg. If the target generates loads from a fixed stack object that matches the size and alignment requirements of the alloca, the SelectionDAG builder will delete the stack object created for the alloca and replace it with the fixed stack object. The load is left behind to satisfy any remaining uses of the argument value. The store is now dead and is therefore elided. The fixed stack object is also marked as mutable, as it may now be modified by the user, and it would be invalid to rematerialize the initial load from it.

Supersedes D28388

Fixes PR26328

Reviewers: chandlerc, MatzeB, qcolombet, inglorion, hans

Subscribers: igorb, llvm-commits

Differential Revision: https://reviews.llvm.org/D29668

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@296683 91177308-0d34-0410-b5e6-96231b3b80d8

Reid Kleckner 2 years ago
16 changed file(s) with 693 addition(s) and 95 deletion(s).
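As a source-level illustration (hypothetical example, not part of this patch): the change targets arguments that are immediately copied into a local alloca whose address escapes, so the value has to live in memory anyway. Previously, -O0 lowering loaded the argument from its incoming stack slot and stored it into a separate stack object; with elision, the alloca simply reuses the fixed argument slot and the copy disappears.

void escape(int *p);

void f(int x) {
  int local = x;   // argument copied into a local alloca
  escape(&local);  // the alloca escapes, so it must stay in memory
}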
558558 return Objects[ObjectIdx+NumFixedObjects].isAliased;
559559 }
560560
561 /// isImmutableObjectIndex - Returns true if the specified index corresponds
562 /// to an immutable object.
561 /// Returns true if the specified index corresponds to an immutable object.
563562 bool isImmutableObjectIndex(int ObjectIdx) const {
564563 // Tail calling functions can clobber their function arguments.
565564 if (HasTailCall)
566565 return false;
567566 assert(unsigned(ObjectIdx+NumFixedObjects) < Objects.size() &&
568567 "Invalid Object Idx!");
569568 return Objects[ObjectIdx+NumFixedObjects].isImmutable;
569 }
570
571 /// Marks the immutability of an object.
572 void setIsImmutableObjectIndex(int ObjectIdx, bool Immutable) {
573 assert(unsigned(ObjectIdx+NumFixedObjects) < Objects.size() &&
574 "Invalid Object Idx!");
575 Objects[ObjectIdx+NumFixedObjects].isImmutable = Immutable;
570576 }
571577
572578 /// Returns true if the specified index corresponds to a spill slot.
5353 const TargetInstrInfo *TII;
5454 const TargetLowering *TLI;
5555 bool FastISelFailed;
56 SmallPtrSet<const Instruction *, 4> ElidedArgCopyInstrs;
5657
5758 static char ID;
5859
4444 unsigned OrigAlign : 5; ///< Log 2 of original alignment
4545 unsigned IsInConsecutiveRegsLast : 1;
4646 unsigned IsInConsecutiveRegs : 1;
47 unsigned IsCopyElisionCandidate : 1; ///< Argument copy elision candidate
4748
4849 unsigned ByValSize; ///< Byval struct size
4950
5354 IsReturned(0), IsSplit(0), IsInAlloca(0), IsSplitEnd(0),
5455 IsSwiftSelf(0), IsSwiftError(0), IsHva(0), IsHvaStart(0),
5556 IsSecArgPass(0), ByValAlign(0), OrigAlign(0),
56 IsInConsecutiveRegsLast(0), IsInConsecutiveRegs(0), ByValSize(0) {
57 IsInConsecutiveRegsLast(0), IsInConsecutiveRegs(0),
58 IsCopyElisionCandidate(0), ByValSize(0) {
5759 static_assert(sizeof(*this) == 2 * sizeof(unsigned), "flags are too big");
5860 }
5961
107109
108110 bool isSplitEnd() const { return IsSplitEnd; }
109111 void setSplitEnd() { IsSplitEnd = 1; }
112
113 bool isCopyElisionCandidate() const { return IsCopyElisionCandidate; }
114 void setCopyElisionCandidate() { IsCopyElisionCandidate = 1; }
110115
111116 unsigned getByValAlign() const { return (1U << ByValAlign) / 2; }
112117 void setByValAlign(unsigned A) {
8888 assert(!MInsn && "Already initialized?");
8989
9090 assert((!E || E->isValid()) && "Expected valid expression");
91 assert(~FI && "Expected valid index");
91 assert(FI != INT_MAX && "Expected valid index");
9292
9393 FrameIndexExprs.push_back({FI, E});
9494 }
80278027 return true;
80288028 }
80298029
8030 typedef DenseMap<const Argument *,
8031 std::pair<const AllocaInst *, const StoreInst *>>
8032 ArgCopyElisionMapTy;
8033
8034 /// Scan the entry block of the function in FuncInfo for arguments that look
8035 /// like copies into a local alloca. Record any copied arguments in
8036 /// ArgCopyElisionCandidates.
8037 static void
8038 findArgumentCopyElisionCandidates(const DataLayout &DL,
8039 FunctionLoweringInfo *FuncInfo,
8040 ArgCopyElisionMapTy &ArgCopyElisionCandidates) {
8041 // Record the state of every static alloca used in the entry block. Argument
8042 // allocas are all used in the entry block, so we need approximately as many
8043 // entries as we have arguments.
8044 enum StaticAllocaInfo { Unknown, Clobbered, Elidable };
8045 SmallDenseMap<const AllocaInst *, StaticAllocaInfo, 8> StaticAllocas;
8046 unsigned NumArgs = FuncInfo->Fn->getArgumentList().size();
8047 StaticAllocas.reserve(NumArgs * 2);
8048
8049 auto GetInfoIfStaticAlloca = [&](const Value *V) -> StaticAllocaInfo * {
8050 if (!V)
8051 return nullptr;
8052 V = V->stripPointerCasts();
8053 const auto *AI = dyn_cast<AllocaInst>(V);
8054 if (!AI || !AI->isStaticAlloca() || !FuncInfo->StaticAllocaMap.count(AI))
8055 return nullptr;
8056 auto Iter = StaticAllocas.insert({AI, Unknown});
8057 return &Iter.first->second;
8058 };
8059
8060 // Look for stores of arguments to static allocas. Look through bitcasts and
8061 // GEPs to handle type coercions, as long as the alloca is fully initialized
8062 // by the store. Any non-store use of an alloca escapes it and any subsequent
8063 // unanalyzed store might write it.
8064 // FIXME: Handle structs initialized with multiple stores.
8065 for (const Instruction &I : FuncInfo->Fn->getEntryBlock()) {
8066 // Look for stores, and handle non-store uses conservatively.
8067 const auto *SI = dyn_cast<StoreInst>(&I);
8068 if (!SI) {
8069 // We will look through cast uses, so ignore them completely.
8070 if (I.isCast())
8071 continue;
8072 // Ignore debug info intrinsics, they don't escape or store to allocas.
8073 if (isa<DbgInfoIntrinsic>(I))
8074 continue;
8075 // This is an unknown instruction. Assume it escapes or writes to all
8076 // static alloca operands.
8077 for (const Use &U : I.operands()) {
8078 if (StaticAllocaInfo *Info = GetInfoIfStaticAlloca(U))
8079 *Info = StaticAllocaInfo::Clobbered;
8080 }
8081 continue;
8082 }
8083
8084 // If the stored value is a static alloca, mark it as escaped.
8085 if (StaticAllocaInfo *Info = GetInfoIfStaticAlloca(SI->getValueOperand()))
8086 *Info = StaticAllocaInfo::Clobbered;
8087
8088 // Check if the destination is a static alloca.
8089 const Value *Dst = SI->getPointerOperand()->stripPointerCasts();
8090 StaticAllocaInfo *Info = GetInfoIfStaticAlloca(Dst);
8091 if (!Info)
8092 continue;
8093 const AllocaInst *AI = cast<AllocaInst>(Dst);
8094
8095 // Skip allocas that have been initialized or clobbered.
8096 if (*Info != StaticAllocaInfo::Unknown)
8097 continue;
8098
8099 // Check if the stored value is an argument, and that this store fully
8100 // initializes the alloca. Don't elide copies from the same argument twice.
8101 const Value *Val = SI->getValueOperand()->stripPointerCasts();
8102 const auto *Arg = dyn_cast<Argument>(Val);
8103 if (!Arg || Arg->hasInAllocaAttr() || Arg->hasByValAttr() ||
8104 Arg->getType()->isEmptyTy() ||
8105 DL.getTypeStoreSize(Arg->getType()) !=
8106 DL.getTypeAllocSize(AI->getAllocatedType()) ||
8107 ArgCopyElisionCandidates.count(Arg)) {
8108 *Info = StaticAllocaInfo::Clobbered;
8109 continue;
8110 }
8111
8112 DEBUG(dbgs() << "Found argument copy elision candidate: " << *AI << '\n');
8113
8114 // Mark this alloca and store for argument copy elision.
8115 *Info = StaticAllocaInfo::Elidable;
8116 ArgCopyElisionCandidates.insert({Arg, {AI, SI}});
8117
8118 // Stop scanning if we've seen all arguments. This will happen early in -O0
8119 // builds, which is useful, because -O0 builds have large entry blocks and
8120 // many allocas.
8121 if (ArgCopyElisionCandidates.size() == NumArgs)
8122 break;
8123 }
8124 }
8125
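A hedged source-level sketch of the classification above (hypothetical function names, not from the patch): an alloca that is fully initialized by a store of an argument before any escaping use becomes Elidable, while an alloca with an earlier unanalyzed use is marked Clobbered.

void sink(int *p);

void classify(int x, int y) {
  int a = x;   // the argument store fully initializes 'a' first: Elidable
  int b;
  sink(&b);    // unknown call escapes 'b' before it is initialized: Clobbered
  b = y;       // this later argument store no longer qualifies
  sink(&a);
}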
8126 /// Try to elide argument copies from memory into a local alloca. Succeeds if
8127 /// ArgVal is a load from a suitable fixed stack object.
8128 static void tryToElideArgumentCopy(
8129 FunctionLoweringInfo *FuncInfo, SmallVectorImpl<SDValue> &Chains,
8130 DenseMap<int, int> &ArgCopyElisionFrameIndexMap,
8131 SmallPtrSetImpl<const Instruction *> &ElidedArgCopyInstrs,
8132 ArgCopyElisionMapTy &ArgCopyElisionCandidates, const Argument &Arg,
8133 SDValue ArgVal, bool &ArgHasUses) {
8134 // Check if this is a load from a fixed stack object.
8135 auto *LNode = dyn_cast<LoadSDNode>(ArgVal);
8136 if (!LNode)
8137 return;
8138 auto *FINode = dyn_cast<FrameIndexSDNode>(LNode->getBasePtr().getNode());
8139 if (!FINode)
8140 return;
8141
8142 // Check that the fixed stack object is the right size and alignment.
8143 // Look at the alignment that the user wrote on the alloca instead of looking
8144 // at the stack object.
8145 auto ArgCopyIter = ArgCopyElisionCandidates.find(&Arg);
8146 assert(ArgCopyIter != ArgCopyElisionCandidates.end());
8147 const AllocaInst *AI = ArgCopyIter->second.first;
8148 int FixedIndex = FINode->getIndex();
8149 int &AllocaIndex = FuncInfo->StaticAllocaMap[AI];
8150 int OldIndex = AllocaIndex;
8151 MachineFrameInfo &MFI = FuncInfo->MF->getFrameInfo();
8152 if (MFI.getObjectSize(FixedIndex) != MFI.getObjectSize(OldIndex)) {
8153 DEBUG(dbgs() << " argument copy elision failed due to bad fixed stack "
8154 "object size\n");
8155 return;
8156 }
8157 unsigned RequiredAlignment = AI->getAlignment();
8158 if (!RequiredAlignment) {
8159 RequiredAlignment = FuncInfo->MF->getDataLayout().getABITypeAlignment(
8160 AI->getAllocatedType());
8161 }
8162 if (MFI.getObjectAlignment(FixedIndex) < RequiredAlignment) {
8163 DEBUG(dbgs() << " argument copy elision failed: alignment of alloca "
8164 "greater than stack argument alignment ("
8165 << RequiredAlignment << " vs "
8166 << MFI.getObjectAlignment(FixedIndex) << ")\n");
8167 return;
8168 }
8169
8170 // Perform the elision. Delete the old stack object and replace its only use
8171 // in the variable info map. Mark the stack object as mutable.
8172 DEBUG({
8173 dbgs() << "Eliding argument copy from " << Arg << " to " << *AI << '\n'
8174 << " Replacing frame index " << OldIndex << " with " << FixedIndex
8175 << '\n';
8176 });
8177 MFI.RemoveStackObject(OldIndex);
8178 MFI.setIsImmutableObjectIndex(FixedIndex, false);
8179 AllocaIndex = FixedIndex;
8180 ArgCopyElisionFrameIndexMap.insert({OldIndex, FixedIndex});
8181 Chains.push_back(ArgVal.getValue(1));
8182
8183 // Avoid emitting code for the store implementing the copy.
8184 const StoreInst *SI = ArgCopyIter->second.second;
8185 ElidedArgCopyInstrs.insert(SI);
8186
8187 // Check for uses of the argument again so that we can avoid exporting ArgVal
8188 // if it isn't used by anything other than the store.
8189 for (const Value *U : Arg.users()) {
8190 if (U != SI) {
8191 ArgHasUses = true;
8192 break;
8193 }
8194 }
8195 }
8196
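A hypothetical example of the alignment bail-out above (source-level sketch, not LLVM code): the alignment requested on the alloca exceeds what the incoming argument slot guarantees, so the copy is kept rather than elided (compare the high_alignment test near the end of this diff).

void take(int *p);

void high_align(int x) {
  alignas(128) int local = x;  // needs 128-byte alignment; a 32-bit stack
                               // argument slot is typically only 4-byte aligned
  take(&local);
}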
80308197 void SelectionDAGISel::LowerArguments(const Function &F) {
80318198 SelectionDAG &DAG = SDB->DAG;
80328199 SDLoc dl = SDB->getCurSDLoc();
80488215 ISD::InputArg::NoArgIndex, 0);
80498216 Ins.push_back(RetArg);
80508217 }
8218
8219 // Look for stores of arguments to static allocas. Mark such arguments with a
8220 // flag to ask the target to give us the memory location of that argument if
8221 // available.
8222 ArgCopyElisionMapTy ArgCopyElisionCandidates;
8223 findArgumentCopyElisionCandidates(DL, FuncInfo, ArgCopyElisionCandidates);
80518224
80528225 // Set up the incoming argument description vector.
80538226 unsigned Idx = 0;
81268299 if (NeedsRegBlock)
81278300 Flags.setInConsecutiveRegs();
81288301 Flags.setOrigAlign(OriginalAlignment);
8302 if (ArgCopyElisionCandidates.count(&Arg))
8303 Flags.setCopyElisionCandidate();
81298304
81308305 MVT RegisterVT = TLI->getRegisterType(*CurDAG->getContext(), VT);
81318306 unsigned NumRegs = TLI->getNumRegisters(*CurDAG->getContext(), VT);
81988373 ++i;
81998374 }
82008375
8376 SmallVector<SDValue, 4> Chains;
8377 DenseMap<int, int> ArgCopyElisionFrameIndexMap;
82018378 for (const Argument &Arg : F.args()) {
82028379 ++Idx;
82038380 SmallVector<SDValue, 4> ArgValues;
82048381 SmallVector<EVT, 4> ValueVTs;
82058382 ComputeValueVTs(*TLI, DAG.getDataLayout(), Arg.getType(), ValueVTs);
82068383 unsigned NumValues = ValueVTs.size();
8384 if (NumValues == 0)
8385 continue;
8386
8387 bool ArgHasUses = !Arg.use_empty();
8388
8389 // Elide the copying store if the target loaded this argument from a
8390 // suitable fixed stack object.
8391 if (Ins[i].Flags.isCopyElisionCandidate()) {
8392 tryToElideArgumentCopy(FuncInfo, Chains, ArgCopyElisionFrameIndexMap,
8393 ElidedArgCopyInstrs, ArgCopyElisionCandidates, Arg,
8394 InVals[i], ArgHasUses);
8395 }
82078396
82088397 // If this argument is unused then remember its value. It is used to generate
82098398 // debugging information.
82108399 bool isSwiftErrorArg =
82118400 TLI->supportSwiftError() &&
82128401 F.getAttributes().hasAttribute(Idx, Attribute::SwiftError);
8213 if (Arg.use_empty() && NumValues && !isSwiftErrorArg) {
8402 if (!ArgHasUses && !isSwiftErrorArg) {
82148403 SDB->setUnusedArgValue(&Arg, InVals[i]);
82158404
82168405 // Also remember any frame index for use in FastISel.
82278416 // Even an apparent 'unused' swifterror argument needs to be returned. So
82288417 // we do generate a copy for it that can be used on return from the
82298418 // function.
8230 if (!Arg.use_empty() || isSwiftErrorArg) {
8419 if (ArgHasUses || isSwiftErrorArg) {
82318420 Optional<ISD::NodeType> AssertOp;
82328421 if (F.getAttributes().hasAttribute(Idx, Attribute::SExt))
82338422 AssertOp = ISD::AssertSext;
82348423 else if (F.getAttributes().hasAttribute(Idx, Attribute::ZExt))
82358424 AssertOp = ISD::AssertZext;
82368425
8237 ArgValues.push_back(getCopyFromParts(DAG, dl, &InVals[i],
8238 NumParts, PartVT, VT,
8239 nullptr, AssertOp));
8426 ArgValues.push_back(getCopyFromParts(DAG, dl, &InVals[i], NumParts,
8427 PartVT, VT, nullptr, AssertOp));
82408428 }
82418429
82428430 i += NumParts;
82908478 }
82918479 }
82928480
8481 if (!Chains.empty()) {
8482 Chains.push_back(NewRoot);
8483 NewRoot = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
8484 }
8485
8486 DAG.setRoot(NewRoot);
8487
82938488 assert(i == InVals.size() && "Argument register count mismatch!");
8489
8490 // If any argument copy elisions occurred and we have debug info, update the
8491 // stale frame indices used in the dbg.declare variable info table.
8492 MachineFunction::VariableDbgInfoMapTy &DbgDeclareInfo = MF->getVariableDbgInfo();
8493 if (!DbgDeclareInfo.empty() && !ArgCopyElisionFrameIndexMap.empty()) {
8494 for (MachineFunction::VariableDbgInfo &VI : DbgDeclareInfo) {
8495 auto I = ArgCopyElisionFrameIndexMap.find(VI.Slot);
8496 if (I != ArgCopyElisionFrameIndexMap.end())
8497 VI.Slot = I->second;
8498 }
8499 }
82948500
82958501 // Finally, if the target has anything special to do, allow it to do so.
82968502 EmitFunctionEntryCode();
712712 bool &HadTailCall) {
713713 // Lower the instructions. If a call is emitted as a tail call, cease emitting
714714 // nodes for this block.
715 for (BasicBlock::const_iterator I = Begin; I != End && !SDB->HasTailCall; ++I)
716 SDB->visit(*I);
715 for (BasicBlock::const_iterator I = Begin; I != End && !SDB->HasTailCall; ++I) {
716 if (!ElidedArgCopyInstrs.count(&*I))
717 SDB->visit(*I);
718 }
717719
718720 // Make sure the root of the DAG is up-to-date.
719721 CurDAG->setRoot(SDB->getControlRoot());
15631565 const Instruction *Inst = &*std::prev(BI);
15641566
15651567 // If we no longer require this instruction, skip it.
1566 if (isFoldedOrDeadInstruction(Inst, FuncInfo)) {
1568 if (isFoldedOrDeadInstruction(Inst, FuncInfo) ||
1569 ElidedArgCopyInstrs.count(Inst)) {
15671570 --NumFastIselRemaining;
15681571 continue;
15691572 }
16931696
16941697 FinishBasicBlock();
16951698 FuncInfo->PHINodesToUpdate.clear();
1699 ElidedArgCopyInstrs.clear();
16961700 }
16971701
16981702 propagateSwiftErrorVRegs(FuncInfo);
26902690 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
26912691 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
26922692 EVT ValVT;
2693 MVT PtrVT = getPointerTy(DAG.getDataLayout());
26932694
26942695 // If value is passed by pointer we have address passed instead of the value
26952696 // itself. No need to extend if the mask value and location share the same
27282729 if (CallConv == CallingConv::X86_INTR) {
27292730 MFI.setObjectOffset(FI, Offset);
27302731 }
2731 return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
2732 } else {
2733 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits()/8,
2734 VA.getLocMemOffset(), isImmutable);
2735
2736 // Set SExt or ZExt flag.
2737 if (VA.getLocInfo() == CCValAssign::ZExt) {
2738 MFI.setObjectZExt(FI, true);
2739 } else if (VA.getLocInfo() == CCValAssign::SExt) {
2740 MFI.setObjectSExt(FI, true);
2741 }
2742
2743 // Adjust SP offset of interrupt parameter.
2744 if (CallConv == CallingConv::X86_INTR) {
2745 MFI.setObjectOffset(FI, Offset);
2746 }
2747
2748 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
2749 SDValue Val = DAG.getLoad(
2750 ValVT, dl, Chain, FIN,
2751 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2752 return ExtendedInMem ?
2753 DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val;
2754 }
2732 return DAG.getFrameIndex(FI, PtrVT);
2733 }
2734
2735 // This is an argument in memory. We might be able to perform copy elision.
2736 if (Flags.isCopyElisionCandidate()) {
2737 EVT ArgVT = Ins[i].ArgVT;
2738 SDValue PartAddr;
2739 if (Ins[i].PartOffset == 0) {
2740 // If this is a one-part value or the first part of a multi-part value,
2741 // create a stack object for the entire argument value type and return a
2742 // load from our portion of it. This assumes that if the first part of an
2743 // argument is in memory, the rest will also be in memory.
2744 int FI = MFI.CreateFixedObject(ArgVT.getSizeInBits() / 8,
2745 VA.getLocMemOffset(), /*Immutable=*/false);
2746 PartAddr = DAG.getFrameIndex(FI, PtrVT);
2747 return DAG.getLoad(
2748 ValVT, dl, Chain, PartAddr,
2749 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2750 } else {
2751 // This is not the first piece of an argument in memory. See if there is
2752 // already a fixed stack object including this offset. If so, assume it
2753 // was created by the PartOffset == 0 branch above and create a load from
2754 // the appropriate offset into it.
2755 int64_t PartBegin = VA.getLocMemOffset();
2756 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
2757 int FI = MFI.getObjectIndexBegin();
2758 for (; MFI.isFixedObjectIndex(FI); ++FI) {
2759 int64_t ObjBegin = MFI.getObjectOffset(FI);
2760 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
2761 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
2762 break;
2763 }
2764 if (MFI.isFixedObjectIndex(FI)) {
2765 SDValue Addr =
2766 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
2767 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
2768 return DAG.getLoad(
2769 ValVT, dl, Chain, Addr,
2770 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
2771 Ins[i].PartOffset));
2772 }
2773 }
2774 }
2775
2776 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
2777 VA.getLocMemOffset(), isImmutable);
2778
2779 // Set SExt or ZExt flag.
2780 if (VA.getLocInfo() == CCValAssign::ZExt) {
2781 MFI.setObjectZExt(FI, true);
2782 } else if (VA.getLocInfo() == CCValAssign::SExt) {
2783 MFI.setObjectSExt(FI, true);
2784 }
2785
2786 // Adjust SP offset of interrupt parameter.
2787 if (CallConv == CallingConv::X86_INTR) {
2788 MFI.setObjectOffset(FI, Offset);
2789 }
2790
2791 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
2792 SDValue Val = DAG.getLoad(
2793 ValVT, dl, Chain, FIN,
2794 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2795 return ExtendedInMem ? DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val)
2796 : Val;
27552797 }
27562798
27572799 // FIXME: Get this from tablegen.
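The search above for an existing fixed object that already covers a later part of a split argument reduces to an interval-containment check. A standalone sketch with assumed offsets (not LLVM code):

#include <cassert>
#include <cstdint>

// True if the byte range [PartBegin, PartBegin + PartSize) lies entirely
// inside the object's range [ObjBegin, ObjBegin + ObjSize).
static bool partWithinObject(int64_t ObjBegin, int64_t ObjSize,
                             int64_t PartBegin, int64_t PartSize) {
  return ObjBegin <= PartBegin && PartBegin + PartSize <= ObjBegin + ObjSize;
}

int main() {
  // Assumed layout: an i64 argument at stack offset 8, split into two
  // 4-byte parts; the second part begins at offset 12.
  assert(partWithinObject(/*ObjBegin=*/8, /*ObjSize=*/8,
                          /*PartBegin=*/12, /*PartSize=*/4));
  return 0;
}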
22 ; rdar://13625505
33 ; Here we have 9 fixed integer arguments; the 9th argument is on the stack, and the
44 ; varargs start right after at 8-byte alignment.
5 define void @fn9(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, i32 %a9, ...) nounwind noinline ssp {
5 define void @fn9(i32* %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, i32 %a9, ...) nounwind noinline ssp {
66 ; CHECK-LABEL: fn9:
77 ; 9th fixed argument
88 ; CHECK: ldr {{w[0-9]+}}, [sp, #64]
2929 %a10 = alloca i32, align 4
3030 %a11 = alloca i32, align 4
3131 %a12 = alloca i32, align 4
32 store i32 %a1, i32* %1, align 4
3332 store i32 %a2, i32* %2, align 4
3433 store i32 %a3, i32* %3, align 4
3534 store i32 %a4, i32* %4, align 4
3837 store i32 %a7, i32* %7, align 4
3938 store i32 %a8, i32* %8, align 4
4039 store i32 %a9, i32* %9, align 4
40 store i32 %a9, i32* %a1
4141 %10 = bitcast i8** %args to i8*
4242 call void @llvm.va_start(i8* %10)
4343 %11 = va_arg i8** %args, i32
9292 %10 = load i32, i32* %a10, align 4
9393 %11 = load i32, i32* %a11, align 4
9494 %12 = load i32, i32* %a12, align 4
95 call void (i32, i32, i32, i32, i32, i32, i32, i32, i32, ...) @fn9(i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12)
95 call void (i32*, i32, i32, i32, i32, i32, i32, i32, i32, ...) @fn9(i32* %a1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12)
9696 ret i32 0
9797 }
9898
0 ; RUN: llc -mtriple=armv7-linux < %s | FileCheck %s
1
2 declare arm_aapcscc void @addrof_i32(i32*)
3 declare arm_aapcscc void @addrof_i64(i64*)
4
5 define arm_aapcscc void @simple(i32, i32, i32, i32, i32 %x) {
6 entry:
7 %x.addr = alloca i32
8 store i32 %x, i32* %x.addr
9 call void @addrof_i32(i32* %x.addr)
10 ret void
11 }
12
13 ; CHECK-LABEL: simple:
14 ; CHECK: push {r11, lr}
15 ; CHECK: add r0, sp, #8
16 ; CHECK: bl addrof_i32
17 ; CHECK: pop {r11, pc}
18
19
20 ; We need to load %x before calling addrof_i32 now because it could mutate %x in
21 ; place.
22
23 define arm_aapcscc i32 @use_arg(i32, i32, i32, i32, i32 %x) {
24 entry:
25 %x.addr = alloca i32
26 store i32 %x, i32* %x.addr
27 call void @addrof_i32(i32* %x.addr)
28 ret i32 %x
29 }
30
31 ; CHECK-LABEL: use_arg:
32 ; CHECK: push {[[csr:[^ ]*]], lr}
33 ; CHECK: ldr [[csr]], [sp, #8]
34 ; CHECK: add r0, sp, #8
35 ; CHECK: bl addrof_i32
36 ; CHECK: mov r0, [[csr]]
37 ; CHECK: pop {[[csr]], pc}
38
39
40 define arm_aapcscc i64 @split_i64(i32, i32, i32, i32, i64 %x) {
41 entry:
42 %x.addr = alloca i64, align 4
43 store i64 %x, i64* %x.addr, align 4
44 call void @addrof_i64(i64* %x.addr)
45 ret i64 %x
46 }
47
48 ; CHECK-LABEL: split_i64:
49 ; CHECK: push {r4, r5, r11, lr}
50 ; CHECK: sub sp, sp, #8
51 ; CHECK: ldr r4, [sp, #28]
52 ; CHECK: ldr r5, [sp, #24]
53 ; CHECK: mov r0, sp
54 ; CHECK: str r4, [sp, #4]
55 ; CHECK: str r5, [sp]
56 ; CHECK: bl addrof_i64
57 ; CHECK: mov r0, r5
58 ; CHECK: mov r1, r4
59 ; CHECK: add sp, sp, #8
60 ; CHECK: pop {r4, r5, r11, pc}
235235 ret i32 %tmp
236236
237237 ; CHECK-LABEL: va9:
238 ; CHECK: addiu $sp, $sp, -32
239 ; CHECK: lw $2, 52($sp)
238 ; CHECK: addiu $sp, $sp, -24
239 ; CHECK: lw $2, 44($sp)
240240 }
241241
242242 ; double
77 @.str = internal constant [4 x i8] c"%p\0A\00" ; <[4 x i8]*> [#uses=1]
88 @llvm.used = appending global [1 x i8*] [i8* bitcast (i8* (%struct.S*, i32, %struct.S*)* @_Z4test1SiS_ to i8*)], section "llvm.metadata" ; <[1 x i8*]*> [#uses=0]
99
10 ; Verify that %esi gets spilled before the call.
10 ; Verify that %s1 gets spilled before the call.
1111 ; CHECK: Z4test1SiS
12 ; CHECK: movl %esi,{{.*}}(%ebp)
12 ; CHECK: leal 8(%ebp), %[[reg:[^ ]*]]
13 ; CHECK: movl %[[reg]],{{.*}}(%ebp) ## 4-byte Spill
1314 ; CHECK: calll __Z6throwsv
1415
1516 define i8* @_Z4test1SiS_(%struct.S* byval %s1, i32 %n, %struct.S* byval %s2) ssp personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
0 ; RUN: llc -mtriple=i686-windows < %s | FileCheck %s
1
2 declare void @addrof_i32(i32*)
3 declare void @addrof_i64(i64*)
4 declare void @addrof_i128(i128*)
5 declare void @addrof_i32_x3(i32*, i32*, i32*)
6
7 define void @simple(i32 %x) {
8 entry:
9 %x.addr = alloca i32
10 store i32 %x, i32* %x.addr
11 call void @addrof_i32(i32* %x.addr)
12 ret void
13 }
14
15 ; CHECK-LABEL: _simple:
16 ; CHECK: leal 4(%esp), %[[reg:[^ ]*]]
17 ; CHECK: pushl %[[reg]]
18 ; CHECK: calll _addrof_i32
19 ; CHECK: retl
20
21
22 ; We need to load %x before calling addrof_i32 now because it could mutate %x in
23 ; place.
24
25 define i32 @use_arg(i32 %x) {
26 entry:
27 %x.addr = alloca i32
28 store i32 %x, i32* %x.addr
29 call void @addrof_i32(i32* %x.addr)
30 ret i32 %x
31 }
32
33 ; CHECK-LABEL: _use_arg:
34 ; CHECK: pushl %[[csr:[^ ]*]]
35 ; CHECK-DAG: movl 8(%esp), %[[csr]]
36 ; CHECK-DAG: leal 8(%esp), %[[reg:[^ ]*]]
37 ; CHECK: pushl %[[reg]]
38 ; CHECK: calll _addrof_i32
39 ; CHECK: movl %[[csr]], %eax
40 ; CHECK: popl %[[csr]]
41 ; CHECK: retl
42
43
44 define i64 @split_i64(i64 %x) {
45 entry:
46 %x.addr = alloca i64, align 4
47 store i64 %x, i64* %x.addr, align 4
48 call void @addrof_i64(i64* %x.addr)
49 ret i64 %x
50 }
51
52 ; CHECK-LABEL: _split_i64:
53 ; CHECK: pushl %ebp
54 ; CHECK: movl %esp, %ebp
55 ; CHECK: pushl %[[csr2:[^ ]*]]
56 ; CHECK: pushl %[[csr1:[^ ]*]]
57 ; CHECK: andl $-8, %esp
58 ; CHECK-DAG: movl 8(%ebp), %[[csr1]]
59 ; CHECK-DAG: movl 12(%ebp), %[[csr2]]
60 ; CHECK-DAG: leal 8(%ebp), %[[reg:[^ ]*]]
61 ; CHECK: pushl %[[reg]]
62 ; CHECK: calll _addrof_i64
63 ; CHECK-DAG: movl %[[csr1]], %eax
64 ; CHECK-DAG: movl %[[csr2]], %edx
65 ; CHECK: leal -8(%ebp), %esp
66 ; CHECK: popl %[[csr1]]
67 ; CHECK: popl %[[csr2]]
68 ; CHECK: popl %ebp
69 ; CHECK: retl
70
71
72 ; We can't copy elide when an i64 is split between registers and memory in a
73 ; fastcc function.
74
75 define fastcc i64 @fastcc_split_i64(i64* %p, i64 %x) {
76 entry:
77 %x.addr = alloca i64, align 4
78 store i64 %x, i64* %x.addr, align 4
79 call void @addrof_i64(i64* %x.addr)
80 ret i64 %x
81 }
82
83 ; CHECK-LABEL: _fastcc_split_i64:
84 ; CHECK: pushl %ebp
85 ; CHECK: movl %esp, %ebp
86 ; CHECK-DAG: movl %edx, %[[r1:[^ ]*]]
87 ; CHECK-DAG: movl 8(%ebp), %[[r2:[^ ]*]]
88 ; CHECK-DAG: movl %[[r2]], 4(%esp)
89 ; CHECK-DAG: movl %[[r1]], (%esp)
90 ; CHECK: movl %esp, %[[reg:[^ ]*]]
91 ; CHECK: pushl %[[reg]]
92 ; CHECK: calll _addrof_i64
93 ; CHECK: popl %ebp
94 ; CHECK: retl
95
96
97 ; We can't copy elide when it would reduce the user requested alignment.
98
99 define void @high_alignment(i32 %x) {
100 entry:
101 %x.p = alloca i32, align 128
102 store i32 %x, i32* %x.p
103 call void @addrof_i32(i32* %x.p)
104 ret void
105 }
106
107 ; CHECK-LABEL: _high_alignment:
108 ; CHECK: andl $-128, %esp
109 ; CHECK: movl 8(%ebp), %[[reg:[^ ]*]]
110 ; CHECK: movl %[[reg]], (%esp)
111 ; CHECK: movl %esp, %[[reg:[^ ]*]]
112 ; CHECK: pushl %[[reg]]
113 ; CHECK: calll _addrof_i32
114 ; CHECK: retl
115
116
117 ; We can't copy elide when it would reduce the ABI required alignment.
118 ; FIXME: We should lower the ABI alignment of i64 on Windows, since MSVC
119 ; doesn't guarantee it.
120
121 define void @abi_alignment(i64 %x) {
122 entry:
123 %x.p = alloca i64
124 store i64 %x, i64* %x.p
125 call void @addrof_i64(i64* %x.p)
126 ret void
127 }
128
129 ; CHECK-LABEL: _abi_alignment:
130 ; CHECK: andl $-8, %esp
131 ; CHECK: movl 8(%ebp), %[[reg:[^ ]*]]
132 ; CHECK: movl %[[reg]], (%esp)
133 ; CHECK: movl %esp, %[[reg:[^ ]*]]
134 ; CHECK: pushl %[[reg]]
135 ; CHECK: calll _addrof_i64
136 ; CHECK: retl
137
138
139 ; The code we generate for this is unimportant. This is mostly a crash test.
140
141 define void @split_i128(i128* %sret, i128 %x) {
142 entry:
143 %x.addr = alloca i128
144 store i128 %x, i128* %x.addr
145 call void @addrof_i128(i128* %x.addr)
146 store i128 %x, i128* %sret
147 ret void
148 }
149
150 ; CHECK-LABEL: _split_i128:
151 ; CHECK: pushl %ebp
152 ; CHECK: calll _addrof_i128
153 ; CHECK: retl
154
155
156 ; Check that we load all of x, y, and z before the call.
157
158 define i32 @three_args(i32 %x, i32 %y, i32 %z) {
159 entry:
160 %z.addr = alloca i32, align 4
161 %y.addr = alloca i32, align 4
162 %x.addr = alloca i32, align 4
163 store i32 %z, i32* %z.addr, align 4
164 store i32 %y, i32* %y.addr, align 4
165 store i32 %x, i32* %x.addr, align 4
166 call void @addrof_i32_x3(i32* %x.addr, i32* %y.addr, i32* %z.addr)
167 %s1 = add i32 %x, %y
168 %sum = add i32 %s1, %z
169 ret i32 %sum
170 }
171
172 ; CHECK-LABEL: _three_args:
173 ; CHECK: pushl %[[csr:[^ ]*]]
174 ; CHECK-DAG: movl {{[0-9]+}}(%esp), %[[csr]]
175 ; CHECK-DAG: addl {{[0-9]+}}(%esp), %[[csr]]
176 ; CHECK-DAG: addl {{[0-9]+}}(%esp), %[[csr]]
177 ; CHECK-DAG: leal 8(%esp), %[[x:[^ ]*]]
178 ; CHECK-DAG: leal 12(%esp), %[[y:[^ ]*]]
179 ; CHECK-DAG: leal 16(%esp), %[[z:[^ ]*]]
180 ; CHECK: pushl %[[z]]
181 ; CHECK: pushl %[[y]]
182 ; CHECK: pushl %[[x]]
183 ; CHECK: calll _addrof_i32_x3
184 ; CHECK: movl %[[csr]], %eax
185 ; CHECK: popl %[[csr]]
186 ; CHECK: retl
187
188
189 define void @two_args_same_alloca(i32 %x, i32 %y) {
190 entry:
191 %x.addr = alloca i32
192 store i32 %x, i32* %x.addr
193 store i32 %y, i32* %x.addr
194 call void @addrof_i32(i32* %x.addr)
195 ret void
196 }
197
198 ; CHECK-LABEL: _two_args_same_alloca:
199 ; CHECK: movl 8(%esp), {{.*}}
200 ; CHECK: movl {{.*}}, 4(%esp)
201 ; CHECK: leal 4(%esp), %[[reg:[^ ]*]]
202 ; CHECK: pushl %[[reg]]
203 ; CHECK: calll _addrof_i32
204 ; CHECK: retl
205
206
207 define void @avoid_byval(i32* byval %x) {
208 entry:
209 %x.p.p = alloca i32*
210 store i32* %x, i32** %x.p.p
211 call void @addrof_i32(i32* %x)
212 ret void
213 }
214
215 ; CHECK-LABEL: _avoid_byval:
216 ; CHECK: leal {{[0-9]+}}(%esp), %[[reg:[^ ]*]]
217 ; CHECK: pushl %[[reg]]
218 ; CHECK: calll _addrof_i32
219 ; CHECK: retl
220
221
222 define void @avoid_inalloca(i32* inalloca %x) {
223 entry:
224 %x.p.p = alloca i32*
225 store i32* %x, i32** %x.p.p
226 call void @addrof_i32(i32* %x)
227 ret void
228 }
229
230 ; CHECK-LABEL: _avoid_inalloca:
231 ; CHECK: leal {{[0-9]+}}(%esp), %[[reg:[^ ]*]]
232 ; CHECK: pushl %[[reg]]
233 ; CHECK: calll _addrof_i32
234 ; CHECK: retl
235
236
237 ; Don't elide the copy when the alloca is escaped with a store.
238
239 define void @escape_with_store(i32 %x) {
240 %x1 = alloca i32
241 %x2 = alloca i32*
242 store i32* %x1, i32** %x2
243 %x3 = load i32*, i32** %x2
244 store i32 0, i32* %x3
245 store i32 %x, i32* %x1
246 call void @addrof_i32(i32* %x1)
247 ret void
248 }
249
250 ; CHECK-LABEL: _escape_with_store:
251 ; CHECK-DAG: movl {{.*}}(%esp), %[[reg:[^ ]*]]
252 ; CHECK-DAG: movl $0, [[offs:[0-9]*]](%esp)
253 ; CHECK: movl %[[reg]], [[offs]](%esp)
254 ; CHECK: calll _addrof_i32
255
256
257 ; This test case exposed issues with the use of TokenFactor.
258
259 define void @sret_and_elide(i32* sret %sret, i32 %v) {
260 %v.p = alloca i32
261 store i32 %v, i32* %v.p
262 call void @addrof_i32(i32* %v.p)
263 store i32 %v, i32* %sret
264 ret void
265 }
266
267 ; CHECK-LABEL: _sret_and_elide:
268 ; CHECK: pushl
269 ; CHECK: pushl
270 ; CHECK: movl 12(%esp), %[[sret:[^ ]*]]
271 ; CHECK: movl 16(%esp), %[[v:[^ ]*]]
272 ; CHECK: leal 16(%esp), %[[reg:[^ ]*]]
273 ; CHECK: pushl %[[reg]]
274 ; CHECK: calll _addrof_i32
275 ; CHECK: movl %[[v]], (%[[sret]])
276 ; CHECK: movl %[[sret]], %eax
277 ; CHECK: popl
278 ; CHECK: popl
279 ; CHECK: retl
0 ; RUN: llc < %s -mtriple=i386-apple-darwin9 -O0 -optimize-regalloc -regalloc=basic -no-integrated-as | FileCheck %s
11 ; rdar://6992609
22
3 ; CHECK: movl %ecx, 4([[ESP:%e..]])
4 ; CHECK: movl 4([[ESP]]), [[EDX:%e..]]
5 ; CHECK: movl [[EDX]], 4([[ESP]])
63 target triple = "i386-apple-darwin9.0"
7 @llvm.used = appending global [1 x i8*] [i8* bitcast (i64 (i64)* @_OSSwapInt64 to i8*)], section "llvm.metadata" ; <[1 x i8*]*> [#uses=0]
84
95 define i64 @_OSSwapInt64(i64 %_data) nounwind {
106 entry:
11 %retval = alloca i64 ; [#uses=2]
12 %_data.addr = alloca i64 ; [#uses=4]
13 store i64 %_data, i64* %_data.addr
14 %tmp = load i64, i64* %_data.addr ; [#uses=1]
15 %0 = call i64 asm "bswap %eax\0A\09bswap %edx\0A\09xchgl %eax, %edx", "=A,0,~{dirflag},~{fpsr},~{flags}"(i64 %tmp) nounwind ; [#uses=1]
16 store i64 %0, i64* %_data.addr
17 %tmp1 = load i64, i64* %_data.addr ; [#uses=1]
18 store i64 %tmp1, i64* %retval
19 %1 = load i64, i64* %retval ; [#uses=1]
20 ret i64 %1
7 %0 = call i64 asm "bswap %eax\0A\09bswap %edx\0A\09xchgl %eax, %%edx", "=A,0,~{dirflag},~{fpsr},~{flags}"(i64 %_data) nounwind
8 ret i64 %0
219 }
10
11 ; CHECK-LABEL: __OSSwapInt64:
12 ; CHECK-DAG: movl 8(%esp), %edx
13 ; CHECK-DAG: movl 4(%esp), %eax
14 ; CHECK: ## InlineAsm Start
15 ; CHECK: ## InlineAsm End
16 ; Everything is set up in EAX:EDX, return immediately.
17 ; CHECK-NEXT: retl
2218
2319 ; The tied operands are not necessarily in the same order as the defs.
2420 ; PR13742
2521 define i64 @swapped(i64 %x, i64 %y) nounwind {
2622 entry:
27 %x0 = call { i64, i64 } asm "foo", "=r,=r,1,0,~{dirflag},~{fpsr},~{flags}"(i64 %x, i64 %y) nounwind
28 %x1 = extractvalue { i64, i64 } %x0, 0
29 ret i64 %x1
23 %x0 = call { i64, i64 } asm "foo", "=r,=r,1,0,~{dirflag},~{fpsr},~{flags}"(i64 %x, i64 %y) nounwind
24 %x1 = extractvalue { i64, i64 } %x0, 0
25 ret i64 %x1
3026 }
2929 ; CHECK-NEXT: vmovss %xmm5, {{[0-9]+}}(%rsp)
3030 ; CHECK-NEXT: vmovss %xmm6, {{[0-9]+}}(%rsp)
3131 ; CHECK-NEXT: vmovss %xmm7, {{[0-9]+}}(%rsp)
32 ; CHECK-NEXT: vmovss %xmm15, {{[0-9]+}}(%rsp)
33 ; CHECK-NEXT: vmovss %xmm14, {{[0-9]+}}(%rsp)
34 ; CHECK-NEXT: vmovss %xmm13, {{[0-9]+}}(%rsp)
35 ; CHECK-NEXT: vmovss %xmm12, {{[0-9]+}}(%rsp)
36 ; CHECK-NEXT: vmovss %xmm11, {{[0-9]+}}(%rsp)
37 ; CHECK-NEXT: vmovss %xmm10, {{[0-9]+}}(%rsp)
38 ; CHECK-NEXT: vmovss %xmm9, {{[0-9]+}}(%rsp)
39 ; CHECK-NEXT: vmovss %xmm8, (%rsp)
4032 ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
4133 ; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
4234 ; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
4537 ; CHECK-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero
4638 ; CHECK-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero
4739 ; CHECK-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero
48 ; CHECK-NEXT: vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero
49 ; CHECK-NEXT: vmovss {{.*#+}} xmm9 = mem[0],zero,zero,zero
50 ; CHECK-NEXT: vmovss {{.*#+}} xmm10 = mem[0],zero,zero,zero
51 ; CHECK-NEXT: vmovss {{.*#+}} xmm11 = mem[0],zero,zero,zero
52 ; CHECK-NEXT: vmovss {{.*#+}} xmm12 = mem[0],zero,zero,zero
53 ; CHECK-NEXT: vmovss {{.*#+}} xmm13 = mem[0],zero,zero,zero
54 ; CHECK-NEXT: vmovss {{.*#+}} xmm14 = mem[0],zero,zero,zero
55 ; CHECK-NEXT: vmovss {{.*#+}} xmm15 = mem[0],zero,zero,zero
40 ; CHECK-NEXT: vmovss {{.*#+}} xmm16 = mem[0],zero,zero,zero
41 ; CHECK-NEXT: vmovss {{.*#+}} xmm17 = mem[0],zero,zero,zero
42 ; CHECK-NEXT: vmovss {{.*#+}} xmm18 = mem[0],zero,zero,zero
43 ; CHECK-NEXT: vmovss {{.*#+}} xmm19 = mem[0],zero,zero,zero
44 ; CHECK-NEXT: vmovss {{.*#+}} xmm20 = mem[0],zero,zero,zero
45 ; CHECK-NEXT: vmovss {{.*#+}} xmm21 = mem[0],zero,zero,zero
46 ; CHECK-NEXT: vmovss {{.*#+}} xmm22 = mem[0],zero,zero,zero
47 ; CHECK-NEXT: vmovss {{.*#+}} xmm23 = mem[0],zero,zero,zero
5648 ; CHECK-NEXT: vmovss %xmm0, {{[0-9]+}}(%rsp)
5749 ; CHECK-NEXT: vmovss %xmm1, {{[0-9]+}}(%rsp)
5850 ; CHECK-NEXT: vmovss %xmm2, {{[0-9]+}}(%rsp)
6153 ; CHECK-NEXT: vmovss %xmm5, {{[0-9]+}}(%rsp)
6254 ; CHECK-NEXT: vmovss %xmm6, {{[0-9]+}}(%rsp)
6355 ; CHECK-NEXT: vmovss %xmm7, {{[0-9]+}}(%rsp)
64 ; CHECK-NEXT: vmovss %xmm8, {{[0-9]+}}(%rsp)
65 ; CHECK-NEXT: vmovss %xmm9, {{[0-9]+}}(%rsp)
66 ; CHECK-NEXT: vmovss %xmm10, {{[0-9]+}}(%rsp)
67 ; CHECK-NEXT: vmovss %xmm11, {{[0-9]+}}(%rsp)
68 ; CHECK-NEXT: vmovss %xmm12, {{[0-9]+}}(%rsp)
69 ; CHECK-NEXT: vmovss %xmm13, {{[0-9]+}}(%rsp)
70 ; CHECK-NEXT: vmovss %xmm14, {{[0-9]+}}(%rsp)
71 ; CHECK-NEXT: vmovss %xmm15, {{[0-9]+}}(%rsp)
56 ; CHECK-NEXT: vmovss %xmm16, {{[0-9]+}}(%rsp)
57 ; CHECK-NEXT: vmovss %xmm17, {{[0-9]+}}(%rsp)
58 ; CHECK-NEXT: vmovss %xmm18, {{[0-9]+}}(%rsp)
59 ; CHECK-NEXT: vmovss %xmm19, {{[0-9]+}}(%rsp)
60 ; CHECK-NEXT: vmovss %xmm20, {{[0-9]+}}(%rsp)
61 ; CHECK-NEXT: vmovss %xmm21, {{[0-9]+}}(%rsp)
62 ; CHECK-NEXT: vmovss %xmm22, {{[0-9]+}}(%rsp)
63 ; CHECK-NEXT: vmovss %xmm23, {{[0-9]+}}(%rsp)
7264 ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
7365 ; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
7466 ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
10395 ; CHECK-NEXT: # implicit-def: %YMM3
10496 ; CHECK-NEXT: vmovaps %xmm1, %xmm3
10597 ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm3
106 ; CHECK-NEXT: # implicit-def: %ZMM16
107 ; CHECK-NEXT: vmovaps %zmm3, %zmm16
108 ; CHECK-NEXT: vinsertf64x4 $1, %ymm2, %zmm16, %zmm16
109 ; CHECK-NEXT: vmovaps %zmm16, {{[0-9]+}}(%rsp)
98 ; CHECK-NEXT: # implicit-def: %ZMM24
99 ; CHECK-NEXT: vmovaps %zmm3, %zmm24
100 ; CHECK-NEXT: vinsertf64x4 $1, %ymm2, %zmm24, %zmm24
101 ; CHECK-NEXT: vmovaps %zmm24, {{[0-9]+}}(%rsp)
110102 ; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm0
103 ; CHECK-NEXT: vmovss %xmm15, {{[0-9]+}}(%rsp) # 4-byte Spill
104 ; CHECK-NEXT: vmovss %xmm8, {{[0-9]+}}(%rsp) # 4-byte Spill
105 ; CHECK-NEXT: vmovss %xmm9, {{[0-9]+}}(%rsp) # 4-byte Spill
106 ; CHECK-NEXT: vmovss %xmm10, {{[0-9]+}}(%rsp) # 4-byte Spill
107 ; CHECK-NEXT: vmovss %xmm11, {{[0-9]+}}(%rsp) # 4-byte Spill
108 ; CHECK-NEXT: vmovss %xmm12, {{[0-9]+}}(%rsp) # 4-byte Spill
109 ; CHECK-NEXT: vmovss %xmm13, {{[0-9]+}}(%rsp) # 4-byte Spill
110 ; CHECK-NEXT: vmovss %xmm14, (%rsp) # 4-byte Spill
111111 ; CHECK-NEXT: movq %rbp, %rsp
112112 ; CHECK-NEXT: popq %rbp
113113 ; CHECK-NEXT: retq
16521652 define void @test_mm_setcsr(i32 %a0) nounwind {
16531653 ; X32-LABEL: test_mm_setcsr:
16541654 ; X32: # BB#0:
1655 ; X32-NEXT: pushl %eax
1656 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
1657 ; X32-NEXT: movl %esp, %ecx
1658 ; X32-NEXT: movl %eax, (%esp)
1659 ; X32-NEXT: ldmxcsr (%ecx)
1660 ; X32-NEXT: popl %eax
1655 ; X32-NEXT: leal 4(%esp), %eax
1656 ; X32-NEXT: ldmxcsr (%eax)
16611657 ; X32-NEXT: retl
16621658 ;
16631659 ; X64-LABEL: test_mm_setcsr:
5858
5959 ; CHECK: Address Line Column File ISA Discriminator Flags
6060 ; CHECK: ------------------ ------ ------ ------ --- ------------- -------------
61 ; CHECK: 0x0000000000000011 2 0 1 0 42 {{$}}
61 ; CHECK: 0x000000000000000a 2 0 1 0 42 {{$}}