llvm.org GIT mirror llvm / 3bbf394
In visitSTORE, always use FindBetterChain, rather than only when UseAA is enabled.

Recommitting with compile-time improvements. Recommitting after fixing a 32-bit aliasing sign-offset bug in DAGCombiner.

* Simplify Consecutive Merge Store Candidate Search

Now that address aliasing is much less conservative, push through a simplified store-merging search and chain alias analysis which only checks for parallel stores through the chain subgraph. This is cleaner, as it separates non-interfering loads/stores from the store-merging logic.

When merging stores, search up the chain through a single load, and find all possible stores by looking down through a load and a TokenFactor to all stores visited. This improves the quality of the output SelectionDAG and of the output codegen (save perhaps for some ARM cases where we correctly construct wider loads but then promote them to float operations, which requires more expensive constant generation). Some minor peephole optimizations deal with the improved SubDAG shapes (listed below).

Additional Minor Changes:

1. Finishes removing unused AliasLoad code.
2. Unifies the chain aggregation in the merged stores across code paths.
3. Re-adds the Store node to the worklist after calling SimplifyDemandedBits.
4. Increases GatherAllAliasesMaxDepth from 6 to 18. That number is arbitrary, but seems sufficient to not cause regressions in tests.
5. Removes chain dependencies of memory operations on CopyFromReg nodes, as these are captured by data dependence.
6. Forwards load/store values through TokenFactors containing {CopyToReg,CopyFromReg} values.
7. Adds a peephole to convert a buildvector of extract_vector_elt to extract_subvector if possible (see CodeGen/AArch64/store-merge.ll).
8. Restricts store merging for the ARM target to 32-bit, as in some contexts invalid 64-bit operations are generated. This can be removed once appropriate checks are added.

This finishes the change Matt Arsenault started in r246307 and jyknight's original patch.

Many tests required some changes, as memory operations are now reorderable, improving load-store forwarding. One test in particular is worth noting:

CodeGen/PowerPC/ppc64-align-long-double.ll - Improved load-store forwarding converts a load-store pair into a parallel store and a memory-realized bitcast of the same value. However, because we lose the sharing of the explicit and implicit store values, we must create another local store. A similar transformation happens before SelectionDAG as well.

Reviewers: arsenm, hfinkel, tstellarAMD, jyknight, nhaehnle

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@297695 91177308-0d34-0410-b5e6-96231b3b80d8

Nirav Dave, 3 years ago
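The heart of the new candidate search can be summarized in a short sketch (simplified from the getStoreMergeCandidates code in the diff below; the volatile/indexed, value-kind, and type checks of the real patch are elided here, so this is an illustration rather than the committed code):

    static void getStoreMergeCandidatesSketch(
        StoreSDNode *St, SelectionDAG &DAG,
        SmallVectorImpl<MemOpLink> &StoreNodes) {
      BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr(), DAG);
      SDNode *Root = St->getChain().getNode();
      SmallPtrSet<SDNode *, 8> Parents;
      // Climb up through a single load so that stores fed by parallel loads
      // of the same region become visible...
      if (auto *Ldn = dyn_cast<LoadSDNode>(Root)) {
        Root = Ldn->getChain().getNode();
        // ...then descend through the sibling loads hanging off that root.
        for (auto I = Root->use_begin(), E = Root->use_end(); I != E; ++I)
          if (I.getOperandNo() == 0 && isa<LoadSDNode>(*I))
            Parents.insert(*I);
      } else
        Parents.insert(Root);
      // Any store chained to one of these parents and sharing our base
      // pointer is a merge candidate.
      for (SDNode *P : Parents)
        for (auto I = P->use_begin(), E = P->use_end(); I != E; ++I)
          if (I.getOperandNo() == 0)
            if (auto *OtherST = dyn_cast<StoreSDNode>(*I)) {
              BaseIndexOffset Ptr =
                  BaseIndexOffset::match(OtherST->getBasePtr(), DAG);
              if (Ptr.equalBaseIndex(BasePtr))
                StoreNodes.push_back(MemOpLink(OtherST, Ptr.Offset));
            }
    }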
75 changed file(s) with 4173 addition(s) and 4184 deletion(s).
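As a concrete illustration of item (7) above, the buildvector peephole amounts to roughly the following (a hedged sketch; the committed version lives in visitBUILD_VECTOR further down in this diff and also handles the Offset == 0 same-type case, where the source vector is returned directly):

    // If every BUILD_VECTOR operand extracts consecutive elements from the
    // same source vector, starting at an index that is a multiple of the
    // result width, the node is just an EXTRACT_SUBVECTOR of that source.
    static SDValue buildVecToExtractSubvectorSketch(SDNode *N,
                                                    SelectionDAG &DAG) {
      SDValue Op0 = N->getOperand(0);
      if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
        return SDValue();
      SDValue Src = Op0.getOperand(0);
      auto *First = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
      if (!First)
        return SDValue();
      uint64_t Base = First->getZExtValue();
      for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i) {
        SDValue Op = N->getOperand(i);
        auto *C = Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT
                      ? dyn_cast<ConstantSDNode>(Op.getOperand(1))
                      : nullptr;
        if (!C || Op.getOperand(0) != Src || C->getZExtValue() != Base + i)
          return SDValue();
      }
      // The start index must be a multiple of the output vector size.
      if (Base % N->getValueType(0).getVectorNumElements() != 0)
        return SDValue();
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N),
                         N->getValueType(0), Src, Op0.getOperand(1));
    }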
361361 unsigned AddrSpace) const {
362362 return false;
363363 }
364
365 /// Returns true if it's reasonable to merge stores to MemVT size.
366 virtual bool canMergeStoresTo(EVT MemVT) const { return true; }
364367
365368 /// \brief Return true if it is cheap to speculate a call to intrinsic cttz.
366369 virtual bool isCheapToSpeculateCttz() const {
5252
5353 namespace {
5454 static cl::opt<bool>
55 CombinerAA("combiner-alias-analysis", cl::Hidden,
56 cl::desc("Enable DAG combiner alias-analysis heuristics"));
57
58 static cl::opt<bool>
5955 CombinerGlobalAA("combiner-global-alias-analysis", cl::Hidden,
6056 cl::desc("Enable DAG combiner's use of IR alias analysis"));
6157
132128 /// Add to the worklist making sure its instance is at the back (next to be
133129 /// processed).
134130 void AddToWorklist(SDNode *N) {
131 assert(N->getOpcode() != ISD::DELETED_NODE &&
132 "Deleted Node added to Worklist");
133
135134 // Skip handle nodes as they can't usefully be combined and confuse the
136135 // zero-use deletion strategy.
137136 if (N->getOpcode() == ISD::HANDLENODE)
176175 void CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO);
177176
178177 private:
178 unsigned MaximumLegalStoreInBits;
179179
180180 /// Check the specified integer node value to see if it can be simplified or
181181 /// if things it uses can be simplified by bit propagation.
421421 /// Holds a pointer to an LSBaseSDNode as well as information on where it
422422 /// is located in a sequence of memory operations connected by a chain.
423423 struct MemOpLink {
424 MemOpLink (LSBaseSDNode *N, int64_t Offset, unsigned Seq):
425 MemNode(N), OffsetFromBase(Offset), SequenceNum(Seq) { }
424 MemOpLink(LSBaseSDNode *N, int64_t Offset)
425 : MemNode(N), OffsetFromBase(Offset) {}
426426 // Ptr to the mem node.
427427 LSBaseSDNode *MemNode;
428428 // Offset from the base ptr.
429429 int64_t OffsetFromBase;
430 // What is the sequence number of this mem node.
431 // Lowest mem operand in the DAG starts at zero.
432 unsigned SequenceNum;
433430 };
434431
435432 /// This is a helper function for visitMUL to check the profitability
440437 SDValue &AddNode,
441438 SDValue &ConstNode);
442439
443 /// This is a helper function for MergeStoresOfConstantsOrVecElts. Returns a
444 /// constant build_vector of the stored constant values in Stores.
445 SDValue getMergedConstantVectorStore(SelectionDAG &DAG, const SDLoc &SL,
446 ArrayRef<MemOpLink> Stores,
447 SmallVectorImpl<SDValue> &Chains,
448 EVT Ty) const;
449440
450441 /// This is a helper function for visitAND and visitZERO_EXTEND. Returns
451442 /// true if the (and (load x) c) pattern matches an extload. ExtVT returns
459450 /// This is a helper function for MergeConsecutiveStores. When the source
460451 /// elements of the consecutive stores are all constants or all extracted
461452 /// vector elements, try to merge them into one larger store.
462 /// \return number of stores that were merged into a merged store (always
463 /// a prefix of \p StoreNode).
464 bool MergeStoresOfConstantsOrVecElts(
465 SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT, unsigned NumStores,
466 bool IsConstantSrc, bool UseVector);
453 /// \return True if a merged store was created.
454 bool MergeStoresOfConstantsOrVecElts(SmallVectorImpl<MemOpLink> &StoreNodes,
455 EVT MemVT, unsigned NumStores,
456 bool IsConstantSrc, bool UseVector);
467457
468458 /// This is a helper function for MergeConsecutiveStores.
469459 /// Stores that may be merged are placed in StoreNodes.
470 /// Loads that may alias with those stores are placed in AliasLoadNodes.
471 void getStoreMergeAndAliasCandidates(
472 StoreSDNode* St, SmallVectorImpl<MemOpLink> &StoreNodes,
473 SmallVectorImpl<LoadSDNode *> &AliasLoadNodes);
460 void getStoreMergeCandidates(StoreSDNode *St,
461 SmallVectorImpl<MemOpLink> &StoreNodes);
474462
475463 /// Helper function for MergeConsecutiveStores. Checks if
476464 /// Candidate stores have indirect dependency through their
482470 /// This optimization uses wide integers or vectors when possible.
483471 /// \return number of stores that were merged into a merged store (the
484472 /// affected nodes are stored as a prefix in \p StoreNodes).
485 bool MergeConsecutiveStores(StoreSDNode *N,
486 SmallVectorImpl<MemOpLink> &StoreNodes);
473 bool MergeConsecutiveStores(StoreSDNode *N);
487474
488475 /// \brief Try to transform a truncation where C is a constant:
489476 /// (trunc (and X, C)) -> (and (trunc X), (trunc C))
498485 : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
499486 OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) {
500487 ForCodeSize = DAG.getMachineFunction().getFunction()->optForSize();
488
489 MaximumLegalStoreInBits = 0;
490 for (MVT VT : MVT::all_valuetypes())
491 if (EVT(VT).isSimple() && VT != MVT::Other &&
492 TLI.isTypeLegal(EVT(VT)) &&
493 VT.getSizeInBits() >= MaximumLegalStoreInBits)
494 MaximumLegalStoreInBits = VT.getSizeInBits();
501495 }
502496
503497 /// Runs the dag combiner on all nodes in the work list
15881582 }
15891583
15901584 SmallVector<SDNode *, 8> TFs; // List of token factors to visit.
1591 SmallVector<SDValue, 8> Ops; // Ops for replacing token factor.
1585 SmallVector<SDValue, 8> Ops; // Ops for replacing token factor.
15921586 SmallPtrSet<SDNode *, 16> SeenOps;
15931587 bool Changed = false; // If we should replace this token factor.
15941588
16321626 }
16331627 }
16341628
1629 // Remove Nodes that are chained to another node in the list. Do so
1630 // by walking up chains breadth-first, stopping when we've seen
1631 // another operand. In general we must climb to the EntryNode, but we can exit
1632 // early if we find all remaining work is associated with just one operand as
1633 // no further pruning is possible.
1634
1635 // List of nodes to search through and original Ops from which they originate.
1636 SmallVector<std::pair<SDNode *, unsigned>, 8> Worklist;
1637 SmallVector<unsigned, 8> OpWorkCount; // Count of work for each Op.
1638 SmallPtrSet<SDNode *, 16> SeenChains;
1639 bool DidPruneOps = false;
1640
1641 unsigned NumLeftToConsider = 0;
1642 for (const SDValue &Op : Ops) {
1643 Worklist.push_back(std::make_pair(Op.getNode(), NumLeftToConsider++));
1644 OpWorkCount.push_back(1);
1645 }
1646
1647 auto AddToWorklist = [&](unsigned CurIdx, SDNode *Op, unsigned OpNumber) {
1648 // If this is an Op, we can remove the op from the list. Re-mark any
1649 // search associated with it as from the current OpNumber.
1650 if (SeenOps.count(Op) != 0) {
1651 Changed = true;
1652 DidPruneOps = true;
1653 unsigned OrigOpNumber = 0;
1654 while (OrigOpNumber < Ops.size() && Ops[OrigOpNumber].getNode() != Op)
1655 OrigOpNumber++;
1656 assert((OrigOpNumber != Ops.size()) &&
1657 "expected to find TokenFactor Operand");
1658 // Re-mark worklist from OrigOpNumber to OpNumber
1659 for (unsigned i = CurIdx + 1; i < Worklist.size(); ++i) {
1660 if (Worklist[i].second == OrigOpNumber) {
1661 Worklist[i].second = OpNumber;
1662 }
1663 }
1664 OpWorkCount[OpNumber] += OpWorkCount[OrigOpNumber];
1665 OpWorkCount[OrigOpNumber] = 0;
1666 NumLeftToConsider--;
1667 }
1668 // Add if it's a new chain
1669 if (SeenChains.insert(Op).second) {
1670 OpWorkCount[OpNumber]++;
1671 Worklist.push_back(std::make_pair(Op, OpNumber));
1672 }
1673 };
1674
1675 for (unsigned i = 0; i < Worklist.size() && i < 1024; ++i) {
1676 // We need to consider at least 2 Ops to prune.
1677 if (NumLeftToConsider <= 1)
1678 break;
1679 auto CurNode = Worklist[i].first;
1680 auto CurOpNumber = Worklist[i].second;
1681 assert((OpWorkCount[CurOpNumber] > 0) &&
1682 "Node should not appear in worklist");
1683 switch (CurNode->getOpcode()) {
1684 case ISD::EntryToken:
1685 // Hitting EntryToken is the only way for the search to terminate without
1686 // hitting another operand's search. Prevent us from marking this operand
1687 // considered.
1689 NumLeftToConsider++;
1690 break;
1691 case ISD::TokenFactor:
1692 for (const SDValue &Op : CurNode->op_values())
1693 AddToWorklist(i, Op.getNode(), CurOpNumber);
1694 break;
1695 case ISD::CopyFromReg:
1696 case ISD::CopyToReg:
1697 AddToWorklist(i, CurNode->getOperand(0).getNode(), CurOpNumber);
1698 break;
1699 default:
1700 if (auto *MemNode = dyn_cast<MemSDNode>(CurNode))
1701 AddToWorklist(i, MemNode->getChain().getNode(), CurOpNumber);
1702 break;
1703 }
1704 OpWorkCount[CurOpNumber]--;
1705 if (OpWorkCount[CurOpNumber] == 0)
1706 NumLeftToConsider--;
1707 }
1708
16351709 SDValue Result;
16361710
16371711 // If we've changed things around then replace token factor.
16401714 // The entry token is the only possible outcome.
16411715 Result = DAG.getEntryNode();
16421716 } else {
1643 // New and improved token factor.
1644 Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Ops);
1645 }
1646
1647 // Add users to worklist if AA is enabled, since it may introduce
1648 // a lot of new chained token factors while removing memory deps.
1649 bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
1650 : DAG.getSubtarget().useAA();
1651 return CombineTo(N, Result, UseAA /*add to worklist*/);
1717 if (DidPruneOps) {
1718 SmallVector<SDValue, 8> PrunedOps;
1719 //
1720 for (const SDValue &Op : Ops) {
1721 if (SeenChains.count(Op.getNode()) == 0)
1722 PrunedOps.push_back(Op);
1723 }
1724 Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, PrunedOps);
1725 } else {
1726 Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Ops);
1727 }
1728 }
1729
1730 // Add users to worklist, since we may introduce a lot of new
1731 // chained token factors while removing memory deps.
1732 return CombineTo(N, Result, true /*add to worklist*/);
16521733 }
16531734
16541735 return Result;
67916872 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
67926873 SDValue NewValue = DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Loads);
67936874
6875 // Simplify TF.
6876 AddToWorklist(NewChain.getNode());
6877
67946878 CombineTo(N, NewValue);
67956879
67966880 // Replace uses of the original load (before extension)
1094611030 dbgs() << "\n");
1094711031 WorklistRemover DeadNodes(*this);
1094811032 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
10949
11033 AddUsersToWorklist(Chain.getNode());
1095011034 if (N->use_empty())
1095111035 deleteAndRecombine(N);
1095211036
1099911083 StoreSDNode *PrevST = cast<StoreSDNode>(Chain);
1100011084 if (PrevST->getBasePtr() == Ptr &&
1100111085 PrevST->getValue().getValueType() == N->getValueType(0))
11002 return CombineTo(N, Chain.getOperand(1), Chain);
11086 return CombineTo(N, PrevST->getOperand(1), Chain);
1100311087 }
1100411088 }
1100511089
1101711101 }
1101811102 }
1101911103
11020 bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
11021 : DAG.getSubtarget().useAA();
11022 #ifndef NDEBUG
11023 if (CombinerAAOnlyFunc.getNumOccurrences() &&
11024 CombinerAAOnlyFunc != DAG.getMachineFunction().getName())
11025 UseAA = false;
11026 #endif
11027 if (UseAA && LD->isUnindexed()) {
11104 if (LD->isUnindexed()) {
1102811105 // Walk up chain skipping non-aliasing memory nodes.
1102911106 SDValue BetterChain = FindBetterChain(N, Chain);
1103011107
1160611683 SDValue Chain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other,
1160711684 ArgChains);
1160811685 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
11686 AddToWorklist(Chain.getNode());
1160911687 return true;
1161011688 }
1161111689
1199912077 return false;
1200012078 }
1200112079
12002 SDValue DAGCombiner::getMergedConstantVectorStore(
12003 SelectionDAG &DAG, const SDLoc &SL, ArrayRef<MemOpLink> Stores,
12004 SmallVectorImpl<SDValue> &Chains, EVT Ty) const {
12005 SmallVector<SDValue, 8> BuildVector;
12006
12007 for (unsigned I = 0, E = Ty.getVectorNumElements(); I != E; ++I) {
12008 StoreSDNode *St = cast<StoreSDNode>(Stores[I].MemNode);
12009 Chains.push_back(St->getChain());
12010 BuildVector.push_back(St->getValue());
12011 }
12012
12013 return DAG.getBuildVector(Ty, SL, BuildVector);
12014 }
12015
1201612080 bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
1201712081 SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT,
1201812082 unsigned NumStores, bool IsConstantSrc, bool UseVector) {
1202112085 return false;
1202212086
1202312087 int64_t ElementSizeBytes = MemVT.getSizeInBits() / 8;
12024 LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
12025 unsigned LatestNodeUsed = 0;
12026
12027 for (unsigned i=0; i < NumStores; ++i) {
12028 // Find a chain for the new wide-store operand. Notice that some
12029 // of the store nodes that we found may not be selected for inclusion
12030 // in the wide store. The chain we use needs to be the chain of the
12031 // latest store node which is *used* and replaced by the wide store.
12032 if (StoreNodes[i].SequenceNum < StoreNodes[LatestNodeUsed].SequenceNum)
12033 LatestNodeUsed = i;
12034 }
12035
12036 SmallVector<SDValue, 8> Chains;
1203712088
1203812089 // The latest Node in the DAG.
12039 LSBaseSDNode *LatestOp = StoreNodes[LatestNodeUsed].MemNode;
1204012090 SDLoc DL(StoreNodes[0].MemNode);
1204112091
1204212092 SDValue StoredVal;
1205212102 assert(TLI.isTypeLegal(Ty) && "Illegal vector store");
1205312103
1205412104 if (IsConstantSrc) {
12055 StoredVal = getMergedConstantVectorStore(DAG, DL, StoreNodes, Chains, Ty);
12105 SmallVector<SDValue, 8> BuildVector;
12106 for (unsigned I = 0, E = Ty.getVectorNumElements(); I != E; ++I) {
12107 StoreSDNode *St = cast<StoreSDNode>(StoreNodes[I].MemNode);
12108 SDValue Val = St->getValue();
12109 if (MemVT.getScalarType().isInteger())
12110 if (auto *CFP = dyn_cast<ConstantFPSDNode>(St->getValue()))
12111 Val = DAG.getConstant(
12112 (uint32_t)CFP->getValueAPF().bitcastToAPInt().getZExtValue(),
12113 SDLoc(CFP), MemVT);
12114 BuildVector.push_back(Val);
12115 }
12116 StoredVal = DAG.getBuildVector(Ty, DL, BuildVector);
1205612117 } else {
1205712118 SmallVector<SDValue, 8> Ops;
1205812119 for (unsigned i = 0; i < NumStores; ++i) {
1206212123 if (Val.getValueType() != MemVT)
1206312124 return false;
1206412125 Ops.push_back(Val);
12065 Chains.push_back(St->getChain());
1206612126 }
1206712127
1206812128 // Build the extracted vector elements back into a vector.
1208212142 for (unsigned i = 0; i < NumStores; ++i) {
1208312143 unsigned Idx = IsLE ? (NumStores - 1 - i) : i;
1208412144 StoreSDNode *St = cast<StoreSDNode>(StoreNodes[Idx].MemNode);
12085 Chains.push_back(St->getChain());
1208612145
1208712146 SDValue Val = St->getValue();
1208812147 StoreInt <<= ElementSizeBytes * 8;
1210012159 StoredVal = DAG.getConstant(StoreInt, DL, StoreTy);
1210112160 }
1210212161
12103 assert(!Chains.empty());
12104
12162 SmallVector<SDValue, 8> Chains;
12163
12164 // Gather all Chains we're inheriting. As generally all chains are
12165 // equal, do a minor check to remove obvious redundancies.
12166 Chains.push_back(StoreNodes[0].MemNode->getChain());
12167 for (unsigned i = 1; i < NumStores; ++i)
12168 if (StoreNodes[0].MemNode->getChain() != StoreNodes[i].MemNode->getChain())
12169 Chains.push_back(StoreNodes[i].MemNode->getChain());
12170
12171 LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
1210512172 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
1210612173 SDValue NewStore = DAG.getStore(NewChain, DL, StoredVal,
1210712174 FirstInChain->getBasePtr(),
1210812175 FirstInChain->getPointerInfo(),
1210912176 FirstInChain->getAlignment());
1211012177
12111 bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
12112 : DAG.getSubtarget().useAA();
12113 if (UseAA) {
12114 // Replace all merged stores with the new store.
12115 for (unsigned i = 0; i < NumStores; ++i)
12116 CombineTo(StoreNodes[i].MemNode, NewStore);
12117 } else {
12118 // Replace the last store with the new store.
12119 CombineTo(LatestOp, NewStore);
12120 // Erase all other stores.
12121 for (unsigned i = 0; i < NumStores; ++i) {
12122 if (StoreNodes[i].MemNode == LatestOp)
12123 continue;
12124 StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
12125 // ReplaceAllUsesWith will replace all uses that existed when it was
12126 // called, but graph optimizations may cause new ones to appear. For
12127 // example, the case in pr14333 looks like
12128 //
12129 // St's chain -> St -> another store -> X
12130 //
12131 // And the only difference from St to the other store is the chain.
12132 // When we change its chain to be St's chain they become identical,
12133 // get CSEed and the net result is that X is now a use of St.
12134 // Since we know that St is redundant, just iterate.
12135 while (!St->use_empty())
12136 DAG.ReplaceAllUsesWith(SDValue(St, 0), St->getChain());
12137 deleteAndRecombine(St);
12138 }
12139 }
12140
12141 StoreNodes.erase(StoreNodes.begin() + NumStores, StoreNodes.end());
12178 // Replace all merged stores with the new store.
12179 for (unsigned i = 0; i < NumStores; ++i)
12180 CombineTo(StoreNodes[i].MemNode, NewStore);
12181
12182 AddToWorklist(NewChain.getNode());
1214212183 return true;
1214312184 }
1214412185
12145 void DAGCombiner::getStoreMergeAndAliasCandidates(
12146 StoreSDNode* St, SmallVectorImpl<MemOpLink> &StoreNodes,
12147 SmallVectorImpl<LoadSDNode *> &AliasLoadNodes) {
12186 void DAGCombiner::getStoreMergeCandidates(
12187 StoreSDNode *St, SmallVectorImpl<MemOpLink> &StoreNodes) {
1214812188 // This holds the base pointer, index, and the offset in bytes from the base
1214912189 // pointer.
1215012190 BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr(), DAG);
12191 EVT MemVT = St->getMemoryVT();
1215112192
1215212193 // We must have a base and an offset.
1215312194 if (!BasePtr.Base.getNode())
1215712198 if (BasePtr.Base.isUndef())
1215812199 return;
1215912200
12160 // Walk up the chain and look for nodes with offsets from the same
12161 // base pointer. Stop when reaching an instruction of a different kind
12162 // or one which has a different base pointer.
12163 EVT MemVT = St->getMemoryVT();
12164 unsigned Seq = 0;
12165 StoreSDNode *Index = St;
12166
12167
12168 bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
12169 : DAG.getSubtarget().useAA();
12170
12171 if (UseAA) {
12172 // Look at other users of the same chain. Stores on the same chain do not
12173 // alias. If combiner-aa is enabled, non-aliasing stores are canonicalized
12174 // to be on the same chain, so don't bother looking at adjacent chains.
12175
12176 SDValue Chain = St->getChain();
12177 for (auto I = Chain->use_begin(), E = Chain->use_end(); I != E; ++I) {
12178 if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I)) {
12179 if (I.getOperandNo() != 0)
12180 continue;
12181
12182 if (OtherST->isVolatile() || OtherST->isIndexed())
12183 continue;
12184
12185 if (OtherST->getMemoryVT() != MemVT)
12186 continue;
12187
12188 BaseIndexOffset Ptr = BaseIndexOffset::match(OtherST->getBasePtr(), DAG);
12189
12190 if (Ptr.equalBaseIndex(BasePtr))
12191 StoreNodes.push_back(MemOpLink(OtherST, Ptr.Offset, Seq++));
12192 }
12193 }
12194
12195 return;
12196 }
12197
12198 while (Index) {
12199 // If the chain has more than one use, then we can't reorder the mem ops.
12200 if (Index != St && !SDValue(Index, 0)->hasOneUse())
12201 break;
12202
12203 // Find the base pointer and offset for this memory node.
12204 BaseIndexOffset Ptr = BaseIndexOffset::match(Index->getBasePtr(), DAG);
12205
12206 // Check that the base pointer is the same as the original one.
12207 if (!Ptr.equalBaseIndex(BasePtr))
12208 break;
12209
12210 // The memory operands must not be volatile.
12211 if (Index->isVolatile() || Index->isIndexed())
12212 break;
12213
12214 // No truncation.
12215 if (Index->isTruncatingStore())
12216 break;
12217
12218 // The stored memory type must be the same.
12219 if (Index->getMemoryVT() != MemVT)
12220 break;
12221
12222 // We do not allow under-aligned stores in order to prevent
12223 // overriding stores. NOTE: this is a bad hack. Alignment SHOULD
12224 // be irrelevant here; what MATTERS is that we not move memory
12225 // operations that potentially overlap past each other.
12226 if (Index->getAlignment() < MemVT.getStoreSize())
12227 break;
12228
12229 // We found a potential memory operand to merge.
12230 StoreNodes.push_back(MemOpLink(Index, Ptr.Offset, Seq++));
12231
12232 // Find the next memory operand in the chain. If the next operand in the
12233 // chain is a store then move up and continue the scan with the next
12234 // memory operand. If the next operand is a load save it and use alias
12235 // information to check if it interferes with anything.
12236 SDNode *NextInChain = Index->getChain().getNode();
12237 while (1) {
12238 if (StoreSDNode *STn = dyn_cast<StoreSDNode>(NextInChain)) {
12239 // We found a store node. Use it for the next iteration.
12240 Index = STn;
12241 break;
12242 } else if (LoadSDNode *Ldn = dyn_cast<LoadSDNode>(NextInChain)) {
12243 if (Ldn->isVolatile()) {
12244 Index = nullptr;
12245 break;
12201 // We are looking for a root node which is an ancestor to all mergeable
12202 // stores. We search up through a load, to our root and then down
12203 // through all children. For instance we will find Store{1,2,3} if
12204 // St is Store1, Store2, or Store3 where the root is not a load,
12205 // which is always true for nonvolatile ops. TODO: Expand
12206 // the search to find all valid candidates through multiple layers of loads.
12207 //
12208 // Root
12209 // |-------|-------|
12210 // Load Load Store3
12211 // | |
12212 // Store1 Store2
12213 //
12214 // FIXME: We should be able to climb and
12215 // descend TokenFactors to find candidates as well.
12216
12217 SDNode *RootNode = (St->getChain()).getNode();
12218
12219 // Set of Parents of Candidates
12220 std::set<SDNode *> CandidateParents;
12221
12222 if (LoadSDNode *Ldn = dyn_cast<LoadSDNode>(RootNode)) {
12223 RootNode = Ldn->getChain().getNode();
12224 for (auto I = RootNode->use_begin(), E = RootNode->use_end(); I != E; ++I)
12225 if (I.getOperandNo() == 0 && isa<LoadSDNode>(*I)) // walk down chain
12226 CandidateParents.insert(*I);
12227 } else
12228 CandidateParents.insert(RootNode);
12229
12230 bool IsLoadSrc = isa<LoadSDNode>(St->getValue());
12231 bool IsConstantSrc = isa<ConstantSDNode>(St->getValue()) ||
12232 isa<ConstantFPSDNode>(St->getValue());
12233 bool IsExtractVecSrc =
12234 (St->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
12235 St->getValue().getOpcode() == ISD::EXTRACT_SUBVECTOR);
12236 auto CorrectValueKind = [&](StoreSDNode *Other) -> bool {
12237 if (IsLoadSrc)
12238 return isa<LoadSDNode>(Other->getValue());
12239 if (IsConstantSrc)
12240 return (isa<ConstantSDNode>(Other->getValue()) ||
12241 isa<ConstantFPSDNode>(Other->getValue()));
12242 if (IsExtractVecSrc)
12243 return (Other->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
12244 Other->getValue().getOpcode() == ISD::EXTRACT_SUBVECTOR);
12245 return false;
12246 };
12247
12248 // Check all parents of mergeable children.
12249 for (auto P = CandidateParents.begin(); P != CandidateParents.end(); ++P)
12250 for (auto I = (*P)->use_begin(), E = (*P)->use_end(); I != E; ++I)
12251 if (I.getOperandNo() == 0)
12252 if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I)) {
12253 if (OtherST->isVolatile() || OtherST->isIndexed())
12254 continue;
12255 // We can merge constant floats to equivalent integers
12256 if (OtherST->getMemoryVT() != MemVT)
12257 if (!(MemVT.isInteger() && MemVT.bitsEq(OtherST->getMemoryVT()) &&
12258 isa<ConstantFPSDNode>(OtherST->getValue())))
12259 continue;
12260 BaseIndexOffset Ptr =
12261 BaseIndexOffset::match(OtherST->getBasePtr(), DAG);
12262 if (Ptr.equalBaseIndex(BasePtr) && CorrectValueKind(OtherST))
12263 StoreNodes.push_back(MemOpLink(OtherST, Ptr.Offset));
1224612264 }
12247
12248 // Save the load node for later. Continue the scan.
12249 AliasLoadNodes.push_back(Ldn);
12250 NextInChain = Ldn->getChain().getNode();
12251 continue;
12252 } else {
12253 Index = nullptr;
12254 break;
12255 }
12256 }
12257 }
1225812265 }
1225912266
1226012267 // We need to check that merging these stores does not cause a loop
1228112288 return true;
1228212289 }
1228312290
12284 bool DAGCombiner::MergeConsecutiveStores(
12285 StoreSDNode* St, SmallVectorImpl<MemOpLink> &StoreNodes) {
12291 bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
1228612292 if (OptLevel == CodeGenOpt::None)
1228712293 return false;
1228812294
1228912295 EVT MemVT = St->getMemoryVT();
1229012296 int64_t ElementSizeBytes = MemVT.getSizeInBits() / 8;
12297
12298 if (MemVT.getSizeInBits() * 2 > MaximumLegalStoreInBits)
12299 return false;
12300
1229112301 bool NoVectors = DAG.getMachineFunction().getFunction()->hasFnAttribute(
1229212302 Attribute::NoImplicitFloat);
1229312303
1231612326 if (MemVT.isVector() && IsLoadSrc)
1231712327 return false;
1231812328
12319 // Only look at ends of store sequences.
12320 SDValue Chain = SDValue(St, 0);
12321 if (Chain->hasOneUse() && Chain->use_begin()->getOpcode() == ISD::STORE)
12322 return false;
12323
12324 // Save the LoadSDNodes that we find in the chain.
12325 // We need to make sure that these nodes do not interfere with
12326 // any of the store nodes.
12327 SmallVector<LoadSDNode *, 8> AliasLoadNodes;
12328
12329 getStoreMergeAndAliasCandidates(St, StoreNodes, AliasLoadNodes);
12329 SmallVector<MemOpLink, 8> StoreNodes;
12330 // Find potential store merge candidates by searching through the chain sub-DAG.
12331 getStoreMergeCandidates(St, StoreNodes);
1233012332
1233112333 // Check if there is anything to merge.
1233212334 if (StoreNodes.size() < 2)
1233312335 return false;
1233412336
12335 // only do dependence check in AA case
12336 bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
12337 : DAG.getSubtarget().useAA();
12338 if (UseAA && !checkMergeStoreCandidatesForDependencies(StoreNodes))
12337 // Check that we can merge these candidates without causing a cycle
12338 if (!checkMergeStoreCandidatesForDependencies(StoreNodes))
1233912339 return false;
1234012340
1234112341 // Sort the memory operands according to their distance from the
12342 // base pointer. As a secondary criterion: make sure stores coming
12343 // later in the code come first in the list. This is important for
12344 // the non-UseAA case, because we're merging stores into the FINAL
12345 // store along a chain which potentially contains aliasing stores.
12346 // Thus, if there are multiple stores to the same address, the last
12347 // one can be considered for merging but not the others.
12342 // base pointer.
1234812343 std::sort(StoreNodes.begin(), StoreNodes.end(),
1234912344 [](MemOpLink LHS, MemOpLink RHS) {
12350 return LHS.OffsetFromBase < RHS.OffsetFromBase ||
12351 (LHS.OffsetFromBase == RHS.OffsetFromBase &&
12352 LHS.SequenceNum < RHS.SequenceNum);
12353 });
12345 return LHS.OffsetFromBase < RHS.OffsetFromBase;
12346 });
1235412347
1235512348 // Scan the memory operations on the chain and find the first non-consecutive
1235612349 // store memory address.
12357 unsigned LastConsecutiveStore = 0;
12350 unsigned NumConsecutiveStores = 0;
1235812351 int64_t StartAddress = StoreNodes[0].OffsetFromBase;
12359 for (unsigned i = 0, e = StoreNodes.size(); i < e; ++i) {
12360
12361 // Check that the addresses are consecutive starting from the second
12362 // element in the list of stores.
12363 if (i > 0) {
12364 int64_t CurrAddress = StoreNodes[i].OffsetFromBase;
12365 if (CurrAddress - StartAddress != (ElementSizeBytes * i))
12366 break;
12367 }
12368
12369 // Check if this store interferes with any of the loads that we found.
12370 // If we find a load that aliases with this store, stop the sequence.
12371 if (any_of(AliasLoadNodes, [&](LSBaseSDNode *Ldn) {
12372 return isAlias(Ldn, StoreNodes[i].MemNode);
12373 }))
12352
12353 // Check that the addresses are consecutive starting from the second
12354 // element in the list of stores.
12355 for (unsigned i = 1, e = StoreNodes.size(); i < e; ++i) {
12356 int64_t CurrAddress = StoreNodes[i].OffsetFromBase;
12357 if (CurrAddress - StartAddress != (ElementSizeBytes * i))
1237412358 break;
12375
12376 // Mark this node as useful.
12377 LastConsecutiveStore = i;
12378 }
12359 NumConsecutiveStores = i + 1;
12360 }
12361
12362 if (NumConsecutiveStores < 2)
12363 return false;
1237912364
1238012365 // The node with the lowest store address.
12381 LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
12382 unsigned FirstStoreAS = FirstInChain->getAddressSpace();
12383 unsigned FirstStoreAlign = FirstInChain->getAlignment();
1238412366 LLVMContext &Context = *DAG.getContext();
1238512367 const DataLayout &DL = DAG.getDataLayout();
1238612368
1238712369 // Store the constants into memory as one consecutive store.
1238812370 if (IsConstantSrc) {
12389 unsigned LastLegalType = 0;
12390 unsigned LastLegalVectorType = 0;
12391 bool NonZero = false;
12392 for (unsigned i = 0; i < LastConsecutiveStore + 1; ++i) {
12393 StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
12394 SDValue StoredVal = St->getValue();
12395
12396 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal)) {
12397 NonZero |= !C->isNullValue();
12398 } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(StoredVal)) {
12399 NonZero |= !C->getConstantFPValue()->isNullValue();
12400 } else {
12401 // Non-constant.
12402 break;
12403 }
12404
12405 // Find a legal type for the constant store.
12406 unsigned SizeInBits = (i+1) * ElementSizeBytes * 8;
12407 EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits);
12408 bool IsFast;
12409 if (TLI.isTypeLegal(StoreTy) &&
12410 TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
12411 FirstStoreAlign, &IsFast) && IsFast) {
12412 LastLegalType = i+1;
12413 // Or check whether a truncstore is legal.
12414 } else if (TLI.getTypeAction(Context, StoreTy) ==
12415 TargetLowering::TypePromoteInteger) {
12416 EVT LegalizedStoredValueTy =
12417 TLI.getTypeToTransformTo(Context, StoredVal.getValueType());
12418 if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) &&
12419 TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy,
12420 FirstStoreAS, FirstStoreAlign, &IsFast) &&
12371 bool RV = false;
12372 while (NumConsecutiveStores > 1) {
12373 LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
12374 unsigned FirstStoreAS = FirstInChain->getAddressSpace();
12375 unsigned FirstStoreAlign = FirstInChain->getAlignment();
12376 unsigned LastLegalType = 0;
12377 unsigned LastLegalVectorType = 0;
12378 bool NonZero = false;
12379 for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
12380 StoreSDNode *ST = cast<StoreSDNode>(StoreNodes[i].MemNode);
12381 SDValue StoredVal = ST->getValue();
12382
12383 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal)) {
12384 NonZero |= !C->isNullValue();
12385 } else if (ConstantFPSDNode *C =
12386 dyn_cast<ConstantFPSDNode>(StoredVal)) {
12387 NonZero |= !C->getConstantFPValue()->isNullValue();
12388 } else {
12389 // Non-constant.
12390 break;
12391 }
12392
12393 // Find a legal type for the constant store.
12394 unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8;
12395 EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits);
12396 bool IsFast = false;
12397 if (TLI.isTypeLegal(StoreTy) &&
12398 TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
12399 FirstStoreAlign, &IsFast) &&
1242112400 IsFast) {
1242212401 LastLegalType = i + 1;
12402 // Or check whether a truncstore is legal.
12403 } else if (TLI.getTypeAction(Context, StoreTy) ==
12404 TargetLowering::TypePromoteInteger) {
12405 EVT LegalizedStoredValueTy =
12406 TLI.getTypeToTransformTo(Context, StoredVal.getValueType());
12407 if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) &&
12408 TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy,
12409 FirstStoreAS, FirstStoreAlign, &IsFast) &&
12410 IsFast) {
12411 LastLegalType = i + 1;
12412 }
12413 }
12414
12415 // We only use vectors if the constant is known to be zero or the target
12416 // allows it and the function is not marked with the noimplicitfloat
12417 // attribute.
12418 if ((!NonZero ||
12419 TLI.storeOfVectorConstantIsCheap(MemVT, i + 1, FirstStoreAS)) &&
12420 !NoVectors) {
12421 // Find a legal type for the vector store.
12422 EVT Ty = EVT::getVectorVT(Context, MemVT, i + 1);
12423 if (TLI.isTypeLegal(Ty) && TLI.canMergeStoresTo(Ty) &&
12424 TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS,
12425 FirstStoreAlign, &IsFast) &&
12426 IsFast)
12427 LastLegalVectorType = i + 1;
1242312428 }
1242412429 }
1242512430
12426 // We only use vectors if the constant is known to be zero or the target
12427 // allows it and the function is not marked with the noimplicitfloat
12428 // attribute.
12429 if ((!NonZero || TLI.storeOfVectorConstantIsCheap(MemVT, i+1,
12430 FirstStoreAS)) &&
12431 !NoVectors) {
12432 // Find a legal type for the vector store.
12433 EVT Ty = EVT::getVectorVT(Context, MemVT, i+1);
12434 if (TLI.isTypeLegal(Ty) &&
12435 TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS,
12436 FirstStoreAlign, &IsFast) && IsFast)
12437 LastLegalVectorType = i + 1;
12438 }
12439 }
12440
12441 // Check if we found a legal integer type to store.
12442 if (LastLegalType == 0 && LastLegalVectorType == 0)
12443 return false;
12444
12445 bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors;
12446 unsigned NumElem = UseVector ? LastLegalVectorType : LastLegalType;
12447
12448 return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem,
12449 true, UseVector);
12431 // Check if we found a legal integer type that creates a meaningful merge.
12432 if (LastLegalType < 2 && LastLegalVectorType < 2)
12433 break;
12434
12435 bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors;
12436 unsigned NumElem = (UseVector) ? LastLegalVectorType : LastLegalType;
12437
12438 bool Merged = MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem,
12439 true, UseVector);
12440 if (!Merged)
12441 break;
12442 // Remove merged stores for next iteration.
12443 StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
12444 RV = true;
12445 NumConsecutiveStores -= NumElem;
12446 }
12447 return RV;
1245012448 }
1245112449
1245212450 // When extracting multiple vector elements, try to store them
1245312451 // in one vector store rather than a sequence of scalar stores.
1245412452 if (IsExtractVecSrc) {
12453 LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
12454 unsigned FirstStoreAS = FirstInChain->getAddressSpace();
12455 unsigned FirstStoreAlign = FirstInChain->getAlignment();
1245512456 unsigned NumStoresToMerge = 0;
1245612457 bool IsVec = MemVT.isVector();
12457 for (unsigned i = 0; i < LastConsecutiveStore + 1; ++i) {
12458 for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
1245812459 StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
1245912460 unsigned StoreValOpcode = St->getValue().getOpcode();
1246012461 // This restriction could be loosened.
1249412495 // Find acceptable loads. Loads need to have the same chain (token factor),
1249512496 // must not be zext, volatile, indexed, and they must be consecutive.
1249612497 BaseIndexOffset LdBasePtr;
12497 for (unsigned i = 0; i < LastConsecutiveStore + 1; ++i) {
12498 for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
1249812499 StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
1249912500 LoadSDNode *Ld = dyn_cast<LoadSDNode>(St->getValue());
1250012501 if (!Ld) break;
1252712528 }
1252812529
1252912530 // We found a potential memory operand to merge.
12530 LoadNodes.push_back(MemOpLink(Ld, LdPtr.Offset, 0));
12531 LoadNodes.push_back(MemOpLink(Ld, LdPtr.Offset));
1253112532 }
1253212533
1253312534 if (LoadNodes.size() < 2)
1253912540 if (LoadNodes.size() == 2 && TLI.hasPairedLoad(MemVT, RequiredAlignment) &&
1254012541 St->getAlignment() >= RequiredAlignment)
1254112542 return false;
12542
12543 LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
12544 unsigned FirstStoreAS = FirstInChain->getAddressSpace();
12545 unsigned FirstStoreAlign = FirstInChain->getAlignment();
1254312546 LoadSDNode *FirstLoad = cast<LoadSDNode>(LoadNodes[0].MemNode);
1254412547 unsigned FirstLoadAS = FirstLoad->getAddressSpace();
1254512548 unsigned FirstLoadAlign = FirstLoad->getAlignment();
1260812611
1260912612 // We add +1 here because the LastXXX variables refer to location while
1261012613 // the NumElem refers to array/index size.
12611 unsigned NumElem = std::min(LastConsecutiveStore, LastConsecutiveLoad) + 1;
12614 unsigned NumElem = std::min(NumConsecutiveStores, LastConsecutiveLoad + 1);
1261212615 NumElem = std::min(LastLegalType, NumElem);
1261312616
1261412617 if (NumElem < 2)
1261512618 return false;
1261612619
12617 // Collect the chains from all merged stores.
12620 // Collect the chains from all merged stores. Because in the common case
12621 // all chains are the same, check if we match the first Chain.
1261812622 SmallVector<SDValue, 8> MergeStoreChains;
1261912623 MergeStoreChains.push_back(StoreNodes[0].MemNode->getChain());
12620
12621 // The latest Node in the DAG.
12622 unsigned LatestNodeUsed = 0;
12623 for (unsigned i = 1; i < NumElem; ++i) {
12624 // Find a chain for the new wide-store operand. Notice that some
12625 // of the store nodes that we found may not be selected for inclusion
12626 // in the wide store. The chain we use needs to be the chain of the
12627 // latest store node which is *used* and replaced by the wide store.
12628 if (StoreNodes[i].SequenceNum < StoreNodes[LatestNodeUsed].SequenceNum)
12629 LatestNodeUsed = i;
12630
12631 MergeStoreChains.push_back(StoreNodes[i].MemNode->getChain());
12632 }
12633
12634 LSBaseSDNode *LatestOp = StoreNodes[LatestNodeUsed].MemNode;
12624 for (unsigned i = 1; i < NumElem; ++i)
12625 if (StoreNodes[0].MemNode->getChain() != StoreNodes[i].MemNode->getChain())
12626 MergeStoreChains.push_back(StoreNodes[i].MemNode->getChain());
1263512627
1263612628 // Find if it is better to use vectors or integers to load and store
1263712629 // to memory.
1265512647 SDValue NewStoreChain =
1265612648 DAG.getNode(ISD::TokenFactor, StoreDL, MVT::Other, MergeStoreChains);
1265712649
12650 AddToWorklist(NewStoreChain.getNode());
12651
1265812652 SDValue NewStore =
1265912653 DAG.getStore(NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(),
1266012654 FirstInChain->getPointerInfo(), FirstStoreAlign);
1266612660 SDValue(NewLoad.getNode(), 1));
1266712661 }
1266812662
12669 if (UseAA) {
12670 // Replace all of the stores with the new store.
12671 for (unsigned i = 0; i < NumElem; ++i)
12672 CombineTo(StoreNodes[i].MemNode, NewStore);
12673 } else {
12674 // Replace the last store with the new store.
12675 CombineTo(LatestOp, NewStore);
12676 // Erase all other stores.
12677 for (unsigned i = 0; i < NumElem; ++i) {
12678 // Remove all Store nodes.
12679 if (StoreNodes[i].MemNode == LatestOp)
12680 continue;
12681 StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
12682 DAG.ReplaceAllUsesOfValueWith(SDValue(St, 0), St->getChain());
12683 deleteAndRecombine(St);
12684 }
12685 }
12686
12687 StoreNodes.erase(StoreNodes.begin() + NumElem, StoreNodes.end());
12663 // Replace all of the stores with the new store.
12664 for (unsigned i = 0; i < NumElem; ++i)
12665 CombineTo(StoreNodes[i].MemNode, NewStore);
1268812666 return true;
1268912667 }
1269012668
1284112819 if (SDValue NewST = TransformFPLoadStorePair(N))
1284212820 return NewST;
1284312821
12844 bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
12845 : DAG.getSubtarget().useAA();
12846 #ifndef NDEBUG
12847 if (CombinerAAOnlyFunc.getNumOccurrences() &&
12848 CombinerAAOnlyFunc != DAG.getMachineFunction().getName())
12849 UseAA = false;
12850 #endif
12851 if (UseAA && ST->isUnindexed()) {
12852 // FIXME: We should do this even without AA enabled. AA will just allow
12853 // FindBetterChain to work in more situations. The problem with this is that
12854 // any combine that expects memory operations to be on consecutive chains
12855 // first needs to be updated to look for users of the same chain.
12856
12822 if (ST->isUnindexed()) {
1285712823 // Walk up chain skipping non-aliasing memory nodes, on this store and any
1285812824 // adjacent stores.
1285912825 if (findBetterNeighborChains(ST)) {
1288712853 if (SimplifyDemandedBits(
1288812854 Value,
1288912855 APInt::getLowBitsSet(Value.getScalarValueSizeInBits(),
12890 ST->getMemoryVT().getScalarSizeInBits())))
12856 ST->getMemoryVT().getScalarSizeInBits()))) {
12857 // Re-visit the store if anything changed and the store hasn't been merged
12858 // with another node (N is deleted) SimplifyDemandedBits will add Value's
12859 // node back to the worklist if necessary, but we also need to re-visit
12860 // the Store node itself.
12861 if (N->getOpcode() != ISD::DELETED_NODE)
12862 AddToWorklist(N);
1289112863 return SDValue(N, 0);
12864 }
1289212865 }
1289312866
1289412867 // If this is a load followed by a store to the same location, then the store
1293212905 // There can be multiple store sequences on the same chain.
1293312906 // Keep trying to merge store sequences until we are unable to do so
1293412907 // or until we merge the last store on the chain.
12935 SmallVector<MemOpLink, 8> StoreNodes;
12936 bool Changed = MergeConsecutiveStores(ST, StoreNodes);
12908 bool Changed = MergeConsecutiveStores(ST);
1293712909 if (!Changed) break;
12938
12939 if (any_of(StoreNodes,
12940 [ST](const MemOpLink &Link) { return Link.MemNode == ST; })) {
12941 // ST has been merged and no longer exists.
12910 // Return N, as the merge only uses CombineTo and no worklist
12911 // cleanup is necessary.
12912 if (N->getOpcode() == ISD::DELETED_NODE || !isa<StoreSDNode>(N))
1294212913 return SDValue(N, 0);
12943 }
1294412914 }
1294512915 }
1294612916
1294912919 // Make sure to do this only after attempting to merge stores in order to
1295012920 // avoid changing the types of some subset of stores due to visit order,
1295112921 // preventing their merging.
12952 if (isa<ConstantFPSDNode>(Value)) {
12922 if (isa<ConstantFPSDNode>(ST->getValue())) {
1295312923 if (SDValue NewSt = replaceStoreOfFPConstant(ST))
1295412924 return NewSt;
1295512925 }
1388513855 // A vector built entirely of undefs is undef.
1388613856 if (ISD::allOperandsUndef(N))
1388713857 return DAG.getUNDEF(VT);
13858
13859 // Check if we can express BUILD VECTOR via subvector extract.
13860 if (!LegalTypes && (N->getNumOperands() > 1)) {
13861 SDValue Op0 = N->getOperand(0);
13862 auto checkElem = [&](SDValue Op) -> uint64_t {
13863 if ((Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) &&
13864 (Op0.getOperand(0) == Op.getOperand(0)))
13865 if (auto CNode = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
13866 return CNode->getZExtValue();
13867 return -1;
13868 };
13869
13870 int Offset = checkElem(Op0);
13871 for (unsigned i = 0; i < N->getNumOperands(); ++i) {
13872 if (Offset + i != checkElem(N->getOperand(i))) {
13873 Offset = -1;
13874 break;
13875 }
13876 }
13877
13878 if ((Offset == 0) &&
13879 (Op0.getOperand(0).getValueType() == N->getValueType(0)))
13880 return Op0.getOperand(0);
13881 if ((Offset != -1) &&
13882 ((Offset % N->getValueType(0).getVectorNumElements()) ==
13883 0)) // IDX must be multiple of output size.
13884 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), N->getValueType(0),
13885 Op0.getOperand(0), Op0.getOperand(1));
13886 }
1388813887
1388913888 if (SDValue V = reduceBuildVecExtToExtBuildVec(N))
1389013889 return V;
1598215981 if (Base.getOpcode() == ISD::ADD) {
1598315982 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Base.getOperand(1))) {
1598415983 Base = Base.getOperand(0);
15985 Offset += C->getZExtValue();
15984 Offset += C->getSExtValue();
1598615985 }
1598715986 }
1598815987
1617916178 ++Depth;
1618016179 break;
1618116180
16181 case ISD::CopyFromReg:
16182 // Forward past CopyFromReg.
16183 Chains.push_back(Chain.getOperand(0));
16184 ++Depth;
16185 break;
16186
1618216187 default:
1618316188 // For all other instructions we will just have to take what we can get.
1618416189 Aliases.push_back(Chain);
1620716212 return DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Aliases);
1620816213 }
1620916214
16215 // This function tries to collect a bunch of potentially interesting
16216 // nodes to improve the chains of, all at once. This might seem
16217 // redundant, as this function gets called when visiting every store
16218 // node, so why not let the work be done on each store as it's visited?
16219 //
16220 // I believe this is mainly important because MergeConsecutiveStores
16221 // is unable to deal with merging stores of different sizes, so unless
16222 // we improve the chains of all the potential candidates up-front
16223 // before running MergeConsecutiveStores, it might only see some of
16224 // the nodes that will eventually be candidates, and then not be able
16225 // to go from a partially-merged state to the desired final
16226 // fully-merged state.
1621016227 bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {
1621116228 // This holds the base pointer, index, and the offset in bytes from the base
1621216229 // pointer.
1624216259 if (!Ptr.equalBaseIndex(BasePtr))
1624316260 break;
1624416261
16245 // Find the next memory operand in the chain. If the next operand in the
16246 // chain is a store then move up and continue the scan with the next
16247 // memory operand. If the next operand is a load save it and use alias
16248 // information to check if it interferes with anything.
16262 // Walk up the chain to find the next store node, ignoring any
16263 // intermediate loads. Any other kind of node will halt the loop.
1624916264 SDNode *NextInChain = Index->getChain().getNode();
1625016265 while (true) {
1625016265 if (StoreSDNode *STn = dyn_cast<StoreSDNode>(NextInChain)) {
1626416279 Index = nullptr;
1626516280 break;
1626616281 }
16267 }
16268 }
16269
16282 } // end while
16283 }
16284
16285 // At this point, ChainedStores lists all of the Store nodes
16286 // reachable by iterating up through chain nodes matching the above
16287 // conditions. For each such store identified, try to find an
16288 // earlier chain to attach the store to which won't violate the
16289 // required ordering.
1627016290 bool MadeChangeToSt = false;
1627116291 SmallVector<std::pair<StoreSDNode *, SDValue>, 8> BetterChains;
1627216292
849849 MinFunctionAlignment = 0;
850850 PrefFunctionAlignment = 0;
851851 PrefLoopAlignment = 0;
852 GatherAllAliasesMaxDepth = 6;
852 GatherAllAliasesMaxDepth = 18;
853853 MinStackArgumentAlignment = 1;
854854 // TODO: the default will be switched to 0 in the next commit, along
855855 // with the Target-specific changes necessary.
93379337 return SDValue();
93389338 }
93399339
9340 /// This function handles the log2-shuffle pattern produced by the
9340 /// This function handles the log2-shuffle pattern produced by the
93419341 /// LoopVectorizer for the across vector reduction. It consists of
93429342 /// log2(NumVectorElements) steps and, in each step, 2^(s) elements
93439343 /// are reduced, where s is an induction variable from 0 to
498498
499499 bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
500500 unsigned &Cost) const override;
501
502 bool canMergeStoresTo(EVT MemVT) const override {
503 // Do not merge to larger than i32.
504 return (MemVT.getSizeInBits() <= 32);
505 }
501506
502507 bool isCheapToSpeculateCttz() const override;
503508 bool isCheapToSpeculateCtlz() const override;
5858 }
5959
6060 ; [2 x float] should not be promoted to double by the Darwin varargs handling,
61 ; but should go in an 8-byte aligned slot.
61 ; but should go in an 8-byte aligned slot and can be merged as integer stores.
6262 define void @test_varargs_stackalign() {
6363 ; CHECK-LABEL: test_varargs_stackalign:
64 ; CHECK-DARWINPCS: stp {{w[0-9]+}}, {{w[0-9]+}}, [sp, #16]
64 ; CHECK-DARWINPCS: str {{x[0-9]+}}, [sp, #16]
6565
6666 call void(...) @callee([3 x float] undef, [2 x float] [float 1.0, float 2.0])
6767 ret void
204204 define i32 @test8(i32 %argc, i8** nocapture %argv) nounwind {
205205 entry:
206206 ; CHECK-LABEL: test8
207 ; CHECK: strb {{w[0-9]+}}, [sp, #3]
208 ; CHECK: strb wzr, [sp, #2]
209 ; CHECK: strb {{w[0-9]+}}, [sp, #1]
210 ; CHECK: strb wzr, [sp]
207 ; CHECK: str w8, [sp]
211208 ; CHECK: bl
212209 ; FAST-LABEL: test8
213210 ; FAST: strb {{w[0-9]+}}, [sp]
1212 entry:
1313 ; CHECK-LABEL: t2:
1414 ; CHECK: strh wzr, [sp, #32]
15 ; CHECK: stp xzr, xzr, [sp, #16]
16 ; CHECK: str xzr, [sp, #8]
15 ; CHECK: stp xzr, xzr, [sp, #8]
16 ; CHECK: str xzr, [sp, #24]
1717 %buf = alloca [26 x i8], align 1
1818 %0 = getelementptr inbounds [26 x i8], [26 x i8]* %buf, i32 0, i32 0
1919 call void @llvm.memset.p0i8.i32(i8* %0, i8 0, i32 26, i32 1, i1 false)
9898 ; __stack field should point just past them.
9999 define void @test_offsetstack([8 x i64], [2 x i64], [3 x float], ...) {
100100 ; CHECK-LABEL: test_offsetstack:
101 ; CHECK: sub sp, sp, #80
101 ; CHECK: stp {{q[0-9]+}}, {{q[0-9]+}}, [sp, #-80]!
102102 ; CHECK: add [[STACK_TOP:x[0-9]+]], sp, #96
103103 ; CHECK: add x[[VAR:[0-9]+]], {{x[0-9]+}}, :lo12:var
104104 ; CHECK: str [[STACK_TOP]], [x[[VAR]]]
33 @g0 = external global <3 x float>, align 16
44 @g1 = external global <3 x float>, align 4
55
6 ; CHECK: ldr s[[R0:[0-9]+]], {{\[}}[[R1:x[0-9]+]]{{\]}}, #4
7 ; CHECK: ld1{{\.?s?}} { v[[R0]]{{\.?s?}} }[1], {{\[}}[[R1]]{{\]}}
6 ; CHECK: ldr q[[R0:[0-9]+]], {{\[}}[[R1:x[0-9]+]], :lo12:g0
87 ; CHECK: str d[[R0]]
98
109 define void @blam() {
None ; RUN: llc --combiner-alias-analysis=false < %s | FileCheck %s
1 ; RUN: llc --combiner-alias-analysis=true < %s | FileCheck %s
0 ; RUN: llc < %s | FileCheck %s
21
32 ; This test checks that we do not merge stores together which have
43 ; dependencies through their non-chain operands (e.g. one store is the
None ; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-insert-nops -verify-machineinstrs < %s | FileCheck %s
0 ; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-insert-nops -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK
1 ; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-insert-nops -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECKNOP
12
2 ; CHECK: test01.cl:2:{{[0-9]+}}
3 ; CHECK-NEXT: s_nop 0
3 ; This test expects that we have one instance for each line in some order with "s_nop 0" instances after each.
44
5 ; CHECK: test01.cl:3:{{[0-9]+}}
6 ; CHECK-NEXT: s_nop 0
5 ; Check that each line appears at least once
6 ; CHECK-DAG: test01.cl:2:3
7 ; CHECK-DAG: test01.cl:3:3
8 ; CHECK-DAG: test01.cl:4:3
79
8 ; CHECK: test01.cl:4:{{[0-9]+}}
9 ; CHECK-NEXT: s_nop 0
10
11 ; Check that each of the lines consists of the line output, followed by "s_nop 0"
12 ; CHECKNOP: test01.cl:{{[234]}}:3
13 ; CHECKNOP-NEXT: s_nop 0
14 ; CHECKNOP: test01.cl:{{[234]}}:3
15 ; CHECKNOP-NEXT: s_nop 0
16 ; CHECKNOP: test01.cl:{{[234]}}:3
17 ; CHECKNOP-NEXT: s_nop 0
1018
1119 ; CHECK: test01.cl:5:{{[0-9]+}}
1220 ; CHECK-NEXT: s_nop 0
2028 call void @llvm.dbg.declare(metadata i32 addrspace(1)** %A.addr, metadata !17, metadata !18), !dbg !19
2129 %0 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !20
2230 %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %0, i32 0, !dbg !20
23 store i32 1, i32 addrspace(1)* %arrayidx, align 4, !dbg !21
31 store i32 1, i32 addrspace(1)* %arrayidx, align 4, !dbg !20
2432 %1 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !22
2533 %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %1, i32 1, !dbg !22
2634 store i32 2, i32 addrspace(1)* %arrayidx1, align 4, !dbg !23
252252 ; GCN: buffer_load_ubyte v{{[0-9]+}}, off
253253 ; GCN: buffer_load_ubyte v{{[0-9]+}}, off
254254
255 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4
256 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:5
255257 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:6
256 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:5
257 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4
258
259 ; GCN: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
260258
261259 ; GCN-NO-TONGA: buffer_load_ubyte
262260 ; GCN-NO-TONGA: buffer_load_ubyte
None ; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s
1 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s
2
3 ; RUN: llc -march=amdgcn -verify-machineinstrs -combiner-alias-analysis -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
4 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -combiner-alias-analysis -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
1 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
52
63 ; This test is mostly to test DAG store merging, so disable the vectorizer.
74 ; Run with devices with different unaligned load restrictions.
149146 }
150147
151148 ; GCN-LABEL: {{^}}merge_global_store_4_constants_mixed_i32_f32:
152 ; GCN-NOAA: buffer_store_dwordx4 v
153
154 ; GCN-AA: buffer_store_dwordx2
155 ; GCN-AA: buffer_store_dword v
156 ; GCN-AA: buffer_store_dword v
157
149 ; GCN-AA: buffer_store_dwordx4 v
158150 ; GCN: s_endpgm
159151 define void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
160152 %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
473465 ret void
474466 }
475467
476 ; This works once AA is enabled on the subtarget
477468 ; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32:
478469 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
479
480 ; GCN-NOAA: buffer_store_dword v
481 ; GCN-NOAA: buffer_store_dword v
482 ; GCN-NOAA: buffer_store_dword v
483 ; GCN-NOAA: buffer_store_dword v
484
485 ; GCN-AA: buffer_store_dwordx4 [[LOAD]]
486
470 ; GCN: buffer_store_dwordx4 [[LOAD]]
487471 ; GCN: s_endpgm
488472 define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
489473 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
3131 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:40{{$}}
3232 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:44{{$}}
3333
34 ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
35 ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
36 ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
37 ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
34 ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
35 ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
36 ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
37 ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
3838 define void @private_elt_size_v4i32(<4 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
3939 entry:
4040 %tid = call i32 @llvm.amdgcn.workitem.id.x()
129129 ; HSA-ELT8: private_element_size = 2
130130 ; HSA-ELT4: private_element_size = 1
131131
132 ; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16
133 ; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:24
132 ; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{off|v[0-9]}}, s[0:3], s9 offset:1
133 ; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{off|v[0-9]}}, s[0:3], s9 offset:2
134134
135135 ; HSA-ELTGE8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
136136
156156
157157 ; FUNC-LABEL: @reorder_local_offsets
158158 ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:100 offset1:102
159 ; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:3 offset1:100
160 ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
161 ; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:408
159 ; CI-DAG: ds_write2_b32 {{v[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:3 offset1:100
160 ; CI-DAG: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:408
162161 ; CI: buffer_store_dword
163162 ; CI: s_endpgm
164163 define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(3)* noalias nocapture %ptr0) #0 {
180179 }
181180
182181 ; FUNC-LABEL: @reorder_global_offsets
183 ; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
184 ; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408
185 ; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
186 ; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
187 ; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408
188 ; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
182 ; CI-DAG: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
183 ; CI-DAG: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408
184 ; CI-DAG: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
185 ; CI-DAG: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
186 ; CI-DAG: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408
187 ; CI: buffer_store_dword
189188 ; CI: s_endpgm
190189 define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 {
191190 %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 3
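The reorder tests above relax ordered `CI:` lines to `CI-DAG:` because, with the less conservative chain analysis, independent memory operations on `noalias` pointers may be scheduled in either order. A minimal sketch (hypothetical function) of the property being relied on:

```llvm
; %a and %b are declared noalias, so the load and the store touch
; provably disjoint memory and the combiner may emit them in any order.
define i32 @sketch_reorder(i32 addrspace(1)* noalias %a,
                           i32 addrspace(1)* noalias %b) {
  %v = load i32, i32 addrspace(1)* %a
  store i32 0, i32 addrspace(1)* %b
  ret i32 %v
}
```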
1111 entry:
1212 ; CHECK: sub sp, sp, #12
1313 ; CHECK: sub sp, sp, #4
14 ; CHECK: stmib sp, {r1, r2, r3}
14 ; CHECK: add r0, sp, #4
15 ; CHECK: stm sp, {r0, r1, r2, r3}
1516 %g = alloca i8*
1617 %g1 = bitcast i8** %g to i8*
1718 call void @llvm.va_start(i8* %g1)
None ; RUN: llc < %s -mtriple=armv7-apple-ios -O0 | FileCheck %s -check-prefix=NO-REALIGN
1 ; RUN: llc < %s -mtriple=armv7-apple-ios -O0 | FileCheck %s -check-prefix=REALIGN
0 ; RUN: llc < %s -mtriple=armv7-apple-ios -O0 | FileCheck %s
21
32 ; rdar://12713765
43 ; When realign-stack is set to false, make sure we are not creating stack
76
87 define void @test1(<16 x float>* noalias sret %agg.result) nounwind ssp "no-realign-stack" {
98 entry:
10 ; NO-REALIGN-LABEL: test1
11 ; NO-REALIGN: mov r[[R2:[0-9]+]], r[[R1:[0-9]+]]
12 ; NO-REALIGN: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]!
13 ; NO-REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
14 ; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1]], #32
15 ; NO-REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
16 ; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1]], #48
17 ; NO-REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
18
19 ; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1:[0-9]+]], #48
20 ; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
21 ; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1]], #32
22 ; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
23 ; NO-REALIGN: mov r[[R3:[0-9]+]], r[[R1]]
24 ; NO-REALIGN: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R3]]:128]!
25 ; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R3]]:128]
26
27 ; NO-REALIGN: add r[[R2:[0-9]+]], r[[R0:0]], #48
28 ; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
29 ; NO-REALIGN: add r[[R2:[0-9]+]], r[[R0]], #32
30 ; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
31 ; NO-REALIGN: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R0]]:128]!
32 ; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R0]]:128]
9 ; CHECK-LABEL: test1
10 ; CHECK: ldr r[[R1:[0-9]+]], [pc, r1]
11 ; CHECK: add r[[R2:[0-9]+]], r1, #48
12 ; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
13 ; CHECK: mov r[[R2:[0-9]+]], r[[R1]]
14 ; CHECK: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]!
15 ; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
16 ; CHECK: add r[[R1:[0-9]+]], r[[R1]], #32
17 ; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
18 ; CHECK: mov r[[R1:[0-9]+]], sp
19 ; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
20 ; CHECK: add r[[R2:[0-9]+]], r[[R1]], #32
21 ; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
22 ; CHECK: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]!
23 ; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
24 ; CHECK: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]!
25 ; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
26 ; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
27 ; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
28 ; CHECK: add r[[R1:[0-9]+]], r0, #48
29 ; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
30 ; CHECK: add r[[R1:[0-9]+]], r0, #32
31 ; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
32 ; CHECK: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r0:128]!
33 ; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r0:128]
3334 %retval = alloca <16 x float>, align 16
3435 %0 = load <16 x float>, <16 x float>* @T3_retval, align 16
3536 store <16 x float> %0, <16 x float>* %retval
4041
4142 define void @test2(<16 x float>* noalias sret %agg.result) nounwind ssp {
4243 entry:
43 ; REALIGN-LABEL: test2
44 ; REALIGN: bfc sp, #0, #6
45 ; REALIGN: mov r[[R2:[0-9]+]], r[[R1:[0-9]+]]
46 ; REALIGN: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]!
47 ; REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
48 ; REALIGN: add r[[R2:[0-9]+]], r[[R1]], #32
49 ; REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
50 ; REALIGN: add r[[R2:[0-9]+]], r[[R1]], #48
51 ; REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
44 ; CHECK: ldr r[[R1:[0-9]+]], [pc, r1]
45 ; CHECK: add r[[R2:[0-9]+]], r[[R1]], #48
46 ; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
47 ; CHECK: mov r[[R2:[0-9]+]], r[[R1]]
48 ; CHECK: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]!
49 ; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
50 ; CHECK: add r[[R1:[0-9]+]], r[[R1]], #32
51 ; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
52 ; CHECK: mov r[[R1:[0-9]+]], sp
53 ; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
54 ; CHECK: orr r[[R2:[0-9]+]], r[[R1]], #32
55 ; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
56 ; CHECK: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]!
57 ; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
58 ; CHECK: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]!
59 ; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
60 ; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
61 ; CHECK: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
62 ; CHECK: add r[[R1:[0-9]+]], r0, #48
63 ; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
64 ; CHECK: add r[[R1:[0-9]+]], r0, #32
65 ; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
66 ; CHECK: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r0:128]!
67 ; CHECK: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r0:128]
5268
5369
54 ; REALIGN: orr r[[R2:[0-9]+]], r[[R1:[0-9]+]], #48
55 ; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
56 ; REALIGN: orr r[[R2:[0-9]+]], r[[R1]], #32
57 ; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
58 ; REALIGN: orr r[[R2:[0-9]+]], r[[R1]], #16
59 ; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
60 ; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
61
62 ; REALIGN: add r[[R1:[0-9]+]], r[[R0:0]], #48
63 ; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
64 ; REALIGN: add r[[R1:[0-9]+]], r[[R0]], #32
65 ; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
66 ; REALIGN: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R0]]:128]!
67 ; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R0]]:128]
68 %retval = alloca <16 x float>, align 16
70 %retval = alloca <16 x float>, align 16
6971 %0 = load <16 x float>, <16 x float>* @T3_retval, align 16
7072 store <16 x float> %0, <16 x float>* %retval
7173 %1 = load <16 x float>, <16 x float>* %retval
1515 ; an LDMIA was created with both a FrameIndex and an offset, which
1616 ; is not allowed.
1717
18 ; CHECK-WITH-LDRD: strd {{r[0-9]+}}, {{r[0-9]+}}, [sp, #8]
19 ; CHECK-WITH-LDRD: strd {{r[0-9]+}}, {{r[0-9]+}}, [sp]
18 ; CHECK-WITH-LDRD-DAG: strd {{r[0-9]+}}, {{r[0-9]+}}, [sp, #8]
19 ; CHECK-WITH-LDRD-DAG: strd {{r[0-9]+}}, {{r[0-9]+}}, [sp]
2020
21 ; CHECK-WITH-LDRD: ldrd {{r[0-9]+}}, {{r[0-9]+}}, [sp, #8]
22 ; CHECK-WITH-LDRD: ldrd {{r[0-9]+}}, {{r[0-9]+}}, [sp]
21 ; CHECK-WITH-LDRD-DAG: ldrd {{r[0-9]+}}, {{r[0-9]+}}, [sp, #8]
22 ; CHECK-WITH-LDRD-DAG: ldrd {{r[0-9]+}}, {{r[0-9]+}}, [sp]
2323
2424 ; We also want to ensure the register scavenger is working (i.e. an
2525 ; offset from sp can be generated), so we need two spills.
26 ; CHECK-WITHOUT-LDRD: add [[ADDRREG:[a-z0-9]+]], sp, #{{[0-9]+}}
27 ; CHECK-WITHOUT-LDRD: stm [[ADDRREG]], {r{{[0-9]+}}, r{{[0-9]+}}}
28 ; CHECK-WITHOUT-LDRD: stm sp, {r{{[0-9]+}}, r{{[0-9]+}}}
26 ; CHECK-WITHOUT-LDRD-DAG: add [[ADDRREG:[a-z0-9]+]], sp, #{{[0-9]+}}
27 ; CHECK-WITHOUT-LDRD-DAG: stm [[ADDRREG]], {r{{[0-9]+}}, r{{[0-9]+}}}
28 ; CHECK-WITHOUT-LDRD-DAG: stm sp, {r{{[0-9]+}}, r{{[0-9]+}}}
2929
3030 ; In principle LLVM may have to recalculate the offset. At the moment
3131 ; it reuses the original though.
32 ; CHECK-WITHOUT-LDRD: ldm [[ADDRREG]], {r{{[0-9]+}}, r{{[0-9]+}}}
33 ; CHECK-WITHOUT-LDRD: ldm sp, {r{{[0-9]+}}, r{{[0-9]+}}}
32 ; CHECK-WITHOUT-LDRD-DAG: ldm [[ADDRREG]], {r{{[0-9]+}}, r{{[0-9]+}}}
33 ; CHECK-WITHOUT-LDRD-DAG: ldm sp, {r{{[0-9]+}}, r{{[0-9]+}}}
3434
3535 store volatile i64 %val1, i64* %addr
3636 store volatile i64 %val2, i64* %addr
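A skeleton (hypothetical name) of the two visible volatile stores; the register pressure that actually forces the two spills checked above comes from surrounding code elided by the hunk:

```llvm
; volatile keeps both stores, and both i64 operands, live in order,
; which is what lets the test exercise two distinct spill slots.
define void @sketch_two_spills(i64 %val1, i64 %val2, i64* %addr) {
  store volatile i64 %val1, i64* %addr
  store volatile i64 %val2, i64* %addr
  ret void
}
```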
88 ; CHECK-LABEL: t:
99 ; CHECK: vpop {d8}
1010 ; CHECK-NOT: vpopne
11 ; CHECK: pop {r7, pc}
12 ; CHECK: vpop {d8}
1311 ; CHECK: pop {r7, pc}
1412 br i1 undef, label %if.else, label %if.then
1513
55 ; LE-LABEL: i24_or:
66 ; LE: @ BB#0:
77 ; LE-NEXT: ldrh r1, [r0]
8 ; LE-NEXT: ldrb r2, [r0, #2]
98 ; LE-NEXT: orr r1, r1, #384
10 ; LE-NEXT: strb r2, [r0, #2]
119 ; LE-NEXT: strh r1, [r0]
1210 ; LE-NEXT: mov pc, lr
1311 ;
3028 define void @i24_and_or(i24* %a) {
3129 ; LE-LABEL: i24_and_or:
3230 ; LE: @ BB#0:
33 ; LE-NEXT: ldrb r2, [r0, #2]
3431 ; LE-NEXT: ldrh r1, [r0]
35 ; LE-NEXT: strb r2, [r0, #2]
3632 ; LE-NEXT: mov r2, #16256
33 ; LE-NEXT: orr r2, r2, #49152
3734 ; LE-NEXT: orr r1, r1, #384
38 ; LE-NEXT: orr r2, r2, #49152
3935 ; LE-NEXT: and r1, r1, r2
4036 ; LE-NEXT: strh r1, [r0]
4137 ; LE-NEXT: mov pc, lr
4238 ;
4339 ; BE-LABEL: i24_and_or:
4440 ; BE: @ BB#0:
41 ; BE-NEXT: mov r1, #128
42 ; BE-NEXT: strb r1, [r0, #2]
4543 ; BE-NEXT: ldrh r1, [r0]
46 ; BE-NEXT: mov r2, #128
47 ; BE-NEXT: strb r2, [r0, #2]
4844 ; BE-NEXT: orr r1, r1, #1
4945 ; BE-NEXT: strh r1, [r0]
5046 ; BE-NEXT: mov pc, lr
5854 define void @i24_insert_bit(i24* %a, i1 zeroext %bit) {
5955 ; LE-LABEL: i24_insert_bit:
6056 ; LE: @ BB#0:
61 ; LE-NEXT: ldrb r3, [r0, #2]
6257 ; LE-NEXT: ldrh r2, [r0]
63 ; LE-NEXT: strb r3, [r0, #2]
6458 ; LE-NEXT: mov r3, #255
6559 ; LE-NEXT: orr r3, r3, #57088
6660 ; LE-NEXT: and r2, r2, r3
7064 ;
7165 ; BE-LABEL: i24_insert_bit:
7266 ; BE: @ BB#0:
73 ; BE-NEXT: ldrb r3, [r0, #2]
7467 ; BE-NEXT: ldrh r2, [r0]
75 ; BE-NEXT: strb r3, [r0, #2]
7668 ; BE-NEXT: mov r3, #57088
7769 ; BE-NEXT: orr r3, r3, #16711680
7870 ; BE-NEXT: and r2, r3, r2, lsl #8
9284 define void @i56_or(i56* %a) {
9385 ; LE-LABEL: i56_or:
9486 ; LE: @ BB#0:
95 ; LE-NEXT: mov r2, r0
96 ; LE-NEXT: ldr r12, [r0]
97 ; LE-NEXT: ldrh r3, [r2, #4]!
98 ; LE-NEXT: ldrb r1, [r2, #2]
99 ; LE-NEXT: strb r1, [r2, #2]
100 ; LE-NEXT: orr r1, r12, #384
87 ; LE-NEXT: ldr r1, [r0]
88 ; LE-NEXT: orr r1, r1, #384
10189 ; LE-NEXT: str r1, [r0]
102 ; LE-NEXT: strh r3, [r2]
10390 ; LE-NEXT: mov pc, lr
10491 ;
10592 ; BE-LABEL: i56_or:
127114 define void @i56_and_or(i56* %a) {
128115 ; LE-LABEL: i56_and_or:
129116 ; LE: @ BB#0:
130 ; LE-NEXT: mov r2, r0
131117 ; LE-NEXT: ldr r1, [r0]
132 ; LE-NEXT: ldrh r12, [r2, #4]!
133118 ; LE-NEXT: orr r1, r1, #384
134 ; LE-NEXT: ldrb r3, [r2, #2]
135119 ; LE-NEXT: bic r1, r1, #127
136 ; LE-NEXT: strb r3, [r2, #2]
137120 ; LE-NEXT: str r1, [r0]
138 ; LE-NEXT: strh r12, [r2]
139121 ; LE-NEXT: mov pc, lr
140122 ;
141123 ; BE-LABEL: i56_and_or:
142124 ; BE: @ BB#0:
143 ; BE-NEXT: .save {r11, lr}
144 ; BE-NEXT: push {r11, lr}
145 ; BE-NEXT: mov r2, r0
146 ; BE-NEXT: ldr lr, [r0]
125 ; BE-NEXT: mov r1, r0
147126 ; BE-NEXT: mov r3, #128
148 ; BE-NEXT: ldrh r12, [r2, #4]!
149 ; BE-NEXT: strb r3, [r2, #2]
150 ; BE-NEXT: lsl r3, r12, #8
151 ; BE-NEXT: orr r3, r3, lr, lsl #24
152 ; BE-NEXT: orr r3, r3, #384
153 ; BE-NEXT: lsr r1, r3, #8
154 ; BE-NEXT: strh r1, [r2]
155 ; BE-NEXT: bic r1, lr, #255
156 ; BE-NEXT: orr r1, r1, r3, lsr #24
127 ; BE-NEXT: ldrh r2, [r1, #4]!
128 ; BE-NEXT: strb r3, [r1, #2]
129 ; BE-NEXT: lsl r2, r2, #8
130 ; BE-NEXT: ldr r12, [r0]
131 ; BE-NEXT: orr r2, r2, r12, lsl #24
132 ; BE-NEXT: orr r2, r2, #384
133 ; BE-NEXT: lsr r3, r2, #8
134 ; BE-NEXT: strh r3, [r1]
135 ; BE-NEXT: bic r1, r12, #255
136 ; BE-NEXT: orr r1, r1, r2, lsr #24
157137 ; BE-NEXT: str r1, [r0]
158 ; BE-NEXT: pop {r11, lr}
159138 ; BE-NEXT: mov pc, lr
139
160140 %b = load i56, i56* %a, align 1
161141 %c = and i56 %b, -128
162142 %d = or i56 %c, 384
167147 define void @i56_insert_bit(i56* %a, i1 zeroext %bit) {
168148 ; LE-LABEL: i56_insert_bit:
169149 ; LE: @ BB#0:
170 ; LE-NEXT: .save {r11, lr}
171 ; LE-NEXT: push {r11, lr}
172 ; LE-NEXT: mov r3, r0
173 ; LE-NEXT: ldr lr, [r0]
174 ; LE-NEXT: ldrh r12, [r3, #4]!
175 ; LE-NEXT: ldrb r2, [r3, #2]
176 ; LE-NEXT: strb r2, [r3, #2]
177 ; LE-NEXT: bic r2, lr, #8192
150 ; LE-NEXT: ldr r2, [r0]
151 ; LE-NEXT: bic r2, r2, #8192
178152 ; LE-NEXT: orr r1, r2, r1, lsl #13
179153 ; LE-NEXT: str r1, [r0]
180 ; LE-NEXT: strh r12, [r3]
181 ; LE-NEXT: pop {r11, lr}
182154 ; LE-NEXT: mov pc, lr
183155 ;
184156 ; BE-LABEL: i56_insert_bit:
185157 ; BE: @ BB#0:
186158 ; BE-NEXT: .save {r11, lr}
187159 ; BE-NEXT: push {r11, lr}
188 ; BE-NEXT: mov r3, r0
160 ; BE-NEXT: mov r2, r0
161 ; BE-NEXT: ldrh r12, [r2, #4]!
162 ; BE-NEXT: ldrb r3, [r2, #2]
163 ; BE-NEXT: strb r3, [r2, #2]
164 ; BE-NEXT: orr r12, r3, r12, lsl #8
189165 ; BE-NEXT: ldr lr, [r0]
190 ; BE-NEXT: ldrh r12, [r3, #4]!
191 ; BE-NEXT: ldrb r2, [r3, #2]
192 ; BE-NEXT: strb r2, [r3, #2]
193 ; BE-NEXT: orr r2, r2, r12, lsl #8
194 ; BE-NEXT: orr r2, r2, lr, lsl #24
195 ; BE-NEXT: bic r2, r2, #8192
196 ; BE-NEXT: orr r1, r2, r1, lsl #13
197 ; BE-NEXT: lsr r2, r1, #8
198 ; BE-NEXT: strh r2, [r3]
166 ; BE-NEXT: orr r3, r12, lr, lsl #24
167 ; BE-NEXT: bic r3, r3, #8192
168 ; BE-NEXT: orr r1, r3, r1, lsl #13
169 ; BE-NEXT: lsr r3, r1, #8
170 ; BE-NEXT: strh r3, [r2]
199171 ; BE-NEXT: bic r2, lr, #255
200172 ; BE-NEXT: orr r1, r2, r1, lsr #24
201173 ; BE-NEXT: str r1, [r0]
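The common thread in these hunks: the old output reloaded and immediately re-stored the byte the operation never touches (the `ldrb`/`strb` pairs), and improved load-store forwarding now proves that store-back dead. A minimal sketch of the i24 case, mirroring `i24_or` above (function name hypothetical):

```llvm
; i24 is lowered as a halfword plus a byte. Or'ing in 384 only changes
; the low halfword, so the high-byte load/store round-trip can be dropped.
define void @sketch_i24(i24* %a) {
  %b = load i24, i24* %a, align 1
  %c = or i24 %b, 384
  store i24 %c, i24* %a, align 1
  ret void
}
```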
55 ; CHECK: movs [[VAL:r[0-9]+]], #42
66 ; CHECK: movt r[[BASE1]], #15
77
8 ; CHECK: str [[VAL]], [r[[BASE1]]]
9 ; CHECK: str [[VAL]], [r[[BASE1]], #24]
10 ; CHECK: str.w [[VAL]], [r[[BASE1]], #42]
8 ; CHECK-DAG: str [[VAL]], [r[[BASE1]]]
9 ; CHECK-DAG: str [[VAL]], [r[[BASE1]], #24]
10 ; CHECK-DAG: str.w [[VAL]], [r[[BASE1]], #42]
1111
1212 ; CHECK: movw r[[BASE2:[0-9]+]], #20394
1313 ; CHECK: movt r[[BASE2]], #18
1212
1313 ; Function Attrs: nounwind uwtable
1414 define i32 @ebpf_filter(%struct.__sk_buff* nocapture readnone %ebpf_packet) #0 section "socket1" {
15 ; CHECK: r2 = r10
16 ; CHECK: r2 += -2
17 ; CHECK: r1 = 0
18 ; CHECK: *(u16 *)(r2 + 6) = r1
19 ; CHECK: *(u16 *)(r2 + 4) = r1
20 ; CHECK: *(u16 *)(r2 + 2) = r1
21 ; CHECK: r2 = 6
22 ; CHECK: *(u8 *)(r10 - 7) = r2
23 ; CHECK: r2 = 5
24 ; CHECK: *(u8 *)(r10 - 8) = r2
25 ; CHECK: r2 = 7
26 ; CHECK: *(u8 *)(r10 - 6) = r2
27 ; CHECK: r2 = 8
28 ; CHECK: *(u8 *)(r10 - 5) = r2
29 ; CHECK: r2 = 9
30 ; CHECK: *(u8 *)(r10 - 4) = r2
31 ; CHECK: r2 = 10
32 ; CHECK: *(u8 *)(r10 - 3) = r2
33 ; CHECK: *(u16 *)(r10 + 24) = r1
34 ; CHECK: *(u16 *)(r10 + 22) = r1
35 ; CHECK: *(u16 *)(r10 + 20) = r1
36 ; CHECK: *(u16 *)(r10 + 18) = r1
37 ; CHECK: *(u16 *)(r10 + 16) = r1
38 ; CHECK: *(u16 *)(r10 + 14) = r1
39 ; CHECK: *(u16 *)(r10 + 12) = r1
40 ; CHECK: *(u16 *)(r10 + 10) = r1
41 ; CHECK: *(u16 *)(r10 + 8) = r1
42 ; CHECK: *(u16 *)(r10 + 6) = r1
43 ; CHECK: *(u16 *)(r10 - 2) = r1
44 ; CHECK: *(u16 *)(r10 + 26) = r1
45 ; CHECK: r2 = r10
46 ; CHECK: r2 += -8
47 ; CHECK: r1 = ll
48 ; CHECK: call bpf_map_lookup_elem
49 ; CHECK: exit
1550 %key = alloca %struct.routing_key_2, align 1
1651 %1 = getelementptr inbounds %struct.routing_key_2, %struct.routing_key_2* %key, i64 0, i32 0, i64 0
17 ; CHECK: r1 = 5
18 ; CHECK: *(u8 *)(r10 - 8) = r1
1952 store i8 5, i8* %1, align 1
2053 %2 = getelementptr inbounds %struct.routing_key_2, %struct.routing_key_2* %key, i64 0, i32 0, i64 1
21 ; CHECK: r1 = 6
22 ; CHECK: *(u8 *)(r10 - 7) = r1
2354 store i8 6, i8* %2, align 1
2455 %3 = getelementptr inbounds %struct.routing_key_2, %struct.routing_key_2* %key, i64 0, i32 0, i64 2
25 ; CHECK: r1 = 7
26 ; CHECK: *(u8 *)(r10 - 6) = r1
2756 store i8 7, i8* %3, align 1
2857 %4 = getelementptr inbounds %struct.routing_key_2, %struct.routing_key_2* %key, i64 0, i32 0, i64 3
29 ; CHECK: r1 = 8
30 ; CHECK: *(u8 *)(r10 - 5) = r1
3158 store i8 8, i8* %4, align 1
3259 %5 = getelementptr inbounds %struct.routing_key_2, %struct.routing_key_2* %key, i64 0, i32 0, i64 4
33 ; CHECK: r1 = 9
34 ; CHECK: *(u8 *)(r10 - 4) = r1
3560 store i8 9, i8* %5, align 1
3661 %6 = getelementptr inbounds %struct.routing_key_2, %struct.routing_key_2* %key, i64 0, i32 0, i64 5
37 ; CHECK: r1 = 10
38 ; CHECK: *(u8 *)(r10 - 3) = r1
3962 store i8 10, i8* %6, align 1
4063 %7 = getelementptr inbounds %struct.routing_key_2, %struct.routing_key_2* %key, i64 1, i32 0, i64 0
41 ; CHECK: r1 = r10
42 ; CHECK: r1 += -2
43 ; CHECK: r2 = 0
44 ; CHECK: *(u16 *)(r1 + 6) = r2
45 ; CHECK: *(u16 *)(r1 + 4) = r2
46 ; CHECK: *(u16 *)(r1 + 2) = r2
47 ; CHECK: *(u16 *)(r10 + 24) = r2
48 ; CHECK: *(u16 *)(r10 + 22) = r2
49 ; CHECK: *(u16 *)(r10 + 20) = r2
50 ; CHECK: *(u16 *)(r10 + 18) = r2
51 ; CHECK: *(u16 *)(r10 + 16) = r2
52 ; CHECK: *(u16 *)(r10 + 14) = r2
53 ; CHECK: *(u16 *)(r10 + 12) = r2
54 ; CHECK: *(u16 *)(r10 + 10) = r2
55 ; CHECK: *(u16 *)(r10 + 8) = r2
56 ; CHECK: *(u16 *)(r10 + 6) = r2
57 ; CHECK: *(u16 *)(r10 - 2) = r2
58 ; CHECK: *(u16 *)(r10 + 26) = r2
5964 call void @llvm.memset.p0i8.i64(i8* %7, i8 0, i64 30, i32 1, i1 false)
6065 %8 = call i32 (%struct.bpf_map_def*, %struct.routing_key_2*, ...) bitcast (i32 (...)* @bpf_map_lookup_elem to i32 (%struct.bpf_map_def*, %struct.routing_key_2*, ...)*)(%struct.bpf_map_def* nonnull @routing, %struct.routing_key_2* nonnull %key) #3
6166 ret i32 undef
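The `*(u16 *)... = r1` stores checked above come from the 30-byte `llvm.memset` in the body being expanded inline; since that expansion is independent of the byte stores of the key fields, the scheduler may now emit the whole group as one cluster. A sketch of just the expansion input (the declaration matches the 5-argument memset called in the test; the function name is hypothetical):

```llvm
declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)

define void @sketch_memset(i8* %p) {
  ; a 30-byte, align-1 clear; on BPF this expands to a run of u16 stores
  call void @llvm.memset.p0i8.i64(i8* %p, i8 0, i64 30, i32 1, i1 false)
  ret void
}
```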
None ; RUN: llc -march=msp430 -combiner-alias-analysis < %s | FileCheck %s
0 ; RUN: llc -march=msp430 < %s | FileCheck %s
11 target datalayout = "e-p:16:8:8-i8:8:8-i16:8:8-i32:8:8"
22 target triple = "msp430-generic-generic"
33 @foo = common global i16 0, align 2
6262 ; NEW-DAG: sd $5, 16([[R2]])
6363
6464 ; O32 has run out of argument registers and starts using the stack
65 ; O32-DAG: lw [[R3:\$([0-9]+|gp)]], 24($sp)
66 ; O32-DAG: lw [[R4:\$([0-9]+|gp)]], 28($sp)
65 ; O32-DAG: lw [[R3:\$([0-9]+|gp)]], 16($sp)
66 ; O32-DAG: lw [[R4:\$([0-9]+|gp)]], 20($sp)
6767 ; O32-DAG: sw [[R3]], 24([[R2]])
6868 ; O32-DAG: sw [[R4]], 28([[R2]])
6969 ; NEW-DAG: sd $6, 24([[R2]])
7070
71 ; O32-DAG: lw [[R3:\$([0-9]+|gp)]], 32($sp)
72 ; O32-DAG: lw [[R4:\$([0-9]+|gp)]], 36($sp)
71 ; O32-DAG: lw [[R3:\$([0-9]+|gp)]], 24($sp)
72 ; O32-DAG: lw [[R4:\$([0-9]+|gp)]], 28($sp)
7373 ; O32-DAG: sw [[R3]], 32([[R2]])
7474 ; O32-DAG: sw [[R4]], 36([[R2]])
7575 ; NEW-DAG: sd $7, 32([[R2]])
7676
77 ; O32-DAG: lw [[R3:\$([0-9]+|gp)]], 40($sp)
78 ; O32-DAG: lw [[R4:\$([0-9]+|gp)]], 44($sp)
77 ; O32-DAG: lw [[R3:\$([0-9]+|gp)]], 32($sp)
78 ; O32-DAG: lw [[R4:\$([0-9]+|gp)]], 36($sp)
7979 ; O32-DAG: sw [[R3]], 40([[R2]])
8080 ; O32-DAG: sw [[R4]], 44([[R2]])
8181 ; NEW-DAG: sd $8, 40([[R2]])
8282
83 ; O32-DAG: lw [[R3:\$([0-9]+|gp)]], 48($sp)
84 ; O32-DAG: lw [[R4:\$([0-9]+|gp)]], 52($sp)
83 ; O32-DAG: lw [[R3:\$([0-9]+|gp)]], 40($sp)
84 ; O32-DAG: lw [[R4:\$([0-9]+|gp)]], 44($sp)
8585 ; O32-DAG: sw [[R3]], 48([[R2]])
8686 ; O32-DAG: sw [[R4]], 52([[R2]])
8787 ; NEW-DAG: sd $9, 48([[R2]])
8888
89 ; O32-DAG: lw [[R3:\$([0-9]+|gp)]], 56($sp)
90 ; O32-DAG: lw [[R4:\$([0-9]+|gp)]], 60($sp)
89 ; O32-DAG: lw [[R3:\$([0-9]+|gp)]], 48($sp)
90 ; O32-DAG: lw [[R4:\$([0-9]+|gp)]], 52($sp)
9191 ; O32-DAG: sw [[R3]], 56([[R2]])
9292 ; O32-DAG: sw [[R4]], 60([[R2]])
9393 ; NEW-DAG: sd $10, 56([[R2]])
9494
9595 ; N32/N64 have run out of registers and start using the stack too
96 ; O32-DAG: lw [[R3:\$[0-9]+]], 64($sp)
97 ; O32-DAG: lw [[R4:\$[0-9]+]], 68($sp)
96 ; O32-DAG: lw [[R3:\$[0-9]+]], 56($sp)
97 ; O32-DAG: lw [[R4:\$[0-9]+]], 60($sp)
9898 ; O32-DAG: sw [[R3]], 64([[R2]])
9999 ; O32-DAG: sw [[R4]], 68([[R2]])
100100 ; NEW-DAG: ld [[R3:\$[0-9]+]], 0($sp)
279279 ; For N32/N64, it is only 8 since the fixed arguments do not reserve stack
280280 ; space.
281281 ; O32-DAG: addiu [[VA:\$[0-9]+]], [[SP]], 12
282 ; O32-DAG: sw [[VA]], 0([[SP]])
283
284 ; N32-DAG: addiu [[VA:\$[0-9]+]], [[SP]], 8
285 ; N32-DAG: sw [[VA]], 0([[SP]])
286
287 ; N64-DAG: daddiu [[VA:\$[0-9]+]], [[SP]], 8
288 ; N64-DAG: sd [[VA]], 0([[SP]])
289
290 ; Store [[VA]]
291 ; O32-DAG: sw [[VA]], 0([[SP]])
292
293 ; ALL: teqi $zero, 1
294
295 ; Increment [[VA]] (and realign pointer for O32)
296 ; O32: lw [[VA:\$[0-9]+]], 0([[SP]])
297 ; O32-DAG: addiu [[VA_TMP0:\$[0-9]+]], [[VA]], 7
298 ; O32-DAG: addiu [[VA_TMP1:\$[0-9]+]], $zero, -8
299 ; O32-DAG: and [[VA_TMP2:\$[0-9]+]], [[VA_TMP0]], [[VA_TMP1]]
300 ; O32-DAG: ori [[VA2:\$[0-9]+]], [[VA_TMP2]], 4
301 ; O32-DAG: sw [[VA2]], 0([[SP]])
302
303 ; N32-DAG: lw [[VA:\$[0-9]+]], 0([[SP]])
304 ; N32-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 8
305 ; N32-DAG: sw [[VA2]], 0([[SP]])
306
307 ; N64-DAG: ld [[VA:\$[0-9]+]], 0([[SP]])
308 ; N64-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 8
309 ; N64-DAG: sd [[VA2]], 0([[SP]])
310
311 ; Load the first argument from the variable portion and copy it to the global.
312 ; This has used the stack pointer directly rather than the [[VA]] we just set
313 ; up.
314 ; Big-endian mode for N32/N64 must add an additional 4 to the offset due to byte
315 ; order.
316 ; O32-DAG: addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(dwords)
317 ; O32-DAG: lw [[ARG1:\$[0-9]+]], 0([[VA_TMP2]])
318 ; O32-DAG: sw [[ARG1]], 8([[GV]])
319 ; O32-DAG: addiu [[VA3:\$[0-9]+]], [[VA2]], 4
320 ; O32-DAG: sw [[VA3]], 0([[SP]])
321 ; O32-DAG: lw [[ARG1:\$[0-9]+]], 4([[VA_TMP2]])
322 ; O32-DAG: sw [[ARG1]], 12([[GV]])
323
324 ; N32-DAG: addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(dwords)
325 ; N64-DAG: daddiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(dwords)
326 ; NEW-DAG: ld [[ARG1:\$[0-9]+]], 0([[VA]])
327 ; NEW-DAG: sd [[ARG1]], 8([[GV]])
328
329 ; ALL: teqi $zero, 2
330
331 ; Increment [[VA]] again.
332 ; FIXME: We're still aligned from the last one but CodeGen doesn't spot that.
333 ; O32: lw [[VA:\$[0-9]+]], 0([[SP]])
334 ; O32-DAG: addiu [[VA_TMP0:\$[0-9]+]], [[VA]], 7
335 ; O32-DAG: and [[VA_TMP2:\$[0-9]+]], [[VA_TMP0]], [[VA_TMP1]]
336 ; O32-DAG: ori [[VA2:\$[0-9]+]], [[VA_TMP2]], 4
337 ; O32-DAG: sw [[VA2]], 0([[SP]])
338
339 ; N32-DAG: lw [[VA2:\$[0-9]+]], 0([[SP]])
340 ; N32-DAG: addiu [[VA3:\$[0-9]+]], [[VA2]], 8
341 ; N32-DAG: sw [[VA3]], 0([[SP]])
342
343 ; N64-DAG: ld [[VA2:\$[0-9]+]], 0([[SP]])
344 ; N64-DAG: daddiu [[VA3:\$[0-9]+]], [[VA2]], 8
345 ; N64-DAG: sd [[VA3]], 0([[SP]])
346
347 ; Load the second argument from the variable portion and copy it to the global.
348 ; O32-DAG: lw [[ARG2:\$[0-9]+]], 0([[VA]])
349 ; O32-DAG: sw [[ARG2]], 16([[GV]])
350 ; O32-DAG: addiu [[VA3:\$[0-9]+]], [[VA2]], 4
351 ; O32-DAG: sw [[VA3]], 0([[SP]])
352 ; O32-DAG: lw [[ARG2:\$[0-9]+]], 4([[VA_TMP2]])
353 ; O32-DAG: sw [[ARG2]], 20([[GV]])
354
355 ; NEW-DAG: ld [[ARG2:\$[0-9]+]], 0([[VA2]])
356 ; NEW-DAG: sd [[ARG2]], 16([[GV]])
357
358 %ap = alloca i8*, align 8
359 %ap2 = bitcast i8** %ap to i8*
360 call void @llvm.va_start(i8* %ap2)
361
362 call void asm sideeffect "teqi $$zero, 1", ""()
363 %arg1 = va_arg i8** %ap, i64
364 %e1 = getelementptr [3 x i64], [3 x i64]* @dwords, i32 0, i32 1
365 store volatile i64 %arg1, i64* %e1, align 8
366
367 call void asm sideeffect "teqi $$zero, 2", ""()
368 %arg2 = va_arg i8** %ap, i64
369 %e2 = getelementptr [3 x i64], [3 x i64]* @dwords, i32 0, i32 2
370 store volatile i64 %arg2, i64* %e2, align 8
371
372 call void @llvm.va_end(i8* %ap2)
373
374 ret void
375 }
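The `addiu …, 7` / `and …, -8` pair matched by the O32 realignment checks above is the usual round-up-to-8 idiom, i.e. `(p + 7) & -8`. A minimal IR sketch of the same computation (hypothetical function; assumes 32-bit pointers, as on O32):

```llvm
; round a va_list pointer up to the next 8-byte boundary
define i8* @sketch_align8(i8* %p) {
  %i = ptrtoint i8* %p to i32
  %add = add i32 %i, 7          ; p + 7
  %aligned = and i32 %add, -8   ; clear the low three bits
  %q = inttoptr i32 %aligned to i8*
  ret i8* %q
}
```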
376
377 define void @fn_i32_dotdotdot_i16(i32 %a, ...) {
378 entry:
379 ; ALL-LABEL: fn_i32_dotdotdot_i16:
380
381 ; Set up the stack with an 8-byte local area. N32/N64 must also make room for
382 ; the argument save area (56 bytes).
383 ; O32: addiu [[SP:\$sp]], $sp, -8
384 ; N32: addiu [[SP:\$sp]], $sp, -64
385 ; N64: daddiu [[SP:\$sp]], $sp, -64
386
387 ; Save variable argument portion on the stack
388 ; O32-DAG: sw $7, 20([[SP]])
389 ; O32-DAG: sw $6, 16([[SP]])
390 ; O32-DAG: sw $5, 12([[SP]])
391
392 ; NEW-DAG: sd $11, 56([[SP]])
393 ; NEW-DAG: sd $10, 48([[SP]])
394 ; NEW-DAG: sd $9, 40([[SP]])
395 ; NEW-DAG: sd $8, 32([[SP]])
396 ; NEW-DAG: sd $7, 24([[SP]])
397 ; NEW-DAG: sd $6, 16([[SP]])
398 ; NEW-DAG: sd $5, 8([[SP]])
399
400 ; Initialize variable argument pointer.
401 ; For O32, the offset is 12 due to the 4 bytes used to store local variables,
402 ; 4 bytes padding to maintain stack alignment, and the 4 byte slot for the first
403 ; fixed argument.
404 ; For N32/N64, it is only 8 since the fixed arguments do not reserve stack
405 ; space.
406 ; O32-DAG: addiu [[VA:\$[0-9]+]], [[SP]], 12
407 ; O32-DAG: sw [[VA]], 0([[SP]])
408
409 ; N32-DAG: addiu [[VA:\$[0-9]+]], [[SP]], 8
410 ; N32-DAG: sw [[VA]], 0([[SP]])
411
412 ; N64-DAG: daddiu [[VA:\$[0-9]+]], [[SP]], 8
413 ; N64-DAG: sd [[VA]], 0([[SP]])
414
415 ; Store [[VA]]
416 ; O32-DAG: sw [[VA]], 0([[SP]])
417
418 ; ALL: teqi $zero, 1
419
420 ; Increment [[VA]]
421 ; O32-DAG: lw [[VA:\$[0-9]+]], 0([[SP]])
422 ; O32-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 4
423 ; O32-DAG: sw [[VA2]], 0([[SP]])
424
425 ; N32-DAG: lw [[VA:\$[0-9]+]], 0([[SP]])
426 ; N32-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 8
427 ; N32-DAG: sw [[VA2]], 0([[SP]])
428
429 ; N64-DAG: ld [[VA:\$[0-9]+]], 0([[SP]])
430 ; N64-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 8
431 ; N64-DAG: sd [[VA2]], 0([[SP]])
432
433 ; Load the first argument from the variable portion.
434 ; This has used the stack pointer directly rather than the [[VA]] we just set
435 ; up.
436 ; Big-endian mode for N32/N64 must add an additional 4 to the offset due to byte
437 ; order.
438 ; O32-DAG: lw [[ARG1:\$[0-9]+]], 0([[VA]])
439
440 ; NEW-LE-DAG: lw [[ARG1:\$[0-9]+]], 0([[VA]])
441 ; NEW-BE-DAG: lw [[ARG1:\$[0-9]+]], 4([[VA]])
442
443 ; Copy the arg to the global
444 ; O32-DAG: addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(hwords)
445
446 ; N32-DAG: addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(hwords)
447
448 ; N64-DAG: daddiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(hwords)
449
450 ; ALL-DAG: sh [[ARG1]], 2([[GV]])
451
452 ; ALL: teqi $zero, 2
453
454 ; Increment [[VA]] again.
455 ; O32-DAG: lw [[VA:\$[0-9]+]], 0([[SP]])
456 ; O32-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 4
457 ; O32-DAG: sw [[VA2]], 0([[SP]])
458
459 ; N32-DAG: lw [[VA2:\$[0-9]+]], 0([[SP]])
460 ; N32-DAG: addiu [[VA3:\$[0-9]+]], [[VA2]], 8
461 ; N32-DAG: sw [[VA3]], 0([[SP]])
462
463 ; N64-DAG: ld [[VA2:\$[0-9]+]], 0([[SP]])
464 ; N64-DAG: daddiu [[VA3:\$[0-9]+]], [[VA2]], 8
465 ; N64-DAG: sd [[VA3]], 0([[SP]])
466
467 ; Load the second argument from the variable portion.
468 ; O32-DAG: lw [[ARG2:\$[0-9]+]], 0([[VA]])
469
470 ; NEW-LE-DAG: lw [[ARG2:\$[0-9]+]], 0([[VA2]])
471 ; NEW-BE-DAG: lw [[ARG2:\$[0-9]+]], 4([[VA2]])
472
473 ; Copy the arg to the global
474 ; ALL-DAG: sh [[ARG2]], 4([[GV]])
475
476 %ap = alloca i8*, align 8
477 %ap2 = bitcast i8** %ap to i8*
478 call void @llvm.va_start(i8* %ap2)
479
480 call void asm sideeffect "teqi $$zero, 1", ""()
481 %arg1 = va_arg i8** %ap, i16
482 %e1 = getelementptr [3 x i16], [3 x i16]* @hwords, i32 0, i32 1
483 store volatile i16 %arg1, i16* %e1, align 2
484
485 call void asm sideeffect "teqi $$zero, 2", ""()
486 %arg2 = va_arg i8** %ap, i16
487 %e2 = getelementptr [3 x i16], [3 x i16]* @hwords, i32 0, i32 2
488 store volatile i16 %arg2, i16* %e2, align 2
489
490 call void @llvm.va_end(i8* %ap2)
491
492 ret void
493 }
494
495 define void @fn_i32_dotdotdot_i32(i32 %a, ...) {
496 entry:
497 ; ALL-LABEL: fn_i32_dotdotdot_i32:
498
499 ; Set up the stack with an 8-byte local area. N32/N64 must also make room for
500 ; the argument save area (56 bytes).
501 ; O32: addiu [[SP:\$sp]], $sp, -8
502 ; N32: addiu [[SP:\$sp]], $sp, -64
503 ; N64: daddiu [[SP:\$sp]], $sp, -64
504
505 ; Save variable argument portion on the stack
506 ; O32-DAG: sw $7, 20([[SP]])
507 ; O32-DAG: sw $6, 16([[SP]])
508 ; O32-DAG: sw $5, 12([[SP]])
509
510 ; NEW-DAG: sd $11, 56([[SP]])
511 ; NEW-DAG: sd $10, 48([[SP]])
512 ; NEW-DAG: sd $9, 40([[SP]])
513 ; NEW-DAG: sd $8, 32([[SP]])
514 ; NEW-DAG: sd $7, 24([[SP]])
515 ; NEW-DAG: sd $6, 16([[SP]])
516 ; NEW-DAG: sd $5, 8([[SP]])
517
518 ; Initialize variable argument pointer.
519 ; For O32, the offset is 12 due to the 4 bytes used to store local variables,
520 ; 4 bytes padding to maintain stack alignment, and the 4 byte slot for the first
521 ; fixed argument.
522 ; For N32/N64, it is only 8 since the fixed arguments do not reserve stack
523 ; space.
524 ; O32-DAG: addiu [[VA:\$[0-9]+]], [[SP]], 12
525 ; O32-DAG: sw [[VA]], 0([[SP]])
526
527 ; N32-DAG: addiu [[VA:\$[0-9]+]], [[SP]], 8
528 ; N32-DAG: sw [[VA]], 0([[SP]])
529
530 ; N64-DAG: daddiu [[VA:\$[0-9]+]], [[SP]], 8
531 ; N64-DAG: sd [[VA]], 0([[SP]])
532
533 ; Store [[VA]]
534 ; O32-DAG: sw [[VA]], 0([[SP]])
535
536 ; ALL: teqi $zero, 1
537
538 ; Increment [[VA]]
539 ; O32-DAG: lw [[VA:\$[0-9]+]], 0([[SP]])
540 ; O32-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 4
541 ; O32-DAG: sw [[VA2]], 0([[SP]])
542
543 ; N32-DAG: lw [[VA:\$[0-9]+]], 0([[SP]])
544 ; N32-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 8
545 ; N32-DAG: sw [[VA2]], 0([[SP]])
546
547 ; N64-DAG: ld [[VA:\$[0-9]+]], 0([[SP]])
548 ; N64-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 8
549 ; N64-DAG: sd [[VA2]], 0([[SP]])
550
551 ; Load the first argument from the variable portion.
552 ; This has used the stack pointer directly rather than the [[VA]] we just set
553 ; up.
554 ; Big-endian mode for N32/N64 must add an additional 4 to the offset due to byte
555 ; order.
556 ; O32-DAG: lw [[ARG1:\$[0-9]+]], 0([[VA]])
557
558 ; NEW-LE-DAG: lw [[ARG1:\$[0-9]+]], 0([[VA]])
559 ; NEW-BE-DAG: lw [[ARG1:\$[0-9]+]], 4([[VA]])
560
561 ; Copy the arg to the global
562 ; O32-DAG: addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(words)
563
564 ; N32-DAG: addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(words)
565
566 ; N64-DAG: daddiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(words)
567
568 ; ALL-DAG: sw [[ARG1]], 4([[GV]])
569
570 ; ALL: teqi $zero, 2
571
572 ; Increment [[VA]] again.
573 ; O32-DAG: lw [[VA:\$[0-9]+]], 0([[SP]])
574 ; O32-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 4
575 ; O32-DAG: sw [[VA2]], 0([[SP]])
576
577 ; N32-DAG: lw [[VA2:\$[0-9]+]], 0([[SP]])
578 ; N32-DAG: addiu [[VA3:\$[0-9]+]], [[VA2]], 8
579 ; N32-DAG: sw [[VA3]], 0([[SP]])
580
581 ; N64-DAG: ld [[VA2:\$[0-9]+]], 0([[SP]])
582 ; N64-DAG: daddiu [[VA3:\$[0-9]+]], [[VA2]], 8
583 ; N64-DAG: sd [[VA3]], 0([[SP]])
584
585 ; Load the second argument from the variable portion.
586 ; O32-DAG: lw [[ARG2:\$[0-9]+]], 0([[VA]])
587
588 ; NEW-LE-DAG: lw [[ARG2:\$[0-9]+]], 0([[VA2]])
589 ; NEW-BE-DAG: lw [[ARG2:\$[0-9]+]], 4([[VA2]])
590
591 ; Copy the arg to the global
592 ; ALL-DAG: sw [[ARG2]], 8([[GV]])
593
594 %ap = alloca i8*, align 8
595 %ap2 = bitcast i8** %ap to i8*
596 call void @llvm.va_start(i8* %ap2)
597
598 call void asm sideeffect "teqi $$zero, 1", ""()
599 %arg1 = va_arg i8** %ap, i32
600 %e1 = getelementptr [3 x i32], [3 x i32]* @words, i32 0, i32 1
601 store volatile i32 %arg1, i32* %e1, align 4
602
603 call void asm sideeffect "teqi $$zero, 2", ""()
604 %arg2 = va_arg i8** %ap, i32
605 %e2 = getelementptr [3 x i32], [3 x i32]* @words, i32 0, i32 2
606 store volatile i32 %arg2, i32* %e2, align 4
607
608 call void @llvm.va_end(i8* %ap2)
609
610 ret void
611 }
612
613 define void @fn_i32_dotdotdot_i64(i32 %a, ...) {
614 entry:
615 ; ALL-LABEL: fn_i32_dotdotdot_i64:
616
617 ; Set up the stack with an 8-byte local area. N32/N64 must also make room for
618 ; the argument save area (56 bytes).
619 ; O32: addiu [[SP:\$sp]], $sp, -8
620 ; N32: addiu [[SP:\$sp]], $sp, -64
621 ; N64: daddiu [[SP:\$sp]], $sp, -64
622
623 ; Save variable argument portion on the stack
624 ; O32-DAG: sw $7, 20([[SP]])
625 ; O32-DAG: sw $6, 16([[SP]])
626 ; O32-DAG: sw $5, 12([[SP]])
627
628 ; NEW-DAG: sd $11, 56([[SP]])
629 ; NEW-DAG: sd $10, 48([[SP]])
630 ; NEW-DAG: sd $9, 40([[SP]])
631 ; NEW-DAG: sd $8, 32([[SP]])
632 ; NEW-DAG: sd $7, 24([[SP]])
633 ; NEW-DAG: sd $6, 16([[SP]])
634 ; NEW-DAG: sd $5, 8([[SP]])
635
636 ; Initialize variable argument pointer.
637 ; For O32, the offset is 12 due to the 4 bytes used to store local variables,
638 ; 4 bytes padding to maintain stack alignment, and the 4 byte slot for the first
639 ; fixed argument.
640 ; For N32/N64, it is only 8 since the fixed arguments do not reserve stack
641 ; space.
642 ; O32-DAG: addiu [[VA:\$[0-9]+]], [[SP]], 12
643 ; O32-DAG: sw [[VA]], 0([[SP]])
644
645 ; N32-DAG: addiu [[VA:\$[0-9]+]], [[SP]], 8
646 ; N32-DAG: sw [[VA]], 0([[SP]])
647
648 ; N64-DAG: daddiu [[VA:\$[0-9]+]], [[SP]], 8
649 ; N64-DAG: sd [[VA]], 0([[SP]])
650
651 ; Store [[VA]]
652 ; O32-DAG: sw [[VA]], 0([[SP]])
653
654 ; ALL: teqi $zero, 1
655
656 ; Increment [[VA]] (and realign pointer for O32)
657 ; O32: lw [[VA:\$[0-9]+]], 0([[SP]])
658 ; O32-DAG: addiu [[VA_TMP0:\$[0-9]+]], [[VA]], 7
659 ; O32-DAG: addiu [[VA_TMP1:\$[0-9]+]], $zero, -8
660 ; O32-DAG: and [[VA_TMP2:\$[0-9]+]], [[VA_TMP0]], [[VA_TMP1]]
661 ; O32-DAG: ori [[VA2:\$[0-9]+]], [[VA_TMP2]], 4
662 ; O32-DAG: sw [[VA2]], 0([[SP]])
663
664 ; N32-DAG: lw [[VA:\$[0-9]+]], 0([[SP]])
665 ; N32-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 8
666 ; N32-DAG: sw [[VA2]], 0([[SP]])
667
668 ; N64-DAG: ld [[VA:\$[0-9]+]], 0([[SP]])
669 ; N64-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 8
670 ; N64-DAG: sd [[VA2]], 0([[SP]])
671
672 ; Load the first argument from the variable portion and copy it to the global.
673 ; This has used the stack pointer directly rather than the [[VA]] we just set
674 ; up.
675 ; Big-endian mode for N32/N64 must add an additional 4 to the offset due to byte
676 ; order.
677 ; O32-DAG: addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(dwords)
678 ; O32-DAG: lw [[ARG1:\$[0-9]+]], 0([[VA_TMP2]])
679 ; O32-DAG: sw [[ARG1]], 8([[GV]])
680 ; O32-DAG: addiu [[VA3:\$[0-9]+]], [[VA2]], 4
681 ; O32-DAG: sw [[VA3]], 0([[SP]])
682 ; O32-DAG: lw [[ARG1:\$[0-9]+]], 4([[VA_TMP2]])
683 ; O32-DAG: sw [[ARG1]], 12([[GV]])
684
685 ; N32-DAG: addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(dwords)
686 ; N64-DAG: daddiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(dwords)
687 ; NEW-DAG: ld [[ARG1:\$[0-9]+]], 0([[VA]])
688 ; NEW-DAG: sd [[ARG1]], 8([[GV]])
689
690 ; ALL: teqi $zero, 2
691
692 ; Increment [[VA]] again.
693 ; FIXME: We're still aligned from the last one but CodeGen doesn't spot that.
694 ; O32: lw [[VA:\$[0-9]+]], 0([[SP]])
695 ; O32-DAG: addiu [[VA_TMP0:\$[0-9]+]], [[VA]], 7
696 ; O32-DAG: and [[VA_TMP2:\$[0-9]+]], [[VA_TMP0]], [[VA_TMP1]]
697 ; O32-DAG: ori [[VA2:\$[0-9]+]], [[VA_TMP2]], 4
698 ; O32-DAG: sw [[VA2]], 0([[SP]])
699
700 ; N32-DAG: lw [[VA2:\$[0-9]+]], 0([[SP]])
701 ; N32-DAG: addiu [[VA3:\$[0-9]+]], [[VA2]], 8
702 ; N32-DAG: sw [[VA3]], 0([[SP]])
703
704 ; N64-DAG: ld [[VA2:\$[0-9]+]], 0([[SP]])
705 ; N64-DAG: daddiu [[VA3:\$[0-9]+]], [[VA2]], 8
706 ; N64-DAG: sd [[VA3]], 0([[SP]])
707
708 ; Load the second argument from the variable portion and copy it to the global.
709 ; O32-DAG: lw [[ARG2:\$[0-9]+]], 0([[VA]])
710 ; O32-DAG: sw [[ARG2]], 16([[GV]])
711 ; O32-DAG: addiu [[VA3:\$[0-9]+]], [[VA2]], 4
712 ; O32-DAG: sw [[VA3]], 0([[SP]])
713 ; O32-DAG: lw [[ARG2:\$[0-9]+]], 4([[VA_TMP2]])
714 ; O32-DAG: sw [[ARG2]], 20([[GV]])
715
716 ; NEW-DAG: ld [[ARG2:\$[0-9]+]], 0([[VA2]])
717 ; NEW-DAG: sd [[ARG2]], 16([[GV]])
718
719 %ap = alloca i8*, align 8
720 %ap2 = bitcast i8** %ap to i8*
721 call void @llvm.va_start(i8* %ap2)
722
723 call void asm sideeffect "teqi $$zero, 1", ""()
724 %arg1 = va_arg i8** %ap, i64
725 %e1 = getelementptr [3 x i64], [3 x i64]* @dwords, i32 0, i32 1
726 store volatile i64 %arg1, i64* %e1, align 8
727
728 call void asm sideeffect "teqi $$zero, 2", ""()
729 %arg2 = va_arg i8** %ap, i64
730 %e2 = getelementptr [3 x i64], [3 x i64]* @dwords, i32 0, i32 2
731 store volatile i64 %arg2, i64* %e2, align 8
732
733 call void @llvm.va_end(i8* %ap2)
734
735 ret void
736 }
737
738 define void @fn_i64_dotdotdot_i16(i64 %a, ...) {
739 entry:
740 ; ALL-LABEL: fn_i64_dotdotdot_i16:
741
742 ; Set up the stack with an 8-byte local area. N32/N64 must also make room for
743 ; the argument save area (56 bytes).
744 ; O32: addiu [[SP:\$sp]], $sp, -8
745 ; N32: addiu [[SP:\$sp]], $sp, -64
746 ; N64: daddiu [[SP:\$sp]], $sp, -64
747
748 ; Save variable argument portion on the stack
749 ; O32-DAG: sw $7, 20([[SP]])
750 ; O32-DAG: sw $6, 16([[SP]])
751
752 ; NEW-DAG: sd $11, 56([[SP]])
753 ; NEW-DAG: sd $10, 48([[SP]])
754 ; NEW-DAG: sd $9, 40([[SP]])
755 ; NEW-DAG: sd $8, 32([[SP]])
756 ; NEW-DAG: sd $7, 24([[SP]])
757 ; NEW-DAG: sd $6, 16([[SP]])
758 ; NEW-DAG: sd $5, 8([[SP]])
759
760 ; Initialize variable argument pointer.
761 ; For O32, the offset is 16 due to the 4 bytes used to store local variables,
762 ; 4 bytes padding to maintain stack alignment, and the two 4 byte slots for the
763 ; first fixed argument.
764 ; For N32/N64, it is only 8 since the fixed arguments do not reserve stack
765 ; space.
766 ; O32-DAG: addiu [[VA:\$[0-9]+]], [[SP]], 16
767 ; O32-DAG: sw [[VA]], 0([[SP]])
768
769 ; N32-DAG: addiu [[VA:\$[0-9]+]], [[SP]], 8
770 ; N32-DAG: sw [[VA]], 0([[SP]])
771
772 ; N64-DAG: daddiu [[VA:\$[0-9]+]], [[SP]], 8
773 ; N64-DAG: sd [[VA]], 0([[SP]])
774
775 ; Store [[VA]]
776 ; O32-DAG: sw [[VA]], 0([[SP]])
777
778 ; ALL: teqi $zero, 1
779
780 ; Increment [[VA]]
781 ; O32-DAG: lw [[VA:\$[0-9]+]], 0([[SP]])
782 ; O32-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 4
783 ; O32-DAG: sw [[VA2]], 0([[SP]])
784
785 ; N32-DAG: lw [[VA:\$[0-9]+]], 0([[SP]])
786 ; N32-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 8
787 ; N32-DAG: sw [[VA2]], 0([[SP]])
788
789 ; N64-DAG: ld [[VA:\$[0-9]+]], 0([[SP]])
790 ; N64-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 8
791 ; N64-DAG: sd [[VA2]], 0([[SP]])
792
793 ; Load the first argument from the variable portion.
794 ; This has used the stack pointer directly rather than the [[VA]] we just set
795 ; up.
796 ; Big-endian mode for N32/N64 must add an additional 4 to the offset due to byte
797 ; order.
798 ; O32-DAG: lw [[ARG1:\$[0-9]+]], 0([[VA]])
799
800 ; NEW-LE-DAG: lw [[ARG1:\$[0-9]+]], 0([[VA]])
801 ; NEW-BE-DAG: lw [[ARG1:\$[0-9]+]], 4([[VA]])
802
803 ; Copy the arg to the global
804 ; O32-DAG: addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(hwords)
805
806 ; N32-DAG: addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(hwords)
807
808 ; N64-DAG: daddiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(hwords)
809
810 ; ALL-DAG: sh [[ARG1]], 2([[GV]])
811
812 ; ALL: teqi $zero, 2
813
814 ; Increment [[VA]] again.
815 ; O32-DAG: lw [[VA:\$[0-9]+]], 0([[SP]])
816 ; O32-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 4
817 ; O32-DAG: sw [[VA2]], 0([[SP]])
818
819 ; N32-DAG: lw [[VA2:\$[0-9]+]], 0([[SP]])
820 ; N32-DAG: addiu [[VA3:\$[0-9]+]], [[VA2]], 8
821 ; N32-DAG: sw [[VA3]], 0([[SP]])
822
823 ; N64-DAG: ld [[VA2:\$[0-9]+]], 0([[SP]])
824 ; N64-DAG: daddiu [[VA3:\$[0-9]+]], [[VA2]], 8
825 ; N64-DAG: sd [[VA3]], 0([[SP]])
826
827 ; Load the second argument from the variable portion.
828 ; O32-DAG: lw [[ARG2:\$[0-9]+]], 0([[VA]])
829
830 ; NEW-LE-DAG: lw [[ARG2:\$[0-9]+]], 0([[VA2]])
831 ; NEW-BE-DAG: lw [[ARG2:\$[0-9]+]], 4([[VA2]])
832
833 ; Copy the arg to the global
834 ; ALL-DAG: sh [[ARG2]], 4([[GV]])
835
836 %ap = alloca i8*, align 8
837 %ap2 = bitcast i8** %ap to i8*
838 call void @llvm.va_start(i8* %ap2)
839
840 call void asm sideeffect "teqi $$zero, 1", ""()
841 %arg1 = va_arg i8** %ap, i16
842 %e1 = getelementptr [3 x i16], [3 x i16]* @hwords, i32 0, i32 1
843 store volatile i16 %arg1, i16* %e1, align 2
844
845 call void asm sideeffect "teqi $$zero, 2", ""()
846 %arg2 = va_arg i8** %ap, i16
847 %e2 = getelementptr [3 x i16], [3 x i16]* @hwords, i32 0, i32 2
848 store volatile i16 %arg2, i16* %e2, align 2
849
850 call void @llvm.va_end(i8* %ap2)
851
852 ret void
853 }
854
855 define void @fn_i64_dotdotdot_i32(i64 %a, ...) {
856 entry:
857 ; ALL-LABEL: fn_i64_dotdotdot_i32:
858
859 ; Set up the stack with an 8-byte local area. N32/N64 must also make room for
860 ; the argument save area (56 bytes).
861 ; O32: addiu [[SP:\$sp]], $sp, -8
862 ; N32: addiu [[SP:\$sp]], $sp, -64
863 ; N64: daddiu [[SP:\$sp]], $sp, -64
864
865 ; Save variable argument portion on the stack
866 ; O32-DAG: sw $7, 20([[SP]])
867 ; O32-DAG: sw $6, 16([[SP]])
868
869 ; NEW-DAG: sd $11, 56([[SP]])
870 ; NEW-DAG: sd $10, 48([[SP]])
871 ; NEW-DAG: sd $9, 40([[SP]])
872 ; NEW-DAG: sd $8, 32([[SP]])
873 ; NEW-DAG: sd $7, 24([[SP]])
874 ; NEW-DAG: sd $6, 16([[SP]])
875 ; NEW-DAG: sd $5, 8([[SP]])
876
877 ; Initialize variable argument pointer.
878 ; For O32, the offset is 16 due to the 4 bytes used to store local variables,
879 ; 4 bytes padding to maintain stack alignment, and the two 4 byte slots for the
880 ; first fixed argument.
881 ; For N32/N64, it is only 8 since the fixed arguments do not reserve stack
882 ; space.
883 ; O32-DAG: addiu [[VA:\$[0-9]+]], [[SP]], 16
884 ; O32-DAG: sw [[VA]], 0([[SP]])
885
886 ; N32-DAG: addiu [[VA:\$[0-9]+]], [[SP]], 8
887 ; N32-DAG: sw [[VA]], 0([[SP]])
888
889 ; N64-DAG: daddiu [[VA:\$[0-9]+]], [[SP]], 8
890 ; N64-DAG: sd [[VA]], 0([[SP]])
891
892 ; Store [[VA]]
893 ; O32-DAG: sw [[VA]], 0([[SP]])
894
895 ; ALL: teqi $zero, 1
896
897 ; Increment [[VA]]
898 ; O32-DAG: lw [[VA:\$[0-9]+]], 0([[SP]])
899 ; O32-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 4
900 ; O32-DAG: sw [[VA2]], 0([[SP]])
901
902 ; N32-DAG: lw [[VA:\$[0-9]+]], 0([[SP]])
903 ; N32-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 8
904 ; N32-DAG: sw [[VA2]], 0([[SP]])
905
906 ; N64-DAG: ld [[VA:\$[0-9]+]], 0([[SP]])
907 ; N64-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 8
908 ; N64-DAG: sd [[VA2]], 0([[SP]])
909
910 ; Load the first argument from the variable portion.
911 ; This has used the stack pointer directly rather than the [[VA]] we just set
912 ; up.
913 ; Big-endian mode for N32/N64 must add an additional 4 to the offset due to byte
914 ; order.
915 ; O32-DAG: lw [[ARG1:\$[0-9]+]], 0([[VA]])
916
917 ; NEW-LE-DAG: lw [[ARG1:\$[0-9]+]], 0([[VA]])
918 ; NEW-BE-DAG: lw [[ARG1:\$[0-9]+]], 4([[VA]])
919
920 ; Copy the arg to the global
921 ; O32-DAG: addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(words)
922
923 ; N32-DAG: addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(words)
924
925 ; N64-DAG: daddiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(words)
926
927 ; ALL-DAG: sw [[ARG1]], 4([[GV]])
928
929 ; ALL: teqi $zero, 2
930
931 ; Increment [[VA]] again.
932 ; O32-DAG: lw [[VA:\$[0-9]+]], 0([[SP]])
933 ; O32-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 4
934 ; O32-DAG: sw [[VA2]], 0([[SP]])
935
936 ; N32-DAG: lw [[VA2:\$[0-9]+]], 0([[SP]])
937 ; N32-DAG: addiu [[VA3:\$[0-9]+]], [[VA2]], 8
938 ; N32-DAG: sw [[VA3]], 0([[SP]])
939
940 ; N64-DAG: ld [[VA2:\$[0-9]+]], 0([[SP]])
941 ; N64-DAG: daddiu [[VA3:\$[0-9]+]], [[VA2]], 8
942 ; N64-DAG: sd [[VA3]], 0([[SP]])
943
944 ; Load the second argument from the variable portion.
945 ; O32-DAG: lw [[ARG2:\$[0-9]+]], 0([[VA]])
946
947 ; NEW-LE-DAG: lw [[ARG2:\$[0-9]+]], 0([[VA2]])
948 ; NEW-BE-DAG: lw [[ARG2:\$[0-9]+]], 4([[VA2]])
949
950 ; Copy the arg to the global
951 ; ALL-DAG: sw [[ARG2]], 8([[GV]])
952
953 %ap = alloca i8*, align 8
954 %ap2 = bitcast i8** %ap to i8*
955 call void @llvm.va_start(i8* %ap2)
956
957 call void asm sideeffect "teqi $$zero, 1", ""()
958 %arg1 = va_arg i8** %ap, i32
959 %e1 = getelementptr [3 x i32], [3 x i32]* @words, i32 0, i32 1
960 store volatile i32 %arg1, i32* %e1, align 4
961
962 call void asm sideeffect "teqi $$zero, 2", ""()
963 %arg2 = va_arg i8** %ap, i32
964 %e2 = getelementptr [3 x i32], [3 x i32]* @words, i32 0, i32 2
965 store volatile i32 %arg2, i32* %e2, align 4
966
967 call void @llvm.va_end(i8* %ap2)
968
969 ret void
970 }
971
972 define void @fn_i64_dotdotdot_i64(i64 %a, ...) {
973 entry:
974 ; ALL-LABEL: fn_i64_dotdotdot_i64:
975
976 ; Set up the stack with an 8-byte local area. N32/N64 must also make room for
977 ; the argument save area (56 bytes).
978 ; O32: addiu [[SP:\$sp]], $sp, -8
979 ; N32: addiu [[SP:\$sp]], $sp, -64
980 ; N64: daddiu [[SP:\$sp]], $sp, -64
981
982 ; Save variable argument portion on the stack
983 ; O32-DAG: sw $7, 20([[SP]])
984 ; O32-DAG: sw $6, 16([[SP]])
985
986 ; NEW-DAG: sd $11, 56([[SP]])
987 ; NEW-DAG: sd $10, 48([[SP]])
988 ; NEW-DAG: sd $9, 40([[SP]])
989 ; NEW-DAG: sd $8, 32([[SP]])
990 ; NEW-DAG: sd $7, 24([[SP]])
991 ; NEW-DAG: sd $6, 16([[SP]])
992 ; NEW-DAG: sd $5, 8([[SP]])
993
994 ; Initialize variable argument pointer.
995 ; For O32, the offset is 16 due to the 4 bytes used to store local variables,
996 ; 4 bytes padding to maintain stack alignment, and the two 4 byte slots for the
997 ; first fixed argument.
998 ; For N32/N64, it is only 8 since the fixed arguments do not reserve stack
999 ; space.
1000 ; O32-DAG: addiu [[VA:\$[0-9]+]], [[SP]], 16
2821001 ; O32-DAG: sw [[VA]], 0([[SP]])
2831002
2841003 ; N32-DAG: addiu [[VA:\$[0-9]+]], [[SP]], 8
3161035 ; O32-DAG: addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(dwords)
3171036 ; O32-DAG: lw [[ARG1:\$[0-9]+]], 0([[VA]])
3181037 ; O32-DAG: sw [[ARG1]], 8([[GV]])
319 ; O32-DAG: lw [[VA:\$[0-9]+]], 0([[SP]])
320 ; O32-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 4
321 ; O32-DAG: sw [[VA2]], 0([[SP]])
322 ; O32-DAG: lw [[ARG1:\$[0-9]+]], 0([[VA]])
1038 ; O32-DAG: addiu [[VA3:\$[0-9]+]], [[VA2]], 4
1039 ; O32-DAG: sw [[VA3]], 0([[SP]])
1040 ; O32-DAG: lw [[ARG1:\$[0-9]+]], 4([[VA_TMP2]])
3231041 ; O32-DAG: sw [[ARG1]], 12([[GV]])
3241042
3251043 ; N32-DAG: addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(dwords)
3481066 ; Load the second argument from the variable portion and copy it to the global.
3491067 ; O32-DAG: lw [[ARG2:\$[0-9]+]], 0([[VA]])
3501068 ; O32-DAG: sw [[ARG2]], 16([[GV]])
351 ; O32-DAG: lw [[VA:\$[0-9]+]], 0([[SP]])
352 ; O32-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 4
353 ; O32-DAG: sw [[VA2]], 0([[SP]])
354 ; O32-DAG: lw [[ARG2:\$[0-9]+]], 0([[VA]])
1069 ; O32-DAG: addiu [[VA3:\$[0-9]+]], [[VA2]], 4
1070 ; O32-DAG: sw [[VA3]], 0([[SP]])
1071 ; O32-DAG: lw [[ARG2:\$[0-9]+]], 4([[VA_TMP2]])
3551072 ; O32-DAG: sw [[ARG2]], 20([[GV]])
3561073
3571074 ; NEW-DAG: ld [[ARG2:\$[0-9]+]], 0([[VA2]])
3761093 ret void
3771094 }
3781095
379 define void @fn_i32_dotdotdot_i16(i32 %a, ...) {
380 entry:
381 ; ALL-LABEL: fn_i32_dotdotdot_i16:
382
383 ; Set up the stack with an 8-byte local area. N32/N64 must also make room for
384 ; the argument save area (56 bytes).
385 ; O32: addiu [[SP:\$sp]], $sp, -8
386 ; N32: addiu [[SP:\$sp]], $sp, -64
387 ; N64: daddiu [[SP:\$sp]], $sp, -64
388
389 ; Save variable argument portion on the stack
390 ; O32-DAG: sw $7, 20([[SP]])
391 ; O32-DAG: sw $6, 16([[SP]])
392 ; O32-DAG: sw $5, 12([[SP]])
393
394 ; NEW-DAG: sd $11, 56([[SP]])
395 ; NEW-DAG: sd $10, 48([[SP]])
396 ; NEW-DAG: sd $9, 40([[SP]])
397 ; NEW-DAG: sd $8, 32([[SP]])
398 ; NEW-DAG: sd $7, 24([[SP]])
399 ; NEW-DAG: sd $6, 16([[SP]])
400 ; NEW-DAG: sd $5, 8([[SP]])
401
402 ; Initialize variable argument pointer.
403 ; For O32, the offset is 12 due to the 4 bytes used to store local variables,
404 ; 4 bytes padding to maintain stack alignment, and the 4 byte slot for the first
405 ; fixed argument.
406 ; For N32/N64, it is only 8 since the fixed arguments do not reserve stack
407 ; space.
408 ; O32-DAG: addiu [[VA:\$[0-9]+]], [[SP]], 12
409 ; O32-DAG: sw [[VA]], 0([[SP]])
410
411 ; N32-DAG: addiu [[VA:\$[0-9]+]], [[SP]], 8
412 ; N32-DAG: sw [[VA]], 0([[SP]])
413
414 ; N64-DAG: daddiu [[VA:\$[0-9]+]], [[SP]], 8
415 ; N64-DAG: sd [[VA]], 0([[SP]])
416
417 ; Store [[VA]]
418 ; O32-DAG: sw [[VA]], 0([[SP]])
419
420 ; ALL: teqi $zero, 1
421
422 ; Increment [[VA]]
423 ; O32-DAG: lw [[VA:\$[0-9]+]], 0([[SP]])
424 ; O32-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 4
425 ; O32-DAG: sw [[VA2]], 0([[SP]])
426
427 ; N32-DAG: lw [[VA:\$[0-9]+]], 0([[SP]])
428 ; N32-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 8
429 ; N32-DAG: sw [[VA2]], 0([[SP]])
430
431 ; N64-DAG: ld [[VA:\$[0-9]+]], 0([[SP]])
432 ; N64-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 8
433 ; N64-DAG: sd [[VA2]], 0([[SP]])
434
435 ; Load the first argument from the variable portion.
436 ; This has used the stack pointer directly rather than the [[VA]] we just set
437 ; up.
438 ; Big-endian mode for N32/N64 must add an additional 4 to the offset due to byte
439 ; order.
440 ; O32-DAG: lw [[ARG1:\$[0-9]+]], 0([[VA]])
441
442 ; NEW-LE-DAG: lw [[ARG1:\$[0-9]+]], 0([[VA]])
443 ; NEW-BE-DAG: lw [[ARG1:\$[0-9]+]], 4([[VA]])
444
445 ; Copy the arg to the global
446 ; O32-DAG: addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(hwords)
447
448 ; N32-DAG: addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(hwords)
449
450 ; N64-DAG: daddiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(hwords)
451
452 ; ALL-DAG: sh [[ARG1]], 2([[GV]])
453
454 ; ALL: teqi $zero, 2
455
456 ; Increment [[VA]] again.
457 ; O32-DAG: lw [[VA:\$[0-9]+]], 0([[SP]])
458 ; O32-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 4
459 ; O32-DAG: sw [[VA2]], 0([[SP]])
460
461 ; N32-DAG: lw [[VA2:\$[0-9]+]], 0([[SP]])
462 ; N32-DAG: addiu [[VA3:\$[0-9]+]], [[VA2]], 8
463 ; N32-DAG: sw [[VA3]], 0([[SP]])
464
465 ; N64-DAG: ld [[VA2:\$[0-9]+]], 0([[SP]])
466 ; N64-DAG: daddiu [[VA3:\$[0-9]+]], [[VA2]], 8
467 ; N64-DAG: sd [[VA3]], 0([[SP]])
468
469 ; Load the second argument from the variable portion.
470 ; O32-DAG: lw [[ARG2:\$[0-9]+]], 0([[VA]])
471
472 ; NEW-LE-DAG: lw [[ARG2:\$[0-9]+]], 0([[VA2]])
473 ; NEW-BE-DAG: lw [[ARG2:\$[0-9]+]], 4([[VA2]])
474
475 ; Copy the arg to the global
476 ; ALL-DAG: sh [[ARG2]], 4([[GV]])
477
478 %ap = alloca i8*, align 8
479 %ap2 = bitcast i8** %ap to i8*
480 call void @llvm.va_start(i8* %ap2)
481
482 call void asm sideeffect "teqi $$zero, 1", ""()
483 %arg1 = va_arg i8** %ap, i16
484 %e1 = getelementptr [3 x i16], [3 x i16]* @hwords, i32 0, i32 1
485 store volatile i16 %arg1, i16* %e1, align 2
486
487 call void asm sideeffect "teqi $$zero, 2", ""()
488 %arg2 = va_arg i8** %ap, i16
489 %e2 = getelementptr [3 x i16], [3 x i16]* @hwords, i32 0, i32 2
490 store volatile i16 %arg2, i16* %e2, align 2
491
492 call void @llvm.va_end(i8* %ap2)
493
494 ret void
495 }
496
497 define void @fn_i32_dotdotdot_i32(i32 %a, ...) {
498 entry:
499 ; ALL-LABEL: fn_i32_dotdotdot_i32:
500
501 ; Set up the stack with an 8-byte local area. N32/N64 must also make room for
502 ; the argument save area (56 bytes).
503 ; O32: addiu [[SP:\$sp]], $sp, -8
504 ; N32: addiu [[SP:\$sp]], $sp, -64
505 ; N64: daddiu [[SP:\$sp]], $sp, -64
506
507 ; Save variable argument portion on the stack
508 ; O32-DAG: sw $7, 20([[SP]])
509 ; O32-DAG: sw $6, 16([[SP]])
510 ; O32-DAG: sw $5, 12([[SP]])
511
512 ; NEW-DAG: sd $11, 56([[SP]])
513 ; NEW-DAG: sd $10, 48([[SP]])
514 ; NEW-DAG: sd $9, 40([[SP]])
515 ; NEW-DAG: sd $8, 32([[SP]])
516 ; NEW-DAG: sd $7, 24([[SP]])
517 ; NEW-DAG: sd $6, 16([[SP]])
518 ; NEW-DAG: sd $5, 8([[SP]])
519
520 ; Initialize variable argument pointer.
521 ; For O32, the offset is 12 due to the 4 bytes used to store local variables,
522 ; 4 bytes padding to maintain stack alignment, and the 4 byte slot for the first
523 ; fixed argument.
524 ; For N32/N64, it is only 8 since the fixed arguments do not reserve stack
525 ; space.
526 ; O32-DAG: addiu [[VA:\$[0-9]+]], [[SP]], 12
527 ; O32-DAG: sw [[VA]], 0([[SP]])
528
529 ; N32-DAG: addiu [[VA:\$[0-9]+]], [[SP]], 8
530 ; N32-DAG: sw [[VA]], 0([[SP]])
531
532 ; N64-DAG: daddiu [[VA:\$[0-9]+]], [[SP]], 8
533 ; N64-DAG: sd [[VA]], 0([[SP]])
534
535 ; Store [[VA]]
536 ; O32-DAG: sw [[VA]], 0([[SP]])
537
538 ; ALL: teqi $zero, 1
539
540 ; Increment [[VA]]
541 ; O32-DAG: lw [[VA:\$[0-9]+]], 0([[SP]])
542 ; O32-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 4
543 ; O32-DAG: sw [[VA2]], 0([[SP]])
544
545 ; N32-DAG: lw [[VA:\$[0-9]+]], 0([[SP]])
546 ; N32-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 8
547 ; N32-DAG: sw [[VA2]], 0([[SP]])
548
549 ; N64-DAG: ld [[VA:\$[0-9]+]], 0([[SP]])
550 ; N64-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 8
551 ; N64-DAG: sd [[VA2]], 0([[SP]])
552
553 ; Load the first argument from the variable portion.
554 ; This has used the stack pointer directly rather than the [[VA]] we just set
555 ; up.
556 ; Big-endian mode for N32/N64 must add an additional 4 to the offset due to byte
557 ; order.
558 ; O32-DAG: lw [[ARG1:\$[0-9]+]], 0([[VA]])
559
560 ; NEW-LE-DAG: lw [[ARG1:\$[0-9]+]], 0([[VA]])
561 ; NEW-BE-DAG: lw [[ARG1:\$[0-9]+]], 4([[VA]])
562
563 ; Copy the arg to the global
564 ; O32-DAG: addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(words)
565
566 ; N32-DAG: addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(words)
567
568 ; N64-DAG: daddiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(words)
569
570 ; ALL-DAG: sw [[ARG1]], 4([[GV]])
571
572 ; ALL: teqi $zero, 2
573
574 ; Increment [[VA]] again.
575 ; O32-DAG: lw [[VA:\$[0-9]+]], 0([[SP]])
576 ; O32-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 4
577 ; O32-DAG: sw [[VA2]], 0([[SP]])
578
579 ; N32-DAG: lw [[VA2:\$[0-9]+]], 0([[SP]])
580 ; N32-DAG: addiu [[VA3:\$[0-9]+]], [[VA2]], 8
581 ; N32-DAG: sw [[VA3]], 0([[SP]])
582
583 ; N64-DAG: ld [[VA2:\$[0-9]+]], 0([[SP]])
584 ; N64-DAG: daddiu [[VA3:\$[0-9]+]], [[VA2]], 8
585 ; N64-DAG: sd [[VA3]], 0([[SP]])
586
587 ; Load the second argument from the variable portion.
588 ; O32-DAG: lw [[ARG2:\$[0-9]+]], 0([[VA]])
589
590 ; NEW-LE-DAG: lw [[ARG2:\$[0-9]+]], 0([[VA2]])
591 ; NEW-BE-DAG: lw [[ARG2:\$[0-9]+]], 4([[VA2]])
592
593 ; Copy the arg to the global
594 ; ALL-DAG: sw [[ARG2]], 8([[GV]])
595
596 %ap = alloca i8*, align 8
597 %ap2 = bitcast i8** %ap to i8*
598 call void @llvm.va_start(i8* %ap2)
599
600 call void asm sideeffect "teqi $$zero, 1", ""()
601 %arg1 = va_arg i8** %ap, i32
602 %e1 = getelementptr [3 x i32], [3 x i32]* @words, i32 0, i32 1
603 store volatile i32 %arg1, i32* %e1, align 4
604
605 call void asm sideeffect "teqi $$zero, 2", ""()
606 %arg2 = va_arg i8** %ap, i32
607 %e2 = getelementptr [3 x i32], [3 x i32]* @words, i32 0, i32 2
608 store volatile i32 %arg2, i32* %e2, align 4
609
610 call void @llvm.va_end(i8* %ap2)
611
612 ret void
613 }
614
615 define void @fn_i32_dotdotdot_i64(i32 %a, ...) {
616 entry:
617 ; ALL-LABEL: fn_i32_dotdotdot_i64:
618
619 ; Set up the stack with an 8-byte local area. N32/N64 must also make room for
620 ; the argument save area (56 bytes).
621 ; O32: addiu [[SP:\$sp]], $sp, -8
622 ; N32: addiu [[SP:\$sp]], $sp, -64
623 ; N64: daddiu [[SP:\$sp]], $sp, -64
624
625 ; Save variable argument portion on the stack
626 ; O32-DAG: sw $7, 20([[SP]])
627 ; O32-DAG: sw $6, 16([[SP]])
628 ; O32-DAG: sw $5, 12([[SP]])
629
630 ; NEW-DAG: sd $11, 56([[SP]])
631 ; NEW-DAG: sd $10, 48([[SP]])
632 ; NEW-DAG: sd $9, 40([[SP]])
633 ; NEW-DAG: sd $8, 32([[SP]])
634 ; NEW-DAG: sd $7, 24([[SP]])
635 ; NEW-DAG: sd $6, 16([[SP]])
636 ; NEW-DAG: sd $5, 8([[SP]])
637
638 ; Initialize variable argument pointer.
639 ; For O32, the offset is 12 due to the 4 bytes used to store local variables,
640 ; 4 bytes padding to maintain stack alignment, and the 4 byte slot for the first
641 ; fixed argument.
642 ; For N32/N64, it is only 8 since the fixed arguments do not reserve stack
643 ; space.
644 ; O32-DAG: addiu [[VA:\$[0-9]+]], [[SP]], 12
645 ; O32-DAG: sw [[VA]], 0([[SP]])
646
647 ; N32-DAG: addiu [[VA:\$[0-9]+]], [[SP]], 8
648 ; N32-DAG: sw [[VA]], 0([[SP]])
649
650 ; N64-DAG: daddiu [[VA:\$[0-9]+]], [[SP]], 8
651 ; N64-DAG: sd [[VA]], 0([[SP]])
652
653 ; Store [[VA]]
654 ; O32-DAG: sw [[VA]], 0([[SP]])
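The corresponding O32 frame behind the offset-12 computation, again inferred from the checks (offsets relative to the adjusted $sp):

    sp+0  : va_list pointer %ap
    sp+4  : padding to keep the stack 8-byte aligned
    sp+8  : home slot for the fixed i32 %a (passed in $4, never spilled here)
    sp+12 : $5, first variadic word and the initial [[VA]]
    sp+16 : $6
    sp+20 : $7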
655
656 ; ALL: teqi $zero, 1
657
658 ; Increment [[VA]] (and realign pointer for O32)
659 ; O32: lw [[VA:\$[0-9]+]], 0([[SP]])
660 ; O32-DAG: addiu [[VA_TMP0:\$[0-9]+]], [[VA]], 7
661 ; O32-DAG: addiu [[VA_TMP1:\$[0-9]+]], $zero, -8
662 ; O32-DAG: and [[VA_TMP2:\$[0-9]+]], [[VA_TMP0]], [[VA_TMP1]]
663 ; O32-DAG: ori [[VA2:\$[0-9]+]], [[VA_TMP2]], 4
664 ; O32-DAG: sw [[VA2]], 0([[SP]])
665
666 ; N32-DAG: lw [[VA:\$[0-9]+]], 0([[SP]])
667 ; N32-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 8
668 ; N32-DAG: sw [[VA2]], 0([[SP]])
669
670 ; N64-DAG: ld [[VA:\$[0-9]+]], 0([[SP]])
671 ; N64-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 8
672 ; N64-DAG: sd [[VA2]], 0([[SP]])
673
674 ; Load the first argument from the variable portion and copy it to the global.
675 ; This uses the stack pointer directly rather than the [[VA]] pointer we
676 ; just set up.
677 ; Big-endian mode for N32/N64 must add an additional 4 to the offset due to byte
678 ; order.
679 ; O32-DAG: addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(dwords)
680 ; O32-DAG: lw [[ARG1:\$[0-9]+]], 0([[VA]])
681 ; O32-DAG: sw [[ARG1]], 8([[GV]])
682 ; O32-DAG: lw [[VA:\$[0-9]+]], 0([[SP]])
683 ; O32-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 4
684 ; O32-DAG: sw [[VA2]], 0([[SP]])
685 ; O32-DAG: lw [[ARG1:\$[0-9]+]], 0([[VA]])
686 ; O32-DAG: sw [[ARG1]], 12([[GV]])
687
688 ; N32-DAG: addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(dwords)
689 ; N64-DAG: daddiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(dwords)
690 ; NEW-DAG: ld [[ARG1:\$[0-9]+]], 0([[VA]])
691 ; NEW-DAG: sd [[ARG1]], 8([[GV]])
692
693 ; ALL: teqi $zero, 2
694
695 ; Increment [[VA]] again.
696 ; FIXME: We're still aligned from the last one but CodeGen doesn't spot that.
697 ; O32: lw [[VA:\$[0-9]+]], 0([[SP]])
698 ; O32-DAG: addiu [[VA_TMP0:\$[0-9]+]], [[VA]], 7
699 ; O32-DAG: and [[VA_TMP2:\$[0-9]+]], [[VA_TMP0]], [[VA_TMP1]]
700 ; O32-DAG: ori [[VA2:\$[0-9]+]], [[VA_TMP2]], 4
701 ; O32-DAG: sw [[VA2]], 0([[SP]])
702
703 ; N32-DAG: lw [[VA2:\$[0-9]+]], 0([[SP]])
704 ; N32-DAG: addiu [[VA3:\$[0-9]+]], [[VA2]], 8
705 ; N32-DAG: sw [[VA3]], 0([[SP]])
706
707 ; N64-DAG: ld [[VA2:\$[0-9]+]], 0([[SP]])
708 ; N64-DAG: daddiu [[VA3:\$[0-9]+]], [[VA2]], 8
709 ; N64-DAG: sd [[VA3]], 0([[SP]])
710
711 ; Load the second argument from the variable portion and copy it to the global.
712 ; O32-DAG: lw [[ARG2:\$[0-9]+]], 0([[VA]])
713 ; O32-DAG: sw [[ARG2]], 16([[GV]])
714 ; O32-DAG: lw [[VA:\$[0-9]+]], 0([[SP]])
715 ; O32-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 4
716 ; O32-DAG: sw [[VA2]], 0([[SP]])
717 ; O32-DAG: lw [[ARG2:\$[0-9]+]], 0([[VA]])
718 ; O32-DAG: sw [[ARG2]], 20([[GV]])
719
720 ; NEW-DAG: ld [[ARG2:\$[0-9]+]], 0([[VA2]])
721 ; NEW-DAG: sd [[ARG2]], 16([[GV]])
722
723 %ap = alloca i8*, align 8
724 %ap2 = bitcast i8** %ap to i8*
725 call void @llvm.va_start(i8* %ap2)
726
727 call void asm sideeffect "teqi $$zero, 1", ""()
728 %arg1 = va_arg i8** %ap, i64
729 %e1 = getelementptr [3 x i64], [3 x i64]* @dwords, i32 0, i32 1
730 store volatile i64 %arg1, i64* %e1, align 8
731
732 call void asm sideeffect "teqi $$zero, 2", ""()
733 %arg2 = va_arg i8** %ap, i64
734 %e2 = getelementptr [3 x i64], [3 x i64]* @dwords, i32 0, i32 2
735 store volatile i64 %arg2, i64* %e2, align 8
736
737 call void @llvm.va_end(i8* %ap2)
738
739 ret void
740 }
741
742 define void @fn_i64_dotdotdot_i16(i64 %a, ...) {
743 entry:
744 ; ALL-LABEL: fn_i64_dotdotdot_i16:
745
746 ; Set up the stack with an 8-byte local area. N32/N64 must also make room for
747 ; the argument save area (56 bytes).
748 ; O32: addiu [[SP:\$sp]], $sp, -8
749 ; N32: addiu [[SP:\$sp]], $sp, -64
750 ; N64: daddiu [[SP:\$sp]], $sp, -64
751
752 ; Save variable argument portion on the stack
753 ; O32-DAG: sw $7, 20([[SP]])
754 ; O32-DAG: sw $6, 16([[SP]])
755
756 ; NEW-DAG: sd $11, 56([[SP]])
757 ; NEW-DAG: sd $10, 48([[SP]])
758 ; NEW-DAG: sd $9, 40([[SP]])
759 ; NEW-DAG: sd $8, 32([[SP]])
760 ; NEW-DAG: sd $7, 24([[SP]])
761 ; NEW-DAG: sd $6, 16([[SP]])
762 ; NEW-DAG: sd $5, 8([[SP]])
763
764 ; Initialize variable argument pointer.
765 ; For O32, the offset is 16 due to the 4 bytes used to store local variables,
766 ; 4 bytes padding to maintain stack alignment, and the two 4 byte slots for the
767 ; first fixed argument.
768 ; For N32/N64, it is only 8 since the fixed arguments do not reserve stack
769 ; space.
770 ; O32-DAG: addiu [[VA:\$[0-9]+]], [[SP]], 16
771 ; O32-DAG: sw [[VA]], 0([[SP]])
772
773 ; N32-DAG: addiu [[VA:\$[0-9]+]], [[SP]], 8
774 ; N32-DAG: sw [[VA]], 0([[SP]])
775
776 ; N64-DAG: daddiu [[VA:\$[0-9]+]], [[SP]], 8
777 ; N64-DAG: sd [[VA]], 0([[SP]])
778
779 ; Store [[VA]]
780 ; O32-DAG: sw [[VA]], 0([[SP]])
781
782 ; ALL: teqi $zero, 1
783
784 ; Increment [[VA]]
785 ; O32-DAG: lw [[VA:\$[0-9]+]], 0([[SP]])
786 ; O32-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 4
787 ; O32-DAG: sw [[VA2]], 0([[SP]])
788
789 ; N32-DAG: lw [[VA:\$[0-9]+]], 0([[SP]])
790 ; N32-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 8
791 ; N32-DAG: sw [[VA2]], 0([[SP]])
792
793 ; N64-DAG: ld [[VA:\$[0-9]+]], 0([[SP]])
794 ; N64-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 8
795 ; N64-DAG: sd [[VA2]], 0([[SP]])
796
797 ; Load the first argument from the variable portion.
798 ; This uses the stack pointer directly rather than the [[VA]] pointer we
799 ; just set up.
800 ; Big-endian N32/N64 must add 4 to the offset because the 32-bit value sits
801 ; in the high-address half of its 8-byte slot.
802 ; O32-DAG: lw [[ARG1:\$[0-9]+]], 0([[VA]])
803
804 ; NEW-LE-DAG: lw [[ARG1:\$[0-9]+]], 0([[VA]])
805 ; NEW-BE-DAG: lw [[ARG1:\$[0-9]+]], 4([[VA]])
806
807 ; Copy the arg to the global
808 ; O32-DAG: addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(hwords)
809
810 ; N32-DAG: addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(hwords)
811
812 ; N64-DAG: daddiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(hwords)
813
814 ; ALL-DAG: sh [[ARG1]], 2([[GV]])
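The lw/sh pairing mirrors C's default argument promotions: a vararg narrower than int travels as a full int slot and is truncated at the store. A minimal sketch (hypothetical helper):

    #include <cstdarg>

    void storeHalfwordArg(short *dst, va_list ap) {
      *dst = (short)va_arg(ap, int);  // read the promoted int, truncate to 16 bits
    }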
815
816 ; ALL: teqi $zero, 2
817
818 ; Increment [[VA]] again.
819 ; O32-DAG: lw [[VA:\$[0-9]+]], 0([[SP]])
820 ; O32-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 4
821 ; O32-DAG: sw [[VA2]], 0([[SP]])
822
823 ; N32-DAG: lw [[VA2:\$[0-9]+]], 0([[SP]])
824 ; N32-DAG: addiu [[VA3:\$[0-9]+]], [[VA2]], 8
825 ; N32-DAG: sw [[VA3]], 0([[SP]])
826
827 ; N64-DAG: ld [[VA2:\$[0-9]+]], 0([[SP]])
828 ; N64-DAG: daddiu [[VA3:\$[0-9]+]], [[VA2]], 8
829 ; N64-DAG: sd [[VA3]], 0([[SP]])
830
831 ; Load the second argument from the variable portion.
832 ; O32-DAG: lw [[ARG2:\$[0-9]+]], 0([[VA]])
833
834 ; NEW-LE-DAG: lw [[ARG2:\$[0-9]+]], 0([[VA2]])
835 ; NEW-BE-DAG: lw [[ARG2:\$[0-9]+]], 4([[VA2]])
836
837 ; Copy the arg to the global
838 ; ALL-DAG: sh [[ARG2]], 4([[GV]])
839
840 %ap = alloca i8*, align 8
841 %ap2 = bitcast i8** %ap to i8*
842 call void @llvm.va_start(i8* %ap2)
843
844 call void asm sideeffect "teqi $$zero, 1", ""()
845 %arg1 = va_arg i8** %ap, i16
846 %e1 = getelementptr [3 x i16], [3 x i16]* @hwords, i32 0, i32 1
847 store volatile i16 %arg1, i16* %e1, align 2
848
849 call void asm sideeffect "teqi $$zero, 2", ""()
850 %arg2 = va_arg i8** %ap, i16
851 %e2 = getelementptr [3 x i16], [3 x i16]* @hwords, i32 0, i32 2
852 store volatile i16 %arg2, i16* %e2, align 2
853
854 call void @llvm.va_end(i8* %ap2)
855
856 ret void
857 }
858
859 define void @fn_i64_dotdotdot_i32(i64 %a, ...) {
860 entry:
861 ; ALL-LABEL: fn_i64_dotdotdot_i32:
862
863 ; Set up the stack with an 8-byte local area. N32/N64 must also make room for
864 ; the argument save area (56 bytes).
865 ; O32: addiu [[SP:\$sp]], $sp, -8
866 ; N32: addiu [[SP:\$sp]], $sp, -64
867 ; N64: daddiu [[SP:\$sp]], $sp, -64
868
869 ; Save variable argument portion on the stack
870 ; O32-DAG: sw $7, 20([[SP]])
871 ; O32-DAG: sw $6, 16([[SP]])
872
873 ; NEW-DAG: sd $11, 56([[SP]])
874 ; NEW-DAG: sd $10, 48([[SP]])
875 ; NEW-DAG: sd $9, 40([[SP]])
876 ; NEW-DAG: sd $8, 32([[SP]])
877 ; NEW-DAG: sd $7, 24([[SP]])
878 ; NEW-DAG: sd $6, 16([[SP]])
879 ; NEW-DAG: sd $5, 8([[SP]])
880
881 ; Initialize variable argument pointer.
882 ; For O32, the offset is 16 due to the 4 bytes used to store local variables,
883 ; 4 bytes padding to maintain stack alignment, and the two 4 byte slots for the
884 ; first fixed argument.
885 ; For N32/N64, it is only 8 since the fixed arguments do not reserve stack
886 ; space.
887 ; O32-DAG: addiu [[VA:\$[0-9]+]], [[SP]], 16
888 ; O32-DAG: sw [[VA]], 0([[SP]])
889
890 ; N32-DAG: addiu [[VA:\$[0-9]+]], [[SP]], 8
891 ; N32-DAG: sw [[VA]], 0([[SP]])
892
893 ; N64-DAG: daddiu [[VA:\$[0-9]+]], [[SP]], 8
894 ; N64-DAG: sd [[VA]], 0([[SP]])
895
896 ; Store [[VA]]
897 ; O32-DAG: sw [[VA]], 0([[SP]])
898
899 ; ALL: teqi $zero, 1
900
901 ; Increment [[VA]]
902 ; O32-DAG: lw [[VA:\$[0-9]+]], 0([[SP]])
903 ; O32-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 4
904 ; O32-DAG: sw [[VA2]], 0([[SP]])
905
906 ; N32-DAG: lw [[VA:\$[0-9]+]], 0([[SP]])
907 ; N32-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 8
908 ; N32-DAG: sw [[VA2]], 0([[SP]])
909
910 ; N64-DAG: ld [[VA:\$[0-9]+]], 0([[SP]])
911 ; N64-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 8
912 ; N64-DAG: sd [[VA2]], 0([[SP]])
913
914 ; Load the first argument from the variable portion.
915 ; This uses the stack pointer directly rather than the [[VA]] pointer we
916 ; just set up.
917 ; Big-endian N32/N64 must add 4 to the offset because the 32-bit value sits
918 ; in the high-address half of its 8-byte slot.
919 ; O32-DAG: lw [[ARG1:\$[0-9]+]], 0([[VA]])
920
921 ; NEW-LE-DAG: lw [[ARG1:\$[0-9]+]], 0([[VA]])
922 ; NEW-BE-DAG: lw [[ARG1:\$[0-9]+]], 4([[VA]])
923
924 ; Copy the arg to the global
925 ; O32-DAG: addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(words)
926
927 ; N32-DAG: addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(words)
928
929 ; N64-DAG: daddiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(words)
930
931 ; ALL-DAG: sw [[ARG1]], 4([[GV]])
932
933 ; ALL: teqi $zero, 2
934
935 ; Increment [[VA]] again.
936 ; O32-DAG: lw [[VA:\$[0-9]+]], 0([[SP]])
937 ; O32-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 4
938 ; O32-DAG: sw [[VA2]], 0([[SP]])
939
940 ; N32-DAG: lw [[VA2:\$[0-9]+]], 0([[SP]])
941 ; N32-DAG: addiu [[VA3:\$[0-9]+]], [[VA2]], 8
942 ; N32-DAG: sw [[VA3]], 0([[SP]])
943
944 ; N64-DAG: ld [[VA2:\$[0-9]+]], 0([[SP]])
945 ; N64-DAG: daddiu [[VA3:\$[0-9]+]], [[VA2]], 8
946 ; N64-DAG: sd [[VA3]], 0([[SP]])
947
948 ; Load the second argument from the variable portion.
949 ; O32-DAG: lw [[ARG2:\$[0-9]+]], 0([[VA]])
950
951 ; NEW-LE-DAG: lw [[ARG2:\$[0-9]+]], 0([[VA2]])
952 ; NEW-BE-DAG: lw [[ARG2:\$[0-9]+]], 4([[VA2]])
953
954 ; Copy the arg to the global
955 ; ALL-DAG: sw [[ARG2]], 8([[GV]])
956
957 %ap = alloca i8*, align 8
958 %ap2 = bitcast i8** %ap to i8*
959 call void @llvm.va_start(i8* %ap2)
960
961 call void asm sideeffect "teqi $$zero, 1", ""()
962 %arg1 = va_arg i8** %ap, i32
963 %e1 = getelementptr [3 x i32], [3 x i32]* @words, i32 0, i32 1
964 store volatile i32 %arg1, i32* %e1, align 4
965
966 call void asm sideeffect "teqi $$zero, 2", ""()
967 %arg2 = va_arg i8** %ap, i32
968 %e2 = getelementptr [3 x i32], [3 x i32]* @words, i32 0, i32 2
969 store volatile i32 %arg2, i32* %e2, align 4
970
971 call void @llvm.va_end(i8* %ap2)
972
973 ret void
974 }
975
976 define void @fn_i64_dotdotdot_i64(i64 %a, ...) {
977 entry:
978 ; ALL-LABEL: fn_i64_dotdotdot_i64:
979
980 ; Set up the stack with an 8-byte local area. N32/N64 must also make room for
981 ; the argument save area (56 bytes).
982 ; O32: addiu [[SP:\$sp]], $sp, -8
983 ; N32: addiu [[SP:\$sp]], $sp, -64
984 ; N64: daddiu [[SP:\$sp]], $sp, -64
985
986 ; Save variable argument portion on the stack
987 ; O32-DAG: sw $7, 20([[SP]])
988 ; O32-DAG: sw $6, 16([[SP]])
989
990 ; NEW-DAG: sd $11, 56([[SP]])
991 ; NEW-DAG: sd $10, 48([[SP]])
992 ; NEW-DAG: sd $9, 40([[SP]])
993 ; NEW-DAG: sd $8, 32([[SP]])
994 ; NEW-DAG: sd $7, 24([[SP]])
995 ; NEW-DAG: sd $6, 16([[SP]])
996 ; NEW-DAG: sd $5, 8([[SP]])
997
998 ; Initialize variable argument pointer.
999 ; For O32, the offset is 16 due to the 4 bytes used to store local variables,
1000 ; 4 bytes padding to maintain stack alignment, and the two 4 byte slots for the
1001 ; first fixed argument.
1002 ; For N32/N64, it is only 8 since the fixed arguments do not reserve stack
1003 ; space.
1004 ; O32-DAG: addiu [[VA:\$[0-9]+]], [[SP]], 16
1005 ; O32-DAG: sw [[VA]], 0([[SP]])
1006
1007 ; N32-DAG: addiu [[VA:\$[0-9]+]], [[SP]], 8
1008 ; N32-DAG: sw [[VA]], 0([[SP]])
1009
1010 ; N64-DAG: daddiu [[VA:\$[0-9]+]], [[SP]], 8
1011 ; N64-DAG: sd [[VA]], 0([[SP]])
1012
1013 ; Store [[VA]]
1014 ; O32-DAG: sw [[VA]], 0([[SP]])
1015
1016 ; ALL: teqi $zero, 1
1017
1018 ; Increment [[VA]] (and realign pointer for O32)
1019 ; O32: lw [[VA:\$[0-9]+]], 0([[SP]])
1020 ; O32-DAG: addiu [[VA_TMP0:\$[0-9]+]], [[VA]], 7
1021 ; O32-DAG: addiu [[VA_TMP1:\$[0-9]+]], $zero, -8
1022 ; O32-DAG: and [[VA_TMP2:\$[0-9]+]], [[VA_TMP0]], [[VA_TMP1]]
1023 ; O32-DAG: ori [[VA2:\$[0-9]+]], [[VA_TMP2]], 4
1024 ; O32-DAG: sw [[VA2]], 0([[SP]])
1025
1026 ; N32-DAG: lw [[VA:\$[0-9]+]], 0([[SP]])
1027 ; N32-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 8
1028 ; N32-DAG: sw [[VA2]], 0([[SP]])
1029
1030 ; N64-DAG: ld [[VA:\$[0-9]+]], 0([[SP]])
1031 ; N64-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 8
1032 ; N64-DAG: sd [[VA2]], 0([[SP]])
1033
1034 ; Load the first argument from the variable portion and copy it to the global.
1035 ; This uses the stack pointer directly rather than the [[VA]] pointer we
1036 ; just set up.
1037 ; Big-endian mode for N32/N64 must add an additional 4 to the offset due to byte
1038 ; order.
1039 ; O32-DAG: addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(dwords)
1040 ; O32-DAG: lw [[ARG1:\$[0-9]+]], 0([[VA]])
1041 ; O32-DAG: sw [[ARG1]], 8([[GV]])
1042 ; O32-DAG: lw [[VA:\$[0-9]+]], 0([[SP]])
1043 ; O32-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 4
1044 ; O32-DAG: sw [[VA2]], 0([[SP]])
1045 ; O32-DAG: lw [[ARG1:\$[0-9]+]], 0([[VA]])
1046 ; O32-DAG: sw [[ARG1]], 12([[GV]])
1047
1048 ; N32-DAG: addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(dwords)
1049 ; N64-DAG: daddiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(dwords)
1050 ; NEW-DAG: ld [[ARG1:\$[0-9]+]], 0([[VA]])
1051 ; NEW-DAG: sd [[ARG1]], 8([[GV]])
1052
1053 ; ALL: teqi $zero, 2
1054
1055 ; Increment [[VA]] again.
1056 ; FIXME: We're still aligned from the last one but CodeGen doesn't spot that.
1057 ; O32: lw [[VA:\$[0-9]+]], 0([[SP]])
1058 ; O32-DAG: addiu [[VA_TMP0:\$[0-9]+]], [[VA]], 7
1059 ; O32-DAG: and [[VA_TMP2:\$[0-9]+]], [[VA_TMP0]], [[VA_TMP1]]
1060 ; O32-DAG: ori [[VA2:\$[0-9]+]], [[VA_TMP2]], 4
1061 ; O32-DAG: sw [[VA2]], 0([[SP]])
1062
1063 ; N32-DAG: lw [[VA2:\$[0-9]+]], 0([[SP]])
1064 ; N32-DAG: addiu [[VA3:\$[0-9]+]], [[VA2]], 8
1065 ; N32-DAG: sw [[VA3]], 0([[SP]])
1066
1067 ; N64-DAG: ld [[VA2:\$[0-9]+]], 0([[SP]])
1068 ; N64-DAG: daddiu [[VA3:\$[0-9]+]], [[VA2]], 8
1069 ; N64-DAG: sd [[VA3]], 0([[SP]])
1070
1071 ; Load the second argument from the variable portion and copy it to the global.
1072 ; O32-DAG: lw [[ARG2:\$[0-9]+]], 0([[VA]])
1073 ; O32-DAG: sw [[ARG2]], 16([[GV]])
1074 ; O32-DAG: lw [[VA:\$[0-9]+]], 0([[SP]])
1075 ; O32-DAG: addiu [[VA2:\$[0-9]+]], [[VA]], 4
1076 ; O32-DAG: sw [[VA2]], 0([[SP]])
1077 ; O32-DAG: lw [[ARG2:\$[0-9]+]], 0([[VA]])
1078 ; O32-DAG: sw [[ARG2]], 20([[GV]])
1079
1080 ; NEW-DAG: ld [[ARG2:\$[0-9]+]], 0([[VA2]])
1081 ; NEW-DAG: sd [[ARG2]], 16([[GV]])
1082
1083 %ap = alloca i8*, align 8
1084 %ap2 = bitcast i8** %ap to i8*
1085 call void @llvm.va_start(i8* %ap2)
1086
1087 call void asm sideeffect "teqi $$zero, 1", ""()
1088 %arg1 = va_arg i8** %ap, i64
1089 %e1 = getelementptr [3 x i64], [3 x i64]* @dwords, i32 0, i32 1
1090 store volatile i64 %arg1, i64* %e1, align 8
1091
1092 call void asm sideeffect "teqi $$zero, 2", ""()
1093 %arg2 = va_arg i8** %ap, i64
1094 %e2 = getelementptr [3 x i64], [3 x i64]* @dwords, i32 0, i32 2
1095 store volatile i64 %arg2, i64* %e2, align 8
1096
1097 call void @llvm.va_end(i8* %ap2)
1098
1099 ret void
1100 }
1101
11021096 declare void @llvm.va_start(i8*)
11031097 declare void @llvm.va_end(i8*)
131131 define internal fastcc void @callee0(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, i32 %a9, i32 %a10, i32 %a11, i32 %a12, i32 %a13, i32 %a14, i32 %a15, i32 %a16) nounwind noinline {
132132 entry:
133133 ; CHECK: callee0
134 ; CHECK: sw $4
135 ; CHECK: sw $5
136 ; CHECK: sw $6
137 ; CHECK: sw $7
138 ; CHECK: sw $8
139 ; CHECK: sw $9
140 ; CHECK: sw $10
141 ; CHECK: sw $11
142 ; CHECK: sw $12
143 ; CHECK: sw $13
144 ; CHECK: sw $14
145 ; CHECK: sw $15
146 ; CHECK: sw $24
147 ; CHECK: sw $3
134 ; CHECK-DAG: sw $4
135 ; CHECK-DAG: sw $5
136 ; CHECK-DAG: sw $7
137 ; CHECK-DAG: sw $8
138 ; CHECK-DAG: sw $9
139 ; CHECK-DAG: sw $10
140 ; CHECK-DAG: sw $11
141 ; CHECK-DAG: sw $12
142 ; CHECK-DAG: sw $13
143 ; CHECK-DAG: sw $14
144 ; CHECK-DAG: sw $15
145 ; CHECK-DAG: sw $24
146 ; CHECK-DAG: sw $3
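The switch from plain CHECK to CHECK-DAG is the point of this hunk: with memory operations now reorderable, the argument spills may be emitted in any order. CHECK-DAG is standard FileCheck behavior that lets a run of directives match in any order between the surrounding ordered checks, so the test keeps verifying that every spill happens without pinning down a schedule.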
148147
149148 ; t6, t7 and t8 are reserved in NaCl and cannot be used for fastcc.
150149 ; CHECK-NACL-NOT: sw $14