llvm.org GIT mirror llvm / 05d5e21
merge consecutive stores of extracted vector elements (PR21711) This is a 2nd try at the same optimization as http://reviews.llvm.org/D6698. That patch was checked in at r224611, but reverted at r225031 because it caused a failure outside of the regression tests. The cause of the crash was not recognizing consecutive stores that have mixed source values (loads and vector element extracts), so this patch adds a check to bail out if any store value is not coming from a vector element extract. This patch also refactors the shared logic of the constant source and vector extracted elements source cases into a helper function. Differential Revision: http://reviews.llvm.org/D6850 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@226845 91177308-0d34-0410-b5e6-96231b3b80d8 Sanjay Patel 5 years ago
2 changed file(s) with 224 addition(s) and 95 deletion(s). Raw diff Collapse all Expand all
362362 /// chain (aliasing node.)
363363 SDValue FindBetterChain(SDNode *N, SDValue Chain);
364364
365 /// Holds a pointer to an LSBaseSDNode as well as information on where it
366 /// is located in a sequence of memory operations connected by a chain.
367 struct MemOpLink {
368 MemOpLink (LSBaseSDNode *N, int64_t Offset, unsigned Seq):
369 MemNode(N), OffsetFromBase(Offset), SequenceNum(Seq) { }
370 // Ptr to the mem node.
371 LSBaseSDNode *MemNode;
372 // Offset from the base ptr.
373 int64_t OffsetFromBase;
374 // What is the sequence number of this mem node.
375 // Lowest mem operand in the DAG starts at zero.
376 unsigned SequenceNum;
377 };
378
379 /// This is a helper function for MergeConsecutiveStores. When the source
380 /// elements of the consecutive stores are all constants or all extracted
381 /// vector elements, try to merge them into one larger store.
382 /// \return True if a merged store was created.
383 bool MergeStoresOfConstantsOrVecElts(SmallVectorImpl<MemOpLink> &StoreNodes,
384 EVT MemVT, unsigned NumElem,
385 bool IsConstantSrc, bool UseVector);
386
365387 /// Merge consecutive store operations into a wide store.
366388 /// This optimization uses wide integers or vectors when possible.
367389 /// \return True if some memory operations were changed.
97059727 }
97069728 };
97079729
9708 /// Holds a pointer to an LSBaseSDNode as well as information on where it
9709 /// is located in a sequence of memory operations connected by a chain.
9710 struct MemOpLink {
9711 MemOpLink (LSBaseSDNode *N, int64_t Offset, unsigned Seq):
9712 MemNode(N), OffsetFromBase(Offset), SequenceNum(Seq) { }
9713 // Ptr to the mem node.
9714 LSBaseSDNode *MemNode;
9715 // Offset from the base ptr.
9716 int64_t OffsetFromBase;
9717 // What is the sequence number of this mem node.
9718 // Lowest mem operand in the DAG starts at zero.
9719 unsigned SequenceNum;
9720 };
9730 bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
9731 SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT,
9732 unsigned NumElem, bool IsConstantSrc, bool UseVector) {
9733 // Make sure we have something to merge.
9734 if (NumElem < 2)
9735 return false;
9736
9737 int64_t ElementSizeBytes = MemVT.getSizeInBits() / 8;
9738 LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
9739 unsigned EarliestNodeUsed = 0;
9740
9741 for (unsigned i=0; i < NumElem; ++i) {
9742 // Find a chain for the new wide-store operand. Notice that some
9743 // of the store nodes that we found may not be selected for inclusion
9744 // in the wide store. The chain we use needs to be the chain of the
9745 // earliest store node which is *used* and replaced by the wide store.
9746 if (StoreNodes[i].SequenceNum > StoreNodes[EarliestNodeUsed].SequenceNum)
9747 EarliestNodeUsed = i;
9748 }
9749
9750 // The earliest Node in the DAG.
9751 LSBaseSDNode *EarliestOp = StoreNodes[EarliestNodeUsed].MemNode;
9752 SDLoc DL(StoreNodes[0].MemNode);
9753
9754 SDValue StoredVal;
9755 if (UseVector) {
9756 // Find a legal type for the vector store.
9757 EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, NumElem);
9758 assert(TLI.isTypeLegal(Ty) && "Illegal vector store");
9759 if (IsConstantSrc) {
9760 // A vector store with a constant source implies that the constant is
9761 // zero; we only handle merging stores of constant zeros because the zero
9762 // can be materialized without a load.
9763 // It may be beneficial to loosen this restriction to allow non-zero
9764 // store merging.
9765 StoredVal = DAG.getConstant(0, Ty);
9766 } else {
9767 SmallVector<SDValue, 8> Ops;
9768 for (unsigned i = 0; i < NumElem ; ++i) {
9769 StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
9770 SDValue Val = St->getValue();
9771 // All of the operands of a BUILD_VECTOR must have the same type.
9772 if (Val.getValueType() != MemVT)
9773 return false;
9774 Ops.push_back(Val);
9775 }
9776
9777 // Build the extracted vector elements back into a vector.
9778 StoredVal = DAG.getNode(ISD::BUILD_VECTOR, DL, Ty, Ops);
9779 }
9780 } else {
9781 // We should always use a vector store when merging extracted vector
9782 // elements, so this path implies a store of constants.
9783 assert(IsConstantSrc && "Merged vector elements should use vector store");
9784
9785 unsigned StoreBW = NumElem * ElementSizeBytes * 8;
9786 APInt StoreInt(StoreBW, 0);
9787
9788 // Construct a single integer constant which is made of the smaller
9789 // constant inputs.
9790 bool IsLE = TLI.isLittleEndian();
9791 for (unsigned i = 0; i < NumElem ; ++i) {
9792 unsigned Idx = IsLE ? (NumElem - 1 - i) : i;
9793 StoreSDNode *St = cast<StoreSDNode>(StoreNodes[Idx].MemNode);
9794 SDValue Val = St->getValue();
9795 StoreInt <<= ElementSizeBytes*8;
9796 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) {
9797 StoreInt |= C->getAPIntValue().zext(StoreBW);
9798 } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val)) {
9799 StoreInt |= C->getValueAPF().bitcastToAPInt().zext(StoreBW);
9800 } else {
9801 llvm_unreachable("Invalid constant element type");
9802 }
9803 }
9804
9805 // Create the new Load and Store operations.
9806 EVT StoreTy = EVT::getIntegerVT(*DAG.getContext(), StoreBW);
9807 StoredVal = DAG.getConstant(StoreInt, StoreTy);
9808 }
9809
9810 SDValue NewStore = DAG.getStore(EarliestOp->getChain(), DL, StoredVal,
9811 FirstInChain->getBasePtr(),
9812 FirstInChain->getPointerInfo(),
9813 false, false,
9814 FirstInChain->getAlignment());
9815
9816 // Replace the first store with the new store
9817 CombineTo(EarliestOp, NewStore);
9818 // Erase all other stores.
9819 for (unsigned i = 0; i < NumElem ; ++i) {
9820 if (StoreNodes[i].MemNode == EarliestOp)
9821 continue;
9822 StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
9823 // ReplaceAllUsesWith will replace all uses that existed when it was
9824 // called, but graph optimizations may cause new ones to appear. For
9825 // example, the case in pr14333 looks like
9826 //
9827 // St's chain -> St -> another store -> X
9828 //
9829 // And the only difference from St to the other store is the chain.
9830 // When we change it's chain to be St's chain they become identical,
9831 // get CSEed and the net result is that X is now a use of St.
9832 // Since we know that St is redundant, just iterate.
9833 while (!St->use_empty())
9834 DAG.ReplaceAllUsesWith(SDValue(St, 0), St->getChain());
9835 deleteAndRecombine(St);
9836 }
9837
9838 return true;
9839 }
97219840
97229841 bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
97239842 EVT MemVT = St->getMemoryVT();
97309849 return false;
97319850
97329851 // Perform an early exit check. Do not bother looking at stored values that
9733 // are not constants or loads.
9852 // are not constants, loads, or extracted vector elements.
97349853 SDValue StoredVal = St->getValue();
97359854 bool IsLoadSrc = isa<LoadSDNode>(StoredVal);
9736 if (!isa<ConstantSDNode>(StoredVal) && !isa<ConstantFPSDNode>(StoredVal) &&
9737 !IsLoadSrc)
9855 bool IsConstantSrc = isa<ConstantSDNode>(StoredVal) ||
9856 isa<ConstantFPSDNode>(StoredVal);
9857 bool IsExtractVecEltSrc = (StoredVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT);
9858
9859 if (!IsConstantSrc && !IsLoadSrc && !IsExtractVecEltSrc)
97389860 return false;
97399861
97409862 // Only look at ends of store sequences.
98769998 LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
98779999
987810000 // Store the constants into memory as one consecutive store.
9879 if (!IsLoadSrc) {
10001 if (IsConstantSrc) {
988010002 unsigned LastLegalType = 0;
988110003 unsigned LastLegalVectorType = 0;
988210004 bool NonZero = false;
992510047 bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors;
992610048 unsigned NumElem = UseVector ? LastLegalVectorType : LastLegalType;
992710049
9928 // Make sure we have something to merge.
9929 if (NumElem < 2)
9930 return false;
9931
9932 unsigned EarliestNodeUsed = 0;
9933 for (unsigned i=0; i < NumElem; ++i) {
9934 // Find a chain for the new wide-store operand. Notice that some
9935 // of the store nodes that we found may not be selected for inclusion
9936 // in the wide store. The chain we use needs to be the chain of the
9937 // earliest store node which is *used* and replaced by the wide store.
9938 if (StoreNodes[i].SequenceNum > StoreNodes[EarliestNodeUsed].SequenceNum)
9939 EarliestNodeUsed = i;
9940 }
9941
9942 // The earliest Node in the DAG.
9943 LSBaseSDNode *EarliestOp = StoreNodes[EarliestNodeUsed].MemNode;
9944 SDLoc DL(StoreNodes[0].MemNode);
9945
9946 SDValue StoredVal;
9947 if (UseVector) {
10050 return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem,
10051 true, UseVector);
10052 }
10053
10054 // When extracting multiple vector elements, try to store them
10055 // in one vector store rather than a sequence of scalar stores.
10056 if (IsExtractVecEltSrc) {
10057 unsigned NumElem = 0;
10058 for (unsigned i = 0; i < LastConsecutiveStore + 1; ++i) {
10059 StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
10060 SDValue StoredVal = St->getValue();
10061 // This restriction could be loosened.
10062 // Bail out if any stored values are not elements extracted from a vector.
10063 // It should be possible to handle mixed sources, but load sources need
10064 // more careful handling (see the block of code below that handles
10065 // consecutive loads).
10066 if (StoredVal.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
10067 return false;
10068
994810069 // Find a legal type for the vector store.
9949 EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, NumElem);
9950 assert(TLI.isTypeLegal(Ty) && "Illegal vector store");
9951 StoredVal = DAG.getConstant(0, Ty);
9952 } else {
9953 unsigned StoreBW = NumElem * ElementSizeBytes * 8;
9954 APInt StoreInt(StoreBW, 0);
9955
9956 // Construct a single integer constant which is made of the smaller
9957 // constant inputs.
9958 bool IsLE = TLI.isLittleEndian();
9959 for (unsigned i = 0; i < NumElem ; ++i) {
9960 unsigned Idx = IsLE ?(NumElem - 1 - i) : i;
9961 StoreSDNode *St = cast<StoreSDNode>(StoreNodes[Idx].MemNode);
9962 SDValue Val = St->getValue();
9963 StoreInt<<=ElementSizeBytes*8;
9964 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) {
9965 StoreInt|=C->getAPIntValue().zext(StoreBW);
9966 } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val)) {
9967 StoreInt|= C->getValueAPF().bitcastToAPInt().zext(StoreBW);
9968 } else {
9969 llvm_unreachable("Invalid constant element type");
9970 }
9971 }
9972
9973 // Create the new Load and Store operations.
9974 EVT StoreTy = EVT::getIntegerVT(*DAG.getContext(), StoreBW);
9975 StoredVal = DAG.getConstant(StoreInt, StoreTy);
9976 }
9977
9978 SDValue NewStore = DAG.getStore(EarliestOp->getChain(), DL, StoredVal,
9979 FirstInChain->getBasePtr(),
9980 FirstInChain->getPointerInfo(),
9981 false, false,
9982 FirstInChain->getAlignment());
9983
9984 // Replace the first store with the new store
9985 CombineTo(EarliestOp, NewStore);
9986 // Erase all other stores.
9987 for (unsigned i = 0; i < NumElem ; ++i) {
9988 if (StoreNodes[i].MemNode == EarliestOp)
9989 continue;
9990 StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
9991 // ReplaceAllUsesWith will replace all uses that existed when it was
9992 // called, but graph optimizations may cause new ones to appear. For
9993 // example, the case in pr14333 looks like
9994 //
9995 // St's chain -> St -> another store -> X
9996 //
9997 // And the only difference from St to the other store is the chain.
9998 // When we change it's chain to be St's chain they become identical,
9999 // get CSEed and the net result is that X is now a use of St.
10000 // Since we know that St is redundant, just iterate.
10001 while (!St->use_empty())
10002 DAG.ReplaceAllUsesWith(SDValue(St, 0), St->getChain());
10003 deleteAndRecombine(St);
10004 }
10005
10006 return true;
10070 EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, i+1);
10071 if (TLI.isTypeLegal(Ty))
10072 NumElem = i + 1;
10073 }
10074
10075 return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem,
10076 false, true);
1000710077 }
1000810078
1000910079 // Below we handle the case of multiple consecutive stores that
433433 ;
434434 ret void
435435 }
436
437 ; PR21711 ( http://llvm.org/bugs/show_bug.cgi?id=21711 )
438 define void @merge_vec_element_store(<8 x float> %v, float* %ptr) {
439 %vecext0 = extractelement <8 x float> %v, i32 0
440 %vecext1 = extractelement <8 x float> %v, i32 1
441 %vecext2 = extractelement <8 x float> %v, i32 2
442 %vecext3 = extractelement <8 x float> %v, i32 3
443 %vecext4 = extractelement <8 x float> %v, i32 4
444 %vecext5 = extractelement <8 x float> %v, i32 5
445 %vecext6 = extractelement <8 x float> %v, i32 6
446 %vecext7 = extractelement <8 x float> %v, i32 7
447 %arrayidx1 = getelementptr inbounds float* %ptr, i64 1
448 %arrayidx2 = getelementptr inbounds float* %ptr, i64 2
449 %arrayidx3 = getelementptr inbounds float* %ptr, i64 3
450 %arrayidx4 = getelementptr inbounds float* %ptr, i64 4
451 %arrayidx5 = getelementptr inbounds float* %ptr, i64 5
452 %arrayidx6 = getelementptr inbounds float* %ptr, i64 6
453 %arrayidx7 = getelementptr inbounds float* %ptr, i64 7
454 store float %vecext0, float* %ptr, align 4
455 store float %vecext1, float* %arrayidx1, align 4
456 store float %vecext2, float* %arrayidx2, align 4
457 store float %vecext3, float* %arrayidx3, align 4
458 store float %vecext4, float* %arrayidx4, align 4
459 store float %vecext5, float* %arrayidx5, align 4
460 store float %vecext6, float* %arrayidx6, align 4
461 store float %vecext7, float* %arrayidx7, align 4
462 ret void
463
464 ; CHECK-LABEL: merge_vec_element_store
465 ; CHECK: vmovups
466 ; CHECK-NEXT: vzeroupper
467 ; CHECK-NEXT: retq
468 }
469
470 ; This is a minimized test based on real code that was failing.
471 ; We could merge stores (and loads) like this...
472
473 define void @merge_vec_element_and_scalar_load([6 x i64]* %array) {
474 %idx0 = getelementptr inbounds [6 x i64]* %array, i64 0, i64 0
475 %idx1 = getelementptr inbounds [6 x i64]* %array, i64 0, i64 1
476 %idx4 = getelementptr inbounds [6 x i64]* %array, i64 0, i64 4
477 %idx5 = getelementptr inbounds [6 x i64]* %array, i64 0, i64 5
478
479 %a0 = load i64* %idx0, align 8
480 store i64 %a0, i64* %idx4, align 8
481
482 %b = bitcast i64* %idx1 to <2 x i64>*
483 %v = load <2 x i64>* %b, align 8
484 %a1 = extractelement <2 x i64> %v, i32 0
485 store i64 %a1, i64* %idx5, align 8
486 ret void
487
488 ; CHECK-LABEL: merge_vec_element_and_scalar_load
489 ; CHECK: movq (%rdi), %rax
490 ; CHECK-NEXT: movq %rax, 32(%rdi)
491 ; CHECK-NEXT: movq 8(%rdi), %rax
492 ; CHECK-NEXT: movq %rax, 40(%rdi)
493 ; CHECK-NEXT: retq
494 }