Masked Load / Store Intrinsics - the CodeGen part.

I'm recommitting the codegen part of the patch. The vectorizer part will be sent for review again.

Masked Vector Load and Store Intrinsics.
Introduced new target-independent intrinsics in order to support masked vector loads and stores. The loop vectorizer optimizes loops containing conditional memory accesses by generating these intrinsics for the existing targets AVX2 and AVX-512. The vectorizer asks the target about the availability of masked vector loads and stores. Added SDNodes for masked operations and lowering patterns for the X86 code generator.

Examples:
<16 x i32> @llvm.masked.load.v16i32(i8* %addr, <16 x i32> %passthru, i32 4 /* align */, <16 x i1> %mask)
declare void @llvm.masked.store.v8f64(i8* %addr, <8 x double> %value, i32 4, <8 x i1> %mask)

A scalarizer for other targets (not AVX2/AVX-512) will be done in a separate patch.

http://reviews.llvm.org/D6191

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@223348 91177308-0d34-0410-b5e6-96231b3b80d8

Elena Demikhovsky
27 changed files with 873 additions and 12 deletions.
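For orientation, here is a minimal, hedged C++ sketch (not part of this commit) of how a client such as the loop vectorizer could emit the new intrinsics through the IRBuilder helpers added below. Only CreateMaskedLoad/CreateMaskedStore and the intrinsic operand order (pointer, passthru/value, alignment, mask) come from this patch; the function name and surrounding values are assumptions.

  // Emits a masked load from SrcPtr followed by a masked store to DstPtr,
  // using the ArrayRef-based helpers introduced in this patch.
  #include "llvm/IR/IRBuilder.h"
  using namespace llvm;

  static Value *emitMaskedCopy(IRBuilder<> &Builder, Value *SrcPtr,
                               Value *DstPtr, Value *PassThru, Value *Mask) {
    // @llvm.masked.load:  (i8* addr, <N x T> passthru, i32 align, <N x i1> mask)
    Value *LoadOps[] = {SrcPtr, PassThru, Builder.getInt32(4), Mask};
    Value *Val = Builder.CreateMaskedLoad(LoadOps);

    // @llvm.masked.store: (i8* addr, <N x T> value, i32 align, <N x i1> mask)
    Value *StoreOps[] = {DstPtr, Val, Builder.getInt32(4), Mask};
    Builder.CreateMaskedStore(StoreOps);
    return Val;
  }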
269269 int64_t BaseOffset, bool HasBaseReg,
270270 int64_t Scale) const;
271271
272 /// \brief Return true if the target works with masked instructions.
273 /// AVX2 allows masks for consecutive load and store for i32 and i64 elements.
274 /// AVX-512 architecture will also allow masks for non-consecutive memory
275 /// accesses.
276 virtual bool isLegalPredicatedStore(Type *DataType, int Consecutive) const;
277 virtual bool isLegalPredicatedLoad (Type *DataType, int Consecutive) const;
278
272279 /// \brief Return the cost of the scaling factor used in the addressing
273280 /// mode represented by AM for this target, for a load/store
274281 /// of the specified type.
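The two hooks above are how the (separately reviewed) vectorizer part is expected to discover whether a masked access can be emitted for the target. A hedged sketch of such a query, where TTI and Load are assumed to be in scope and only the hook signatures come from this patch:

  // Ask the target whether a consecutive, conditional load of this type can
  // be expressed with @llvm.masked.load instead of being scalarized.
  if (TTI->isLegalPredicatedLoad(Load->getType(), /*Consecutive=*/1)) {
    // ... emit the masked intrinsic via IRBuilder (see the sketch above) ...
  }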
673673 ATOMIC_LOAD_MAX,
674674 ATOMIC_LOAD_UMIN,
675675 ATOMIC_LOAD_UMAX,
676
677 // Masked load and store
678 MLOAD, MSTORE,
676679
677680 /// This corresponds to the llvm.lifetime.* intrinsics. The first operand
678681 /// is the chain and the second operand is the alloca pointer.
865865 SDValue getIndexedStore(SDValue OrigStoe, SDLoc dl, SDValue Base,
866866 SDValue Offset, ISD::MemIndexedMode AM);
867867
868 SDValue getMaskedLoad(EVT VT, SDLoc dl, SDValue Chain, SDValue Ptr,
869 SDValue Mask, SDValue Src0, MachineMemOperand *MMO);
870 SDValue getMaskedStore(SDValue Chain, SDLoc dl, SDValue Val,
871 SDValue Ptr, SDValue Mask, MachineMemOperand *MMO);
868872 /// getSrcValue - Construct a node to track a Value* through the backend.
869873 SDValue getSrcValue(const Value *v);
870874
11761176 N->getOpcode() == ISD::ATOMIC_LOAD_UMAX ||
11771177 N->getOpcode() == ISD::ATOMIC_LOAD ||
11781178 N->getOpcode() == ISD::ATOMIC_STORE ||
1179 N->getOpcode() == ISD::MLOAD ||
1180 N->getOpcode() == ISD::MSTORE ||
11791181 N->isMemIntrinsic() ||
11801182 N->isTargetMemoryOpcode();
11811183 }
19251927 }
19261928 };
19271929
1930 /// MaskedLoadStoreSDNode - This is a base class used to represent MLOAD and
1931 /// MSTORE nodes
1932 ///
1933 class MaskedLoadStoreSDNode : public MemSDNode {
1934 // Operands
1935 SDUse Ops[4];
1936 public:
1937 friend class SelectionDAG;
1938 MaskedLoadStoreSDNode(ISD::NodeType NodeTy, unsigned Order, DebugLoc dl,
1939 SDValue *Operands, unsigned numOperands,
1940 SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
1941 : MemSDNode(NodeTy, Order, dl, VTs, MemVT, MMO) {
1942 InitOperands(Ops, Operands, numOperands);
1943 }
1944
1945 // In both nodes, the address is Op1 and the mask is Op2:
1946 // MaskedLoadSDNode (Chain, ptr, mask, src0), src0 is a passthru value
1947 // MaskedStoreSDNode (Chain, ptr, mask, data)
1948 // Mask is a vector of i1 elements
1949 const SDValue &getBasePtr() const { return getOperand(1); }
1950 const SDValue &getMask() const { return getOperand(2); }
1951
1952 static bool classof(const SDNode *N) {
1953 return N->getOpcode() == ISD::MLOAD ||
1954 N->getOpcode() == ISD::MSTORE;
1955 }
1956 };
1957
1958 /// MaskedLoadSDNode - This class is used to represent an MLOAD node
1959 ///
1960 class MaskedLoadSDNode : public MaskedLoadStoreSDNode {
1961 public:
1962 friend class SelectionDAG;
1963 MaskedLoadSDNode(unsigned Order, DebugLoc dl,
1964 SDValue *Operands, unsigned numOperands,
1965 SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
1966 : MaskedLoadStoreSDNode(ISD::MLOAD, Order, dl, Operands, numOperands,
1967 VTs, MemVT, MMO)
1968 {}
1969
1970 const SDValue &getSrc0() const { return getOperand(3); }
1971 static bool classof(const SDNode *N) {
1972 return N->getOpcode() == ISD::MLOAD;
1973 }
1974 };
1975
1976 /// MaskedStoreSDNode - This class is used to represent an MSTORE node
1977 ///
1978 class MaskedStoreSDNode : public MaskedLoadStoreSDNode {
1979
1980 public:
1981 friend class SelectionDAG;
1982 MaskedStoreSDNode(unsigned Order, DebugLoc dl,
1983 SDValue *Operands, unsigned numOperands,
1984 SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
1985 : MaskedLoadStoreSDNode(ISD::MSTORE, Order, dl, Operands, numOperands,
1986 VTs, MemVT, MMO)
1987 {}
1988
1989 const SDValue &getData() const { return getOperand(3); }
1990
1991 static bool classof(const SDNode *N) {
1992 return N->getOpcode() == ISD::MSTORE;
1993 }
1994 };
1995
19281996 /// MachineSDNode - An SDNode that represents everything that will be needed
19291997 /// to construct a MachineInstr. These nodes are created during the
19301998 /// instruction selection proper phase.
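A hedged sketch of how backend code elsewhere in this patch (DAG combines, X86 lowering) reads the operands back out of the new nodes; Op is assumed to be an SDValue known to wrap an ISD::MSTORE node, and only the accessor names come from the class definitions above:

  // Recover the chain, address, mask and stored value from a masked store.
  MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(Op.getNode());
  SDValue Chain = MST->getChain();    // operand 0
  SDValue Ptr   = MST->getBasePtr();  // operand 1
  SDValue Mask  = MST->getMask();     // operand 2, a vector of i1 elements
  SDValue Data  = MST->getData();     // operand 3, the value being stored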
428428 /// If the pointer isn't i8* it will be converted.
429429 CallInst *CreateLifetimeEnd(Value *Ptr, ConstantInt *Size = nullptr);
430430
431 /// \brief Create a call to the Masked Load intrinsic.
432 CallInst *CreateMaskedLoad(ArrayRef<Value *> Ops);
433
434 /// \brief Create a call to the Masked Store intrinsic.
435 CallInst *CreateMaskedStore(ArrayRef<Value *> Ops);
436
431437 /// \brief Create an assume intrinsic call that allows the optimizer to
432438 /// assume that the provided condition will be true.
433439 CallInst *CreateAssumption(Value *Cond);
434440
435441 private:
442 /// \brief Create a call to a masked intrinsic with the given Id.
443 /// Masked intrinsics have only one overloaded type - the data type.
444 CallInst *CreateMaskedIntrinsic(unsigned Id, ArrayRef<Value *> Ops,
445 Type *DataTy);
446
436447 Value *getCastedInt8PtrValue(Value *Ptr);
437448 };
438449
7575 enum IITDescriptorKind {
7676 Void, VarArg, MMX, Metadata, Half, Float, Double,
7777 Integer, Vector, Pointer, Struct,
78 Argument, ExtendArgument, TruncArgument, HalfVecArgument
78 Argument, ExtendArgument, TruncArgument, HalfVecArgument,
79 SameVecWidthArgument
7980 } Kind;
8081
8182 union {
9596 };
9697 unsigned getArgumentNumber() const {
9798 assert(Kind == Argument || Kind == ExtendArgument ||
98 Kind == TruncArgument || Kind == HalfVecArgument);
99 Kind == TruncArgument || Kind == HalfVecArgument ||
100 Kind == SameVecWidthArgument);
99101 return Argument_Info >> 2;
100102 }
101103 ArgKind getArgumentKind() const {
102104 assert(Kind == Argument || Kind == ExtendArgument ||
103 Kind == TruncArgument || Kind == HalfVecArgument);
104 return (ArgKind)(Argument_Info&3);
105 Kind == TruncArgument || Kind == HalfVecArgument ||
106 Kind == SameVecWidthArgument);
107 return (ArgKind)(Argument_Info & 3);
105108 }
106109
107110 static IITDescriptor get(IITDescriptorKind K, unsigned Field) {
111111 // the intrinsic is overloaded, so the matched type should be declared as iAny.
112112 class LLVMExtendedType<int num> : LLVMMatchType<num>;
113113 class LLVMTruncatedType<int num> : LLVMMatchType<num>;
114 class LLVMVectorSameWidth<int num, LLVMType elty>
115 : LLVMMatchType<num> {
116 ValueType ElTy = elty.VT;
117 }
114118
115119 // Match the type of another intrinsic parameter that is expected to be a
116120 // vector type, but change the element count to be half as many
554558 def int_clear_cache : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty],
555559 [], "llvm.clear_cache">;
556560
561 //===-------------------------- Masked Intrinsics -------------------------===//
562 //
563 def int_masked_store : Intrinsic<[], [llvm_ptr_ty, llvm_anyvector_ty,
564 llvm_i32_ty,
565 LLVMVectorSameWidth<0, llvm_i1_ty>],
566 [IntrReadWriteArgMem]>;
567
568 def int_masked_load : Intrinsic<[llvm_anyvector_ty],
569 [llvm_ptr_ty, LLVMMatchType<0>, llvm_i32_ty,
570 LLVMVectorSameWidth<0, llvm_i1_ty>],
571 [IntrReadArgMem]>;
557572 //===----------------------------------------------------------------------===//
558573 // Target-specific intrinsics
559574 //===----------------------------------------------------------------------===//
185185
186186 def SDTIStore : SDTypeProfile<1, 3, [ // indexed store
187187 SDTCisSameAs<0, 2>, SDTCisPtrTy<0>, SDTCisPtrTy<3>
188 ]>;
189
190 def SDTMaskedStore: SDTypeProfile<0, 3, [ // masked store
191 SDTCisPtrTy<0>, SDTCisVec<1>, SDTCisVec<2>
192 ]>;
193
194 def SDTMaskedLoad: SDTypeProfile<1, 3, [ // masked load
195 SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisVec<2>, SDTCisSameAs<0, 3>
188196 ]>;
189197
190198 def SDTVecShuffle : SDTypeProfile<1, 2, [
452460 [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
453461 def atomic_store : SDNode<"ISD::ATOMIC_STORE", SDTAtomicStore,
454462 [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
463
464 def masked_store : SDNode<"ISD::MSTORE", SDTMaskedStore,
465 [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
466 def masked_load : SDNode<"ISD::MLOAD", SDTMaskedLoad,
467 [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
455468
456469 // Do not use ld, st directly. Use load, extload, sextload, zextload, store,
457470 // and truncst (see below).
9999 bool TargetTransformInfo::isLegalICmpImmediate(int64_t Imm) const {
100100 return PrevTTI->isLegalICmpImmediate(Imm);
101101 }
102
103 bool TargetTransformInfo::isLegalPredicatedLoad(Type *DataType,
104 int Consecutive) const {
105 return false;
106 }
107
108 bool TargetTransformInfo::isLegalPredicatedStore(Type *DataType,
109 int Consecutive) const {
110 return false;
111 }
112
102113
103114 bool TargetTransformInfo::isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
104115 int64_t BaseOffset,
302302 SDValue visitEXTRACT_SUBVECTOR(SDNode *N);
303303 SDValue visitVECTOR_SHUFFLE(SDNode *N);
304304 SDValue visitINSERT_SUBVECTOR(SDNode *N);
305 SDValue visitMLOAD(SDNode *N);
306 SDValue visitMSTORE(SDNode *N);
305307
306308 SDValue XformToShuffleWithZero(SDNode *N);
307309 SDValue ReassociateOps(unsigned Opc, SDLoc DL, SDValue LHS, SDValue RHS);
13501352 case ISD::EXTRACT_SUBVECTOR: return visitEXTRACT_SUBVECTOR(N);
13511353 case ISD::VECTOR_SHUFFLE: return visitVECTOR_SHUFFLE(N);
13521354 case ISD::INSERT_SUBVECTOR: return visitINSERT_SUBVECTOR(N);
1355 case ISD::MLOAD: return visitMLOAD(N);
1356 case ISD::MSTORE: return visitMSTORE(N);
13531357 }
13541358 return SDValue();
13551359 }
47704774 TopHalf->isNullValue() ? RHS->getOperand(1) : LHS->getOperand(1));
47714775 }
47724776
4777 SDValue DAGCombiner::visitMSTORE(SDNode *N) {
4778
4779 if (Level >= AfterLegalizeTypes)
4780 return SDValue();
4781
4782 MaskedStoreSDNode *MST = dyn_cast<MaskedStoreSDNode>(N);
4783 SDValue Mask = MST->getMask();
4784 SDValue Data = MST->getData();
4785 SDLoc DL(N);
4786
4787 // If the MSTORE data type requires splitting and the mask is provided by a
4788 // SETCC, then split both nodes and their operands before legalization. This
4789 // prevents the type legalizer from unrolling SETCC into scalar comparisons
4790 // and enables future optimizations (e.g. min/max pattern matching on X86).
4791 if (Mask.getOpcode() == ISD::SETCC) {
4792
4793 // Check if any splitting is required.
4794 if (TLI.getTypeAction(*DAG.getContext(), Data.getValueType()) !=
4795 TargetLowering::TypeSplitVector)
4796 return SDValue();
4797
4798 SDValue MaskLo, MaskHi, Lo, Hi;
4799 std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG);
4800
4801 EVT LoVT, HiVT;
4802 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MST->getValueType(0));
4803
4804 SDValue Chain = MST->getChain();
4805 SDValue Ptr = MST->getBasePtr();
4806
4807 EVT MemoryVT = MST->getMemoryVT();
4808 unsigned Alignment = MST->getOriginalAlignment();
4809
4810 // If Alignment is equal to the vector size,
4811 // use half of it for the second part.
4812 unsigned SecondHalfAlignment =
4813 (Alignment == Data->getValueType(0).getSizeInBits()/8) ?
4814 Alignment/2 : Alignment;
4815
4816 EVT LoMemVT, HiMemVT;
4817 std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
4818
4819 SDValue DataLo, DataHi;
4820 std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);
4821
4822 MachineMemOperand *MMO = DAG.getMachineFunction().
4823 getMachineMemOperand(MST->getPointerInfo(),
4824 MachineMemOperand::MOStore, LoMemVT.getStoreSize(),
4825 Alignment, MST->getAAInfo(), MST->getRanges());
4826
4827 Lo = DAG.getMaskedStore(Chain, DL, DataLo, Ptr, MaskLo, MMO);
4828
4829 unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
4830 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
4831 DAG.getConstant(IncrementSize, Ptr.getValueType()));
4832
4833 MMO = DAG.getMachineFunction().
4834 getMachineMemOperand(MST->getPointerInfo(),
4835 MachineMemOperand::MOStore, HiMemVT.getStoreSize(),
4836 SecondHalfAlignment, MST->getAAInfo(),
4837 MST->getRanges());
4838
4839 Hi = DAG.getMaskedStore(Chain, DL, DataHi, Ptr, MaskHi, MMO);
4840
4841 AddToWorklist(Lo.getNode());
4842 AddToWorklist(Hi.getNode());
4843
4844 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
4845 }
4846 return SDValue();
4847 }
4848
4849 SDValue DAGCombiner::visitMLOAD(SDNode *N) {
4850
4851 if (Level >= AfterLegalizeTypes)
4852 return SDValue();
4853
4854 MaskedLoadSDNode *MLD = dyn_cast<MaskedLoadSDNode>(N);
4855 SDValue Mask = MLD->getMask();
4856 SDLoc DL(N);
4857
4858 // If the MLOAD result requires splitting and the mask is provided by a
4859 // SETCC, then split both nodes and their operands before legalization. This
4860 // prevents the type legalizer from unrolling SETCC into scalar comparisons
4861 // and enables future optimizations (e.g. min/max pattern matching on X86).
4862
4863 if (Mask.getOpcode() == ISD::SETCC) {
4864 EVT VT = N->getValueType(0);
4865
4866 // Check if any splitting is required.
4867 if (TLI.getTypeAction(*DAG.getContext(), VT) !=
4868 TargetLowering::TypeSplitVector)
4869 return SDValue();
4870
4871 SDValue MaskLo, MaskHi, Lo, Hi;
4872 std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG);
4873
4874 SDValue Src0 = MLD->getSrc0();
4875 SDValue Src0Lo, Src0Hi;
4876 std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, DL);
4877
4878 EVT LoVT, HiVT;
4879 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MLD->getValueType(0));
4880
4881 SDValue Chain = MLD->getChain();
4882 SDValue Ptr = MLD->getBasePtr();
4883 EVT MemoryVT = MLD->getMemoryVT();
4884 unsigned Alignment = MLD->getOriginalAlignment();
4885
4886 // If Alignment is equal to the vector size,
4887 // use half of it for the second part.
4888 unsigned SecondHalfAlignment =
4889 (Alignment == MLD->getValueType(0).getSizeInBits()/8) ?
4890 Alignment/2 : Alignment;
4891
4892 EVT LoMemVT, HiMemVT;
4893 std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
4894
4895 MachineMemOperand *MMO = DAG.getMachineFunction().
4896 getMachineMemOperand(MLD->getPointerInfo(),
4897 MachineMemOperand::MOLoad, LoMemVT.getStoreSize(),
4898 Alignment, MLD->getAAInfo(), MLD->getRanges());
4899
4900 Lo = DAG.getMaskedLoad(LoVT, DL, Chain, Ptr, MaskLo, Src0Lo, MMO);
4901
4902 unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
4903 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
4904 DAG.getConstant(IncrementSize, Ptr.getValueType()));
4905
4906 MMO = DAG.getMachineFunction().
4907 getMachineMemOperand(MLD->getPointerInfo(),
4908 MachineMemOperand::MOLoad, HiMemVT.getStoreSize(),
4909 SecondHalfAlignment, MLD->getAAInfo(), MLD->getRanges());
4910
4911 Hi = DAG.getMaskedLoad(HiVT, DL, Chain, Ptr, MaskHi, Src0Hi, MMO);
4912
4913 AddToWorklist(Lo.getNode());
4914 AddToWorklist(Hi.getNode());
4915
4916 // Build a factor node to remember that this load is independent of the
4917 // other one.
4918 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo.getValue(1),
4919 Hi.getValue(1));
4920
4921 // Legalized the chain result - switch anything that used the old chain to
4922 // use the new one.
4923 DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), Chain);
4924
4925 SDValue LoadRes = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
4926
4927 SDValue RetOps[] = { LoadRes, Chain };
4928 return DAG.getMergeValues(RetOps, DL);
4929 }
4930 return SDValue();
4931 }
4932
47734933 SDValue DAGCombiner::visitVSELECT(SDNode *N) {
47744934 SDValue N0 = N->getOperand(0);
47754935 SDValue N1 = N->getOperand(1);
824824 case ISD::SINT_TO_FP: Res = PromoteIntOp_SINT_TO_FP(N); break;
825825 case ISD::STORE: Res = PromoteIntOp_STORE(cast<StoreSDNode>(N),
826826 OpNo); break;
827 case ISD::MSTORE: Res = PromoteIntOp_MSTORE(cast<MaskedStoreSDNode>(N),
828 OpNo); break;
829 case ISD::MLOAD: Res = PromoteIntOp_MLOAD(cast<MaskedLoadSDNode>(N),
830 OpNo); break;
827831 case ISD::TRUNCATE: Res = PromoteIntOp_TRUNCATE(N); break;
828832 case ISD::FP16_TO_FP:
829833 case ISD::UINT_TO_FP: Res = PromoteIntOp_UINT_TO_FP(N); break;
10881092 // Truncate the value and store the result.
10891093 return DAG.getTruncStore(Ch, dl, Val, Ptr,
10901094 N->getMemoryVT(), N->getMemOperand());
1095 }
1096
1097 SDValue DAGTypeLegalizer::PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo){
1098
1099 assert(OpNo == 2 && "Only know how to promote the mask!");
1100 EVT DataVT = N->getOperand(3).getValueType();
1101 SDValue Mask = PromoteTargetBoolean(N->getOperand(OpNo), DataVT);
1102 SmallVector<SDValue, 4> NewOps(N->op_begin(), N->op_end());
1103 NewOps[OpNo] = Mask;
1104 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
1105 }
1106
1107 SDValue DAGTypeLegalizer::PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo){
1108 assert(OpNo == 2 && "Only know how to promote the mask!");
1109 EVT DataVT = N->getValueType(0);
1110 SDValue Mask = PromoteTargetBoolean(N->getOperand(OpNo), DataVT);
1111 SmallVector<SDValue, 4> NewOps(N->op_begin(), N->op_end());
1112 NewOps[OpNo] = Mask;
1113 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
10911114 }
10921115
10931116 SDValue DAGTypeLegalizer::PromoteIntOp_TRUNCATE(SDNode *N) {
284284 SDValue PromoteIntOp_TRUNCATE(SDNode *N);
285285 SDValue PromoteIntOp_UINT_TO_FP(SDNode *N);
286286 SDValue PromoteIntOp_ZERO_EXTEND(SDNode *N);
287 SDValue PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo);
288 SDValue PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo);
287289
288290 void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code);
289291
577579 void SplitVecRes_FPOWI(SDNode *N, SDValue &Lo, SDValue &Hi);
578580 void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi);
579581 void SplitVecRes_LOAD(LoadSDNode *N, SDValue &Lo, SDValue &Hi);
582 void SplitVecRes_MLOAD(MaskedLoadSDNode *N, SDValue &Lo, SDValue &Hi);
580583 void SplitVecRes_SCALAR_TO_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
581584 void SplitVecRes_SIGN_EXTEND_INREG(SDNode *N, SDValue &Lo, SDValue &Hi);
582585 void SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi);
593596 SDValue SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N);
594597 SDValue SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
595598 SDValue SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo);
599 SDValue SplitVecOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo);
596600 SDValue SplitVecOp_CONCAT_VECTORS(SDNode *N);
597601 SDValue SplitVecOp_TRUNCATE(SDNode *N);
598602 SDValue SplitVecOp_VSETCC(SDNode *N);
596596 case ISD::LOAD:
597597 SplitVecRes_LOAD(cast<LoadSDNode>(N), Lo, Hi);
598598 break;
599 case ISD::MLOAD:
600 SplitVecRes_MLOAD(cast<MaskedLoadSDNode>(N), Lo, Hi);
601 break;
599602 case ISD::SETCC:
600603 SplitVecRes_SETCC(N, Lo, Hi);
601604 break;
976979 // Legalized the chain result - switch anything that used the old chain to
977980 // use the new one.
978981 ReplaceValueWith(SDValue(LD, 1), Ch);
982 }
983
984 void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD,
985 SDValue &Lo, SDValue &Hi) {
986 EVT LoVT, HiVT;
987 SDLoc dl(MLD);
988 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MLD->getValueType(0));
989
990 SDValue Ch = MLD->getChain();
991 SDValue Ptr = MLD->getBasePtr();
992 SDValue Mask = MLD->getMask();
993 unsigned Alignment = MLD->getOriginalAlignment();
994
995 // If Alignment is equal to the vector size,
996 // use half of it for the second part.
997 unsigned SecondHalfAlignment =
998 (Alignment == MLD->getValueType(0).getSizeInBits()/8) ?
999 Alignment/2 : Alignment;
1000
1001 SDValue MaskLo, MaskHi;
1002 std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
1003
1004 EVT MemoryVT = MLD->getMemoryVT();
1005 EVT LoMemVT, HiMemVT;
1006 std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
1007
1008 SDValue Src0 = MLD->getSrc0();
1009 SDValue Src0Lo, Src0Hi;
1010 std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, dl);
1011
1012 MachineMemOperand *MMO = DAG.getMachineFunction().
1013 getMachineMemOperand(MLD->getPointerInfo(),
1014 MachineMemOperand::MOLoad, LoMemVT.getStoreSize(),
1015 Alignment, MLD->getAAInfo(), MLD->getRanges());
1016
1017 Lo = DAG.getMaskedLoad(LoVT, dl, Ch, Ptr, MaskLo, Src0Lo, MMO);
1018
1019 unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
1020 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
1021 DAG.getConstant(IncrementSize, Ptr.getValueType()));
1022
1023 MMO = DAG.getMachineFunction().
1024 getMachineMemOperand(MLD->getPointerInfo(),
1025 MachineMemOperand::MOLoad, HiMemVT.getStoreSize(),
1026 SecondHalfAlignment, MLD->getAAInfo(), MLD->getRanges());
1027
1028 Hi = DAG.getMaskedLoad(HiVT, dl, Ch, Ptr, MaskHi, Src0Hi, MMO);
1029
1030
1031 // Build a factor node to remember that this load is independent of the
1032 // other one.
1033 Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
1034 Hi.getValue(1));
1035
1036 // Legalized the chain result - switch anything that used the old chain to
1037 // use the new one.
1038 ReplaceValueWith(SDValue(MLD, 1), Ch);
1039
9791040 }
9801041
9811042 void DAGTypeLegalizer::SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi) {
12331294 case ISD::STORE:
12341295 Res = SplitVecOp_STORE(cast<StoreSDNode>(N), OpNo);
12351296 break;
1297 case ISD::MSTORE:
1298 Res = SplitVecOp_MSTORE(cast<MaskedStoreSDNode>(N), OpNo);
1299 break;
12361300 case ISD::VSELECT:
12371301 Res = SplitVecOp_VSELECT(N, OpNo);
12381302 break;
13921456 StackPtr = GetVectorElementPointer(StackPtr, EltVT, Idx);
13931457 return DAG.getExtLoad(ISD::EXTLOAD, dl, N->getValueType(0), Store, StackPtr,
13941458 MachinePointerInfo(), EltVT, false, false, false, 0);
1459 }
1460
1461 SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N,
1462 unsigned OpNo) {
1463 SDValue Ch = N->getChain();
1464 SDValue Ptr = N->getBasePtr();
1465 SDValue Mask = N->getMask();
1466 SDValue Data = N->getData();
1467 EVT MemoryVT = N->getMemoryVT();
1468 unsigned Alignment = N->getOriginalAlignment();
1469 SDLoc DL(N);
1470
1471 EVT LoMemVT, HiMemVT;
1472 std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
1473
1474 SDValue DataLo, DataHi;
1475 GetSplitVector(Data, DataLo, DataHi);
1476 SDValue MaskLo, MaskHi;
1477 GetSplitVector(Mask, MaskLo, MaskHi);
1478
1479 // If Alignment is equal to the vector size,
1480 // use half of it for the second part.
1481 unsigned SecondHalfAlignment =
1482 (Alignment == Data->getValueType(0).getSizeInBits()/8) ?
1483 Alignment/2 : Alignment;
1484
1485 SDValue Lo, Hi;
1486 MachineMemOperand *MMO = DAG.getMachineFunction().
1487 getMachineMemOperand(N->getPointerInfo(),
1488 MachineMemOperand::MOStore, LoMemVT.getStoreSize(),
1489 Alignment, N->getAAInfo(), N->getRanges());
1490
1491 Lo = DAG.getMaskedStore(Ch, DL, DataLo, Ptr, MaskLo, MMO);
1492
1493 unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
1494 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1495 DAG.getConstant(IncrementSize, Ptr.getValueType()));
1496
1497 MMO = DAG.getMachineFunction().
1498 getMachineMemOperand(N->getPointerInfo(),
1499 MachineMemOperand::MOStore, HiMemVT.getStoreSize(),
1500 SecondHalfAlignment, N->getAAInfo(), N->getRanges());
1501
1502 Hi = DAG.getMaskedStore(Ch, DL, DataHi, Ptr, MaskHi, MMO);
1503
1504
1505 // Build a factor node to remember that this store is independent of the
1506 // other one.
1507 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
1508
13951509 }
13961510
13971511 SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) {
49164916 return SDValue(N, 0);
49174917 }
49184918
4919 SDValue
4920 SelectionDAG::getMaskedLoad(EVT VT, SDLoc dl, SDValue Chain,
4921 SDValue Ptr, SDValue Mask, SDValue Src0,
4922 MachineMemOperand *MMO) {
4923
4924 SDVTList VTs = getVTList(VT, MVT::Other);
4925 SDValue Ops[] = { Chain, Ptr, Mask, Src0 };
4926 FoldingSetNodeID ID;
4927 AddNodeIDNode(ID, ISD::MLOAD, VTs, Ops);
4928 ID.AddInteger(VT.getRawBits());
4929 ID.AddInteger(encodeMemSDNodeFlags(ISD::NON_EXTLOAD, ISD::UNINDEXED,
4930 MMO->isVolatile(),
4931 MMO->isNonTemporal(),
4932 MMO->isInvariant()));
4933 ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
4934 void *IP = nullptr;
4935 if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
4936 cast<MaskedLoadSDNode>(E)->refineAlignment(MMO);
4937 return SDValue(E, 0);
4938 }
4939 SDNode *N = new (NodeAllocator) MaskedLoadSDNode(dl.getIROrder(),
4940 dl.getDebugLoc(), Ops, 4, VTs,
4941 VT, MMO);
4942 CSEMap.InsertNode(N, IP);
4943 InsertNode(N);
4944 return SDValue(N, 0);
4945 }
4946
4947 SDValue SelectionDAG::getMaskedStore(SDValue Chain, SDLoc dl, SDValue Val,
4948 SDValue Ptr, SDValue Mask, MachineMemOperand *MMO) {
4949 assert(Chain.getValueType() == MVT::Other &&
4950 "Invalid chain type");
4951 EVT VT = Val.getValueType();
4952 SDVTList VTs = getVTList(MVT::Other);
4953 SDValue Ops[] = { Chain, Ptr, Mask, Val };
4954 FoldingSetNodeID ID;
4955 AddNodeIDNode(ID, ISD::MSTORE, VTs, Ops);
4956 ID.AddInteger(VT.getRawBits());
4957 ID.AddInteger(encodeMemSDNodeFlags(false, ISD::UNINDEXED, MMO->isVolatile(),
4958 MMO->isNonTemporal(), MMO->isInvariant()));
4959 ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
4960 void *IP = nullptr;
4961 if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
4962 cast<MaskedStoreSDNode>(E)->refineAlignment(MMO);
4963 return SDValue(E, 0);
4964 }
4965 SDNode *N = new (NodeAllocator) MaskedStoreSDNode(dl.getIROrder(),
4966 dl.getDebugLoc(), Ops, 4,
4967 VTs, VT, MMO);
4968 CSEMap.InsertNode(N, IP);
4969 InsertNode(N);
4970 return SDValue(N, 0);
4971 }
4972
49194973 SDValue SelectionDAG::getVAArg(EVT VT, SDLoc dl,
49204974 SDValue Chain, SDValue Ptr,
49214975 SDValue SV,
36143614 SDValue StoreNode = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other,
36153615 makeArrayRef(Chains.data(), ChainI));
36163616 DAG.setRoot(StoreNode);
3617 }
3618
3619 void SelectionDAGBuilder::visitMaskedStore(const CallInst &I) {
3620 SDLoc sdl = getCurSDLoc();
3621
3622 Value *PtrOperand = I.getArgOperand(0);
3623 SDValue Ptr = getValue(PtrOperand);
3624 SDValue Src0 = getValue(I.getArgOperand(1));
3625 SDValue Mask = getValue(I.getArgOperand(3));
3626 EVT VT = Src0.getValueType();
3627 unsigned Alignment = (cast<ConstantInt>(I.getArgOperand(2)))->getZExtValue();
3628 if (!Alignment)
3629 Alignment = DAG.getEVTAlignment(VT);
3630
3631 AAMDNodes AAInfo;
3632 I.getAAMetadata(AAInfo);
3633
3634 MachineMemOperand *MMO =
3635 DAG.getMachineFunction().
3636 getMachineMemOperand(MachinePointerInfo(PtrOperand),
3637 MachineMemOperand::MOStore, VT.getStoreSize(),
3638 Alignment, AAInfo);
3639 SDValue StoreNode = DAG.getMaskedStore(getRoot(), sdl, Src0, Ptr, Mask, MMO);
3640 DAG.setRoot(StoreNode);
3641 setValue(&I, StoreNode);
3642 }
3643
3644 void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I) {
3645 SDLoc sdl = getCurSDLoc();
3646
3647 Value *PtrOperand = I.getArgOperand(0);
3648 SDValue Ptr = getValue(PtrOperand);
3649 SDValue Src0 = getValue(I.getArgOperand(1));
3650 SDValue Mask = getValue(I.getArgOperand(3));
3651
3652 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3653 EVT VT = TLI.getValueType(I.getType());
3654 unsigned Alignment = (cast<ConstantInt>(I.getArgOperand(2)))->getZExtValue();
3655 if (!Alignment)
3656 Alignment = DAG.getEVTAlignment(VT);
3657
3658 AAMDNodes AAInfo;
3659 I.getAAMetadata(AAInfo);
3660 const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range);
3661
3662 SDValue InChain = DAG.getRoot();
3663 if (AA->pointsToConstantMemory(
3664 AliasAnalysis::Location(PtrOperand,
3665 AA->getTypeStoreSize(I.getType()),
3666 AAInfo))) {
3667 // Do not serialize (non-volatile) loads of constant memory with anything.
3668 InChain = DAG.getEntryNode();
3669 }
3670
3671 MachineMemOperand *MMO =
3672 DAG.getMachineFunction().
3673 getMachineMemOperand(MachinePointerInfo(PtrOperand),
3674 MachineMemOperand::MOLoad, VT.getStoreSize(),
3675 Alignment, AAInfo, Ranges);
3676
3677 SDValue Load = DAG.getMaskedLoad(VT, sdl, InChain, Ptr, Mask, Src0, MMO);
3678 SDValue OutChain = Load.getValue(1);
3679 DAG.setRoot(OutChain);
3680 setValue(&I, Load);
36173681 }
36183682
36193683 void SelectionDAGBuilder::visitAtomicCmpXchg(const AtomicCmpXchgInst &I) {
49174981 return nullptr;
49184982 }
49194983
4984 case Intrinsic::masked_load:
4985 visitMaskedLoad(I);
4986 return nullptr;
4987 case Intrinsic::masked_store:
4988 visitMaskedStore(I);
4989 return nullptr;
49204990 case Intrinsic::x86_mmx_pslli_w:
49214991 case Intrinsic::x86_mmx_pslli_d:
49224992 case Intrinsic::x86_mmx_pslli_q:
768768 void visitAlloca(const AllocaInst &I);
769769 void visitLoad(const LoadInst &I);
770770 void visitStore(const StoreInst &I);
771 void visitMaskedLoad(const CallInst &I);
772 void visitMaskedStore(const CallInst &I);
771773 void visitAtomicCmpXchg(const AtomicCmpXchgInst &I);
772774 void visitAtomicRMW(const AtomicRMWInst &I);
773775 void visitFence(const FenceInst &I);
268268 // Other operators
269269 case ISD::LOAD: return "load";
270270 case ISD::STORE: return "store";
271 case ISD::MLOAD: return "masked_load";
272 case ISD::MSTORE: return "masked_store";
271273 case ISD::VAARG: return "vaarg";
272274 case ISD::VACOPY: return "vacopy";
273275 case ISD::VAEND: return "vaend";
550550 IIT_ANYPTR = 26,
551551 IIT_V1 = 27,
552552 IIT_VARARG = 28,
553 IIT_HALF_VEC_ARG = 29
553 IIT_HALF_VEC_ARG = 29,
554 IIT_SAME_VEC_WIDTH_ARG = 30
554555 };
555556
556557
655656 case IIT_HALF_VEC_ARG: {
656657 unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
657658 OutputTable.push_back(IITDescriptor::get(IITDescriptor::HalfVecArgument,
659 ArgInfo));
660 return;
661 }
662 case IIT_SAME_VEC_WIDTH_ARG: {
663 unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
664 OutputTable.push_back(IITDescriptor::get(IITDescriptor::SameVecWidthArgument,
658665 ArgInfo));
659666 return;
660667 }
765772 case IITDescriptor::HalfVecArgument:
766773 return VectorType::getHalfElementsVectorType(cast<VectorType>(
767774 Tys[D.getArgumentNumber()]));
768 }
775 case IITDescriptor::SameVecWidthArgument: {
776 Type *EltTy = DecodeFixedType(Infos, Tys, Context);
777 Type *Ty = Tys[D.getArgumentNumber()];
778 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
779 return VectorType::get(EltTy, VTy->getNumElements());
780 }
781 llvm_unreachable("unhandled");
782 }
769783 llvm_unreachable("unhandled");
770784 }
771785
182182 return createCallHelper(FnAssume, Ops, this);
183183 }
184184
185 /// Create a call to a Masked Load intrinsic.
186 /// Ops - an array of operands.
187 CallInst *IRBuilderBase::CreateMaskedLoad(ArrayRef<Value *> Ops) {
188 // The only overloaded type is the type of the passthru value in this case.
189 Type *DataTy = Ops[1]->getType();
190 return CreateMaskedIntrinsic(Intrinsic::masked_load, Ops, DataTy);
191 }
192
193 /// Create a call to a Masked Store intrinsic.
194 /// Ops - an array of operands.
195 CallInst *IRBuilderBase::CreateMaskedStore(ArrayRef<Value *> Ops) {
196 // DataTy - the type of the data to be stored - is the only overloaded type.
197 Type *DataTy = Ops[1]->getType();
198 return CreateMaskedIntrinsic(Intrinsic::masked_store, Ops, DataTy);
199 }
200
201 /// Create a call to a Masked intrinsic, with given intrinsic Id,
202 /// an array of operands - Ops, and one overloaded type - DataTy
203 CallInst *IRBuilderBase::CreateMaskedIntrinsic(unsigned Id,
204 ArrayRef<Value *> Ops,
205 Type *DataTy) {
206 Module *M = BB->getParent()->getParent();
207 Type *OverloadedTypes[] = { DataTy };
208 Value *TheFn = Intrinsic::getDeclaration(M, (Intrinsic::ID)Id, OverloadedTypes);
209 return createCallHelper(TheFn, Ops, this);
210 }
24052405 !isa<VectorType>(ArgTys[D.getArgumentNumber()]) ||
24062406 VectorType::getHalfElementsVectorType(
24072407 cast<VectorType>(ArgTys[D.getArgumentNumber()])) != Ty;
2408 case IITDescriptor::SameVecWidthArgument: {
2409 if (D.getArgumentNumber() >= ArgTys.size())
2410 return true;
2411 VectorType * ReferenceType =
2412 dyn_cast<VectorType>(ArgTys[D.getArgumentNumber()]);
2413 VectorType *ThisArgType = dyn_cast<VectorType>(Ty);
2414 if (!ThisArgType || !ReferenceType ||
2415 (ReferenceType->getVectorNumElements() !=
2416 ThisArgType->getVectorNumElements()))
2417 return true;
2418 return VerifyIntrinsicType(ThisArgType->getVectorElementType(),
2419 Infos, ArgTys);
2420 }
24082421 }
24092422 llvm_unreachable("unhandled");
24102423 }
13181318
13191319 // Extract subvector is special because the value type
13201320 // (result) is 128-bit but the source is 256-bit wide.
1321 if (VT.is128BitVector())
1321 if (VT.is128BitVector()) {
1322 if (VT.getScalarSizeInBits() >= 32) {
1323 setOperationAction(ISD::MLOAD, VT, Custom);
1324 setOperationAction(ISD::MSTORE, VT, Custom);
1325 }
13221326 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1323
1327 }
13241328 // Do not attempt to custom lower other non-256-bit vectors
13251329 if (!VT.is256BitVector())
13261330 continue;
13271331
1332 if (VT.getScalarSizeInBits() >= 32) {
1333 setOperationAction(ISD::MLOAD, VT, Legal);
1334 setOperationAction(ISD::MSTORE, VT, Legal);
1335 }
13281336 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
13291337 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
13301338 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
14911499 unsigned EltSize = VT.getVectorElementType().getSizeInBits();
14921500 // Extract subvector is special because the value type
14931501 // (result) is 256/128-bit but the source is 512-bit wide.
1494 if (VT.is128BitVector() || VT.is256BitVector())
1502 if (VT.is128BitVector() || VT.is256BitVector()) {
14951503 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1496
1504 if (EltSize >= 32) {
1505 setOperationAction(ISD::MLOAD, VT, Legal);
1506 setOperationAction(ISD::MSTORE, VT, Legal);
1507 }
1508 }
14971509 if (VT.getVectorElementType() == MVT::i1)
14981510 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
14991511
15091521 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
15101522 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
15111523 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1524 setOperationAction(ISD::MLOAD, VT, Legal);
1525 setOperationAction(ISD::MSTORE, VT, Legal);
15121526 }
15131527 }
15141528 for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
21212121 (VMOVUPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)),
21222122 VR512:$src)>;
21232123
2124 def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8f32 VR256:$src)),
2125 (VMOVUPSZmrk addr:$ptr,
2126 (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)),
2127 (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256:$src, sub_ymm))>;
2128
2129 def: Pat<(v8f32 (masked_load addr:$ptr, VK8WM:$mask, undef)),
2130 (v8f32 (EXTRACT_SUBREG (v16f32 (VMOVUPSZrmkz
2131 (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>;
2132
2133 def: Pat<(masked_store addr:$ptr, VK16WM:$mask, (v16f32 VR512:$src)),
2134 (VMOVUPSZmrk addr:$ptr, VK16WM:$mask, VR512:$src)>;
2135
2136 def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8f64 VR512:$src)),
2137 (VMOVUPDZmrk addr:$ptr, VK8WM:$mask, VR512:$src)>;
2138
2139 def: Pat<(v16f32 (masked_load addr:$ptr, VK16WM:$mask, undef)),
2140 (VMOVUPSZrmkz VK16WM:$mask, addr:$ptr)>;
2141
2142 def: Pat<(v16f32 (masked_load addr:$ptr, VK16WM:$mask,
2143 (bc_v16f32 (v16i32 immAllZerosV)))),
2144 (VMOVUPSZrmkz VK16WM:$mask, addr:$ptr)>;
2145
2146 def: Pat<(v16f32 (masked_load addr:$ptr, VK16WM:$mask, (v16f32 VR512:$src0))),
2147 (VMOVUPSZrmk VR512:$src0, VK16WM:$mask, addr:$ptr)>;
2148
2149 def: Pat<(v8f64 (masked_load addr:$ptr, VK8WM:$mask, undef)),
2150 (VMOVUPDZrmkz VK8WM:$mask, addr:$ptr)>;
2151
2152 def: Pat<(v8f64 (masked_load addr:$ptr, VK8WM:$mask,
2153 (bc_v8f64 (v16i32 immAllZerosV)))),
2154 (VMOVUPDZrmkz VK8WM:$mask, addr:$ptr)>;
2155
2156 def: Pat<(v8f64 (masked_load addr:$ptr, VK8WM:$mask, (v8f64 VR512:$src0))),
2157 (VMOVUPDZrmk VR512:$src0, VK8WM:$mask, addr:$ptr)>;
2158
21242159 defm VMOVDQA32 : avx512_load_vl<0x6F, "vmovdqa32", "alignedload", "i", "32",
21252160 "16", "8", "4", SSEPackedInt, HasAVX512>,
21262161 avx512_store_vl<0x7F, "vmovdqa32", "alignedstore",
21942229 (v16i32 VR512:$src))),
21952230 (VMOVDQU32Zrrkz (KNOTWrr VK16WM:$mask), VR512:$src)>;
21962231 }
2232
2233 def: Pat<(v16i32 (masked_load addr:$ptr, VK16WM:$mask, (v16i32 immAllZerosV))),
2234 (VMOVDQU32Zrmkz VK16WM:$mask, addr:$ptr)>;
2235
2236 def: Pat<(v16i32 (masked_load addr:$ptr, VK16WM:$mask, undef)),
2237 (VMOVDQU32Zrmkz VK16WM:$mask, addr:$ptr)>;
2238
2239 def: Pat<(v16i32 (masked_load addr:$ptr, VK16WM:$mask, (v16i32 VR512:$src0))),
2240 (VMOVDQU32Zrmk VR512:$src0, VK16WM:$mask, addr:$ptr)>;
2241
2242 def: Pat<(v8i64 (masked_load addr:$ptr, VK8WM:$mask,
2243 (bc_v8i64 (v16i32 immAllZerosV)))),
2244 (VMOVDQU64Zrmkz VK8WM:$mask, addr:$ptr)>;
2245
2246 def: Pat<(v8i64 (masked_load addr:$ptr, VK8WM:$mask, undef)),
2247 (VMOVDQU64Zrmkz VK8WM:$mask, addr:$ptr)>;
2248
2249 def: Pat<(v8i64 (masked_load addr:$ptr, VK8WM:$mask, (v8i64 VR512:$src0))),
2250 (VMOVDQU64Zrmk VR512:$src0, VK8WM:$mask, addr:$ptr)>;
2251
2252 def: Pat<(masked_store addr:$ptr, VK16WM:$mask, (v16i32 VR512:$src)),
2253 (VMOVDQU32Zmrk addr:$ptr, VK16WM:$mask, VR512:$src)>;
2254
2255 def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i64 VR512:$src)),
2256 (VMOVDQU64Zmrk addr:$ptr, VK8WM:$mask, VR512:$src)>;
2257
2258 // SKX replacement
2259 def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i32 VR256:$src)),
2260 (VMOVDQU32Z256mrk addr:$ptr, VK8WM:$mask, VR256:$src)>;
2261
2262 // KNL replacement
2263 def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i32 VR256:$src)),
2264 (VMOVDQU32Zmrk addr:$ptr,
2265 (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)),
2266 (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256:$src, sub_ymm))>;
2267
2268 def: Pat<(v8i32 (masked_load addr:$ptr, VK8WM:$mask, undef)),
2269 (v8i32 (EXTRACT_SUBREG (v16i32 (VMOVDQU32Zrmkz
2270 (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>;
2271
21972272
21982273 // Move Int Doubleword to Packed Double Int
21992274 //
92599259 int_x86_avx2_maskstore_q,
92609260 int_x86_avx2_maskstore_q_256>, VEX_W;
92619261
9262 def: Pat<(masked_store addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src)),
9263 (VPMASKMOVDYmr addr:$ptr, VR256:$mask, VR256:$src)>;
9264
9265 def: Pat<(masked_store addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src)),
9266 (VPMASKMOVDYmr addr:$ptr, VR256:$mask, VR256:$src)>;
9267
9268 def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)),
9269 (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>;
9270
9271 def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask),
9272 (bc_v8f32 (v8i32 immAllZerosV)))),
9273 (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>;
9274
9275 def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src0))),
9276 (VBLENDVPSYrr VR256:$src0, (VPMASKMOVDYrm VR256:$mask, addr:$ptr),
9277 VR256:$mask)>;
9278
9279 def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)),
9280 (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>;
9281
9282 def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8i32 immAllZerosV))),
9283 (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>;
9284
9285 def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src0))),
9286 (VBLENDVPSYrr VR256:$src0, (VPMASKMOVDYrm VR256:$mask, addr:$ptr),
9287 VR256:$mask)>;
9288
9289 def: Pat<(masked_store addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src)),
9290 (VPMASKMOVQYmr addr:$ptr, VR256:$mask, VR256:$src)>;
9291
9292 def: Pat<(masked_store addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src)),
9293 (VPMASKMOVQYmr addr:$ptr, VR256:$mask, VR256:$src)>;
9294
9295 def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)),
9296 (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>;
9297
9298 def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask),
9299 (v4f64 immAllZerosV))),
9300 (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>;
9301
9302 def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src0))),
9303 (VBLENDVPDYrr VR256:$src0, (VPMASKMOVQYrm VR256:$mask, addr:$ptr),
9304 VR256:$mask)>;
9305
9306 def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)),
9307 (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>;
9308
9309 def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask),
9310 (bc_v4i64 (v8i32 immAllZerosV)))),
9311 (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>;
9312
9313 def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src0))),
9314 (VBLENDVPDYrr VR256:$src0, (VPMASKMOVQYrm VR256:$mask, addr:$ptr),
9315 VR256:$mask)>;
9316
92629317
92639318 //===----------------------------------------------------------------------===//
92649319 // Variable Bit Shifts
110110 Type *Ty) const override;
111111 unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
112112 Type *Ty) const override;
113 bool isLegalPredicatedLoad (Type *DataType, int Consecutive) const override;
114 bool isLegalPredicatedStore(Type *DataType, int Consecutive) const override;
113115
114116 /// @}
115117 };
11551157 }
11561158 return X86TTI::getIntImmCost(Imm, Ty);
11571159 }
1160
1161 bool X86TTI::isLegalPredicatedLoad(Type *DataType, int Consecutive) const {
1162 int ScalarWidth = DataType->getScalarSizeInBits();
1163
1164 // Todo: AVX512 allows gather/scatter, works with strided and random as well
1165 if ((ScalarWidth < 32) || (Consecutive == 0))
1166 return false;
1167 if (ST->hasAVX512() || ST->hasAVX2())
1168 return true;
1169 return false;
1170 }
1171
1172 bool X86TTI::isLegalPredicatedStore(Type *DataType, int Consecutive) const {
1173 return isLegalPredicatedLoad(DataType, Consecutive);
1174 }
1175
0 ; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=knl < %s | FileCheck %s -check-prefix=AVX512
1 ; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX2
2
3 ; AVX512-LABEL: test1
4 ; AVX512: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
5
6 ; AVX2-LABEL: test1
7 ; AVX2: vpmaskmovd 32(%rdi)
8 ; AVX2: vpmaskmovd (%rdi)
9 ; AVX2-NOT: blend
10
11 define <16 x i32> @test1(<16 x i32> %trigger, i8* %addr) {
12 %mask = icmp eq <16 x i32> %trigger, zeroinitializer
13 %res = call <16 x i32> @llvm.masked.load.v16i32(i8* %addr, <16 x i32>undef, i32 4, <16 x i1>%mask)
14 ret <16 x i32> %res
15 }
16
17 ; AVX512-LABEL: test2
18 ; AVX512: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
19
20 ; AVX2-LABEL: test2
21 ; AVX2: vpmaskmovd {{.*}}(%rdi)
22 ; AVX2: vpmaskmovd {{.*}}(%rdi)
23 ; AVX2-NOT: blend
24 define <16 x i32> @test2(<16 x i32> %trigger, i8* %addr) {
25 %mask = icmp eq <16 x i32> %trigger, zeroinitializer
26 %res = call <16 x i32> @llvm.masked.load.v16i32(i8* %addr, <16 x i32>zeroinitializer, i32 4, <16 x i1>%mask)
27 ret <16 x i32> %res
28 }
29
30 ; AVX512-LABEL: test3
31 ; AVX512: vmovdqu32 %zmm1, (%rdi) {%k1}
32
33 define void @test3(<16 x i32> %trigger, i8* %addr, <16 x i32> %val) {
34 %mask = icmp eq <16 x i32> %trigger, zeroinitializer
35 call void @llvm.masked.store.v16i32(i8* %addr, <16 x i32>%val, i32 4, <16 x i1>%mask)
36 ret void
37 }
38
39 ; AVX512-LABEL: test4
40 ; AVX512: vmovups (%rdi), %zmm{{.*{%k[1-7]}}}
41
42 ; AVX2-LABEL: test4
43 ; AVX2: vpmaskmovd {{.*}}(%rdi)
44 ; AVX2: vpmaskmovd {{.*}}(%rdi)
45 ; AVX2: blend
46 define <16 x float> @test4(<16 x i32> %trigger, i8* %addr, <16 x float> %dst) {
47 %mask = icmp eq <16 x i32> %trigger, zeroinitializer
48 %res = call <16 x float> @llvm.masked.load.v16f32(i8* %addr, <16 x float>%dst, i32 4, <16 x i1>%mask)
49 ret <16 x float> %res
50 }
51
52 ; AVX512-LABEL: test5
53 ; AVX512: vmovupd (%rdi), %zmm1 {%k1}
54
55 ; AVX2-LABEL: test5
56 ; AVX2: vpmaskmovq
57 ; AVX2: vblendvpd
58 ; AVX2: vpmaskmovq
59 ; AVX2: vblendvpd
60 define <8 x double> @test5(<8 x i32> %trigger, i8* %addr, <8 x double> %dst) {
61 %mask = icmp eq <8 x i32> %trigger, zeroinitializer
62 %res = call <8 x double> @llvm.masked.load.v8f64(i8* %addr, <8 x double>%dst, i32 4, <8 x i1>%mask)
63 ret <8 x double> %res
64 }
65
66 declare <16 x i32> @llvm.masked.load.v16i32(i8*, <16 x i32>, i32, <16 x i1>)
67 declare void @llvm.masked.store.v16i32(i8*, <16 x i32>, i32, <16 x i1>)
68 declare <16 x float> @llvm.masked.load.v16f32(i8*, <16 x float>, i32, <16 x i1>)
69 declare void @llvm.masked.store.v16f32(i8*, <16 x float>, i32, <16 x i1>)
70 declare <8 x double> @llvm.masked.load.v8f64(i8*, <8 x double>, i32, <8 x i1>)
71 declare void @llvm.masked.store.v8f64(i8*, <8 x double>, i32, <8 x i1>)
72
533533 // variants with iAny types; otherwise, if the intrinsic is not
534534 // overloaded, all the types can be specified directly.
535535 assert(((!TyEl->isSubClassOf("LLVMExtendedType") &&
536 !TyEl->isSubClassOf("LLVMTruncatedType")) ||
536 !TyEl->isSubClassOf("LLVMTruncatedType") &&
537 !TyEl->isSubClassOf("LLVMVectorSameWidth")) ||
537538 VT == MVT::iAny || VT == MVT::vAny) &&
538539 "Expected iAny or vAny type");
539540 } else
256256 IIT_ANYPTR = 26,
257257 IIT_V1 = 27,
258258 IIT_VARARG = 28,
259 IIT_HALF_VEC_ARG = 29
259 IIT_HALF_VEC_ARG = 29,
260 IIT_SAME_VEC_WIDTH_ARG = 30
260261 };
261262
262263
304305 Sig.push_back(IIT_TRUNC_ARG);
305306 else if (R->isSubClassOf("LLVMHalfElementsVectorType"))
306307 Sig.push_back(IIT_HALF_VEC_ARG);
308 else if (R->isSubClassOf("LLVMVectorSameWidth")) {
309 Sig.push_back(IIT_SAME_VEC_WIDTH_ARG);
310 Sig.push_back((Number << 2) | ArgCodes[Number]);
311 MVT::SimpleValueType VT = getValueType(R->getValueAsDef("ElTy"));
312 EncodeFixedValueType(VT, Sig);
313 return;
314 }
307315 else
308316 Sig.push_back(IIT_ARG);
309317 return Sig.push_back((Number << 2) | ArgCodes[Number]);