llvm.org GIT mirror llvm / 56a8a27
Type legalizer for masked gather and scatter intrinsics. Full type legalizer that works with all vector lengths from 2 to 16 (i32, i64, float, double). For example, the intrinsic void @llvm.masked.scatter.v2f32(<2 x float> %data, <2 x float*> %ptrs, i32 align, <2 x i1> %mask) requires type widening for the data operand and type promotion for the mask.
Differential Revision: http://reviews.llvm.org/D13633
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@255629 91177308-0d34-0410-b5e6-96231b3b80d8
Elena Demikhovsky 4 years ago
9 changed file(s) with 2092 addition(s) and 228 deletion(s).
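For illustration only (this fragment is not part of the commit), a minimal IR example of the kind of call the new legalizer handles; the function name and the alignment value of 4 are invented for the sketch, and the intrinsic is written in the unsuffixed form used in the commit message and in the tests below:

declare void @llvm.masked.scatter.v2f32(<2 x float>, <2 x float*>, i32, <2 x i1>)

define void @scatter_v2f32(<2 x float> %data, <2 x float*> %ptrs, <2 x i1> %mask) {
  ; The <2 x float> data operand must be widened to a legal vector width and the
  ; <2 x i1> mask promoted before an AVX-512 scatter instruction can be selected.
  call void @llvm.masked.scatter.v2f32(<2 x float> %data, <2 x float*> %ptrs, i32 4, <2 x i1> %mask)
  ret void
}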
21212121 : MaskedGatherScatterSDNode(ISD::MGATHER, Order, dl, Operands, VTs, MemVT,
21222122 MMO) {
21232123 assert(getValue().getValueType() == getValueType(0) &&
2124 "Incompatible type of the PathThru value in MaskedGatherSDNode");
2124 "Incompatible type of the PassThru value in MaskedGatherSDNode");
21252125 assert(getMask().getValueType().getVectorNumElements() ==
2126 getValueType(0).getVectorNumElements() &&
2126 getValueType(0).getVectorNumElements() &&
21272127 "Vector width mismatch between mask and data");
2128 assert(getMask().getValueType().getScalarType() == MVT::i1 &&
2129 "Vector width mismatch between mask and data");
2128 assert(getIndex().getValueType().getVectorNumElements() ==
2129 getValueType(0).getVectorNumElements() &&
2130 "Vector width mismatch between index and data");
21302131 }
21312132
21322133 static bool classof(const SDNode *N) {
21422143 friend class SelectionDAG;
21432144 MaskedScatterSDNode(unsigned Order, DebugLoc dl, ArrayRef<SDValue> Operands,
21442145 SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
2145 : MaskedGatherScatterSDNode(ISD::MSCATTER, Order, dl, Operands, VTs,
2146 MemVT, MMO) {
2146 : MaskedGatherScatterSDNode(ISD::MSCATTER, Order, dl, Operands, VTs, MemVT,
2147 MMO) {
21472148 assert(getMask().getValueType().getVectorNumElements() ==
2148 getValue().getValueType().getVectorNumElements() &&
2149 getValue().getValueType().getVectorNumElements() &&
21492150 "Vector width mismatch between mask and data");
2150 assert(getMask().getValueType().getScalarType() == MVT::i1 &&
2151 "Vector width mismatch between mask and data");
2151 assert(getIndex().getValueType().getVectorNumElements() ==
2152 getValue().getValueType().getVectorNumElements() &&
2153 "Vector width mismatch between index and data");
21522154 }
21532155
21542156 static bool classof(const SDNode *N) {
6565 case ISD::CTTZ: Res = PromoteIntRes_CTTZ(N); break;
6666 case ISD::EXTRACT_VECTOR_ELT:
6767 Res = PromoteIntRes_EXTRACT_VECTOR_ELT(N); break;
68 case ISD::LOAD: Res = PromoteIntRes_LOAD(cast<LoadSDNode>(N));break;
69 case ISD::MLOAD: Res = PromoteIntRes_MLOAD(cast<MaskedLoadSDNode>(N));break;
68 case ISD::LOAD: Res = PromoteIntRes_LOAD(cast<LoadSDNode>(N)); break;
69 case ISD::MLOAD: Res = PromoteIntRes_MLOAD(cast<MaskedLoadSDNode>(N));
70 break;
71 case ISD::MGATHER: Res = PromoteIntRes_MGATHER(cast<MaskedGatherSDNode>(N));
72 break;
7073 case ISD::SELECT: Res = PromoteIntRes_SELECT(N); break;
7174 case ISD::VSELECT: Res = PromoteIntRes_VSELECT(N); break;
7275 case ISD::SELECT_CC: Res = PromoteIntRes_SELECT_CC(N); break;
180183 N->getChain(), N->getBasePtr(),
181184 N->getMemOperand(), N->getOrdering(),
182185 N->getSynchScope());
183 // Legalized the chain result - switch anything that used the old chain to
186 // Legalize the chain result - switch anything that used the old chain to
184187 // use the new one.
185188 ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
186189 return Res;
193196 N->getChain(), N->getBasePtr(),
194197 Op2, N->getMemOperand(), N->getOrdering(),
195198 N->getSynchScope());
196 // Legalized the chain result - switch anything that used the old chain to
199 // Legalize the chain result - switch anything that used the old chain to
197200 // use the new one.
198201 ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
199202 return Res;
478481 SDValue Res = DAG.getExtLoad(ExtType, dl, NVT, N->getChain(), N->getBasePtr(),
479482 N->getMemoryVT(), N->getMemOperand());
480483
481 // Legalized the chain result - switch anything that used the old chain to
484 // Legalize the chain result - switch anything that used the old chain to
482485 // use the new one.
483486 ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
484487 return Res;
488491 EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
489492 SDValue ExtSrc0 = GetPromotedInteger(N->getSrc0());
490493
491 SDValue Mask = N->getMask();
492 EVT NewMaskVT = getSetCCResultType(NVT);
493 if (NewMaskVT != N->getMask().getValueType())
494 Mask = PromoteTargetBoolean(Mask, NewMaskVT);
495 SDLoc dl(N);
496
494 SDLoc dl(N);
497495 SDValue Res = DAG.getMaskedLoad(NVT, dl, N->getChain(), N->getBasePtr(),
498 Mask, ExtSrc0, N->getMemoryVT(),
496 N->getMask(), ExtSrc0, N->getMemoryVT(),
499497 N->getMemOperand(), ISD::SEXTLOAD);
500 // Legalized the chain result - switch anything that used the old chain to
498 // Legalize the chain result - switch anything that used the old chain to
501499 // use the new one.
502500 ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
503501 return Res;
504502 }
503
504 SDValue DAGTypeLegalizer::PromoteIntRes_MGATHER(MaskedGatherSDNode *N) {
505 EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
506 SDValue ExtSrc0 = GetPromotedInteger(N->getValue());
507 assert(NVT == ExtSrc0.getValueType() &&
508 "Gather result type and the passThru agrument type should be the same");
509
510 SDLoc dl(N);
511 SDValue Ops[] = {N->getChain(), ExtSrc0, N->getMask(), N->getBasePtr(),
512 N->getIndex()};
513 SDValue Res = DAG.getMaskedGather(DAG.getVTList(NVT, MVT::Other),
514 N->getMemoryVT(), dl, Ops,
515 N->getMemOperand());
516 // Legalize the chain result - switch anything that used the old chain to
517 // use the new one.
518 ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
519 return Res;
520 }
521
505522 /// Promote the overflow flag of an overflowing arithmetic node.
506523 SDValue DAGTypeLegalizer::PromoteIntRes_Overflow(SDNode *N) {
507524 // Simply change the return type of the boolean result.
888905 OpNo); break;
889906 case ISD::MLOAD: Res = PromoteIntOp_MLOAD(cast<MaskedLoadSDNode>(N),
890907 OpNo); break;
908 case ISD::MGATHER: Res = PromoteIntOp_MGATHER(cast<MaskedGatherSDNode>(N),
909 OpNo); break;
910 case ISD::MSCATTER: Res = PromoteIntOp_MSCATTER(cast<MaskedScatterSDNode>(N),
911 OpNo); break;
891912 case ISD::TRUNCATE: Res = PromoteIntOp_TRUNCATE(N); break;
892913 case ISD::FP16_TO_FP:
893914 case ISD::UINT_TO_FP: Res = PromoteIntOp_UINT_TO_FP(N); break;
11561177 N->getMemoryVT(), N->getMemOperand());
11571178 }
11581179
1159 SDValue DAGTypeLegalizer::PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo){
1180 SDValue DAGTypeLegalizer::PromoteIntOp_MSTORE(MaskedStoreSDNode *N,
1181 unsigned OpNo) {
11601182
11611183 SDValue DataOp = N->getValue();
11621184 EVT DataVT = DataOp.getValueType();
11631185 SDValue Mask = N->getMask();
1164 EVT MaskVT = Mask.getValueType();
11651186 SDLoc dl(N);
11661187
11671188 bool TruncateStore = false;
1168 if (!TLI.isTypeLegal(DataVT)) {
1169 if (getTypeAction(DataVT) == TargetLowering::TypePromoteInteger) {
1170 DataOp = GetPromotedInteger(DataOp);
1171 if (!TLI.isTypeLegal(MaskVT))
1172 Mask = PromoteTargetBoolean(Mask, DataOp.getValueType());
1173 TruncateStore = true;
1174 }
1189 if (OpNo == 2) {
1190 // Mask comes before the data operand. If the data operand is legal, we just
1191 // promote the mask.
1192 // When the data operand has illegal type, we should legalize the data
1193 // operand first. The mask will be promoted/splitted/widened according to
1194 // the data operand type.
1195 if (TLI.isTypeLegal(DataVT))
1196 Mask = PromoteTargetBoolean(Mask, DataVT);
11751197 else {
1176 assert(getTypeAction(DataVT) == TargetLowering::TypeWidenVector &&
1177 "Unexpected data legalization in MSTORE");
1178 DataOp = GetWidenedVector(DataOp);
1179
1180 if (getTypeAction(MaskVT) == TargetLowering::TypeWidenVector)
1181 Mask = GetWidenedVector(Mask);
1198 if (getTypeAction(DataVT) == TargetLowering::TypePromoteInteger)
1199 return PromoteIntOp_MSTORE(N, 3);
1200
1201 else if (getTypeAction(DataVT) == TargetLowering::TypeWidenVector)
1202 return WidenVecOp_MSTORE(N, 3);
1203
11821204 else {
1183 EVT BoolVT = getSetCCResultType(DataOp.getValueType());
1184
1185 // We can't use ModifyToType() because we should fill the mask with
1186 // zeroes
1187 unsigned WidenNumElts = BoolVT.getVectorNumElements();
1188 unsigned MaskNumElts = MaskVT.getVectorNumElements();
1189
1190 unsigned NumConcat = WidenNumElts / MaskNumElts;
1191 SmallVector<SDValue, 16> Ops(NumConcat);
1192 SDValue ZeroVal = DAG.getConstant(0, dl, MaskVT);
1193 Ops[0] = Mask;
1194 for (unsigned i = 1; i != NumConcat; ++i)
1195 Ops[i] = ZeroVal;
1196
1197 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, BoolVT, Ops);
1205 assert (getTypeAction(DataVT) == TargetLowering::TypeSplitVector);
1206 return SplitVecOp_MSTORE(N, 3);
11981207 }
11991208 }
1200 }
1201 else
1202 Mask = PromoteTargetBoolean(N->getMask(), DataOp.getValueType());
1209 } else { // Data operand
1210 assert(OpNo == 3 && "Unexpected operand for promotion");
1211 DataOp = GetPromotedInteger(DataOp);
1212 Mask = PromoteTargetBoolean(Mask, DataOp.getValueType());
1213 TruncateStore = true;
1214 }
1215
12031216 return DAG.getMaskedStore(N->getChain(), dl, DataOp, N->getBasePtr(), Mask,
12041217 N->getMemoryVT(), N->getMemOperand(),
12051218 TruncateStore);
12061219 }
12071220
1208 SDValue DAGTypeLegalizer::PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo){
1221 SDValue DAGTypeLegalizer::PromoteIntOp_MLOAD(MaskedLoadSDNode *N,
1222 unsigned OpNo) {
12091223 assert(OpNo == 2 && "Only know how to promote the mask!");
12101224 EVT DataVT = N->getValueType(0);
12111225 SDValue Mask = PromoteTargetBoolean(N->getOperand(OpNo), DataVT);
12121226 SmallVector<SDValue, 4> NewOps(N->op_begin(), N->op_end());
12131227 NewOps[OpNo] = Mask;
1228 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
1229 }
1230
1231 SDValue DAGTypeLegalizer::PromoteIntOp_MGATHER(MaskedGatherSDNode *N,
1232 unsigned OpNo) {
1233
1234 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
1235 if (OpNo == 2) {
1236 // The Mask
1237 EVT DataVT = N->getValueType(0);
1238 NewOps[OpNo] = PromoteTargetBoolean(N->getOperand(OpNo), DataVT);
1239 } else
1240 NewOps[OpNo] = GetPromotedInteger(N->getOperand(OpNo));
1241 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
1242 }
1243
1244 SDValue DAGTypeLegalizer::PromoteIntOp_MSCATTER(MaskedScatterSDNode *N,
1245 unsigned OpNo) {
1246 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
1247 if (OpNo == 2) {
1248 // The Mask
1249 EVT DataVT = N->getValue().getValueType();
1250 NewOps[OpNo] = PromoteTargetBoolean(N->getOperand(OpNo), DataVT);
1251 } else
1252 NewOps[OpNo] = GetPromotedInteger(N->getOperand(OpNo));
12141253 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
12151254 }
12161255
20702109 }
20712110 }
20722111
2073 // Legalized the chain result - switch anything that used the old chain to
2112 // Legalize the chain result - switch anything that used the old chain to
20742113 // use the new one.
20752114 ReplaceValueWith(SDValue(N, 1), Ch);
20762115 }
11261126 return DAG.getNode(ExtendCode, dl, BoolVT, Bool);
11271127 }
11281128
1129 /// WidenTargetBoolean - Widen the given target boolean to a target boolean
1130 /// of the given type. The boolean vector is widened and then promoted to match
1131 /// the target boolean type of the given ValVT.
1132 SDValue DAGTypeLegalizer::WidenTargetBoolean(SDValue Bool, EVT ValVT,
1133 bool WithZeroes) {
1134 SDLoc dl(Bool);
1135 EVT BoolVT = Bool.getValueType();
1136
1137 assert(ValVT.getVectorNumElements() > BoolVT.getVectorNumElements() &&
1138 TLI.isTypeLegal(ValVT) &&
1139 "Unexpected types in WidenTargetBoolean");
1140 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), BoolVT.getScalarType(),
1141 ValVT.getVectorNumElements());
1142 Bool = ModifyToType(Bool, WideVT, WithZeroes);
1143 return PromoteTargetBoolean(Bool, ValVT);
1144 }
1145
11291146 /// SplitInteger - Return the lower LoVT bits of Op in Lo and the upper HiVT
11301147 /// bits in Hi.
11311148 void DAGTypeLegalizer::SplitInteger(SDValue Op,
186186 std::pair<SDValue, SDValue> ExpandAtomic(SDNode *Node);
187187
188188 SDValue PromoteTargetBoolean(SDValue Bool, EVT ValVT);
189
190 /// Modify Bit Vector to match SetCC result type of ValVT.
191 /// The bit vector is widened with zeroes when WithZeroes is true.
192 SDValue WidenTargetBoolean(SDValue Bool, EVT ValVT, bool WithZeroes = false);
193
189194 void ReplaceValueWith(SDValue From, SDValue To);
190195 void SplitInteger(SDValue Op, SDValue &Lo, SDValue &Hi);
191196 void SplitInteger(SDValue Op, EVT LoVT, EVT HiVT,
260265 SDValue PromoteIntRes_INT_EXTEND(SDNode *N);
261266 SDValue PromoteIntRes_LOAD(LoadSDNode *N);
262267 SDValue PromoteIntRes_MLOAD(MaskedLoadSDNode *N);
268 SDValue PromoteIntRes_MGATHER(MaskedGatherSDNode *N);
263269 SDValue PromoteIntRes_Overflow(SDNode *N);
264270 SDValue PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo);
265271 SDValue PromoteIntRes_SDIV(SDNode *N);
306312 SDValue PromoteIntOp_ZERO_EXTEND(SDNode *N);
307313 SDValue PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo);
308314 SDValue PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo);
315 SDValue PromoteIntOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo);
316 SDValue PromoteIntOp_MGATHER(MaskedGatherSDNode *N, unsigned OpNo);
309317
310318 void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code);
311319
709717 SDValue WidenVecRes_INSERT_VECTOR_ELT(SDNode* N);
710718 SDValue WidenVecRes_LOAD(SDNode* N);
711719 SDValue WidenVecRes_MLOAD(MaskedLoadSDNode* N);
720 SDValue WidenVecRes_MGATHER(MaskedGatherSDNode* N);
712721 SDValue WidenVecRes_SCALAR_TO_VECTOR(SDNode* N);
713722 SDValue WidenVecRes_SELECT(SDNode* N);
714723 SDValue WidenVecRes_SELECT_CC(SDNode* N);
736745 SDValue WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N);
737746 SDValue WidenVecOp_STORE(SDNode* N);
738747 SDValue WidenVecOp_MSTORE(SDNode* N, unsigned OpNo);
748 SDValue WidenVecOp_MSCATTER(SDNode* N, unsigned OpNo);
739749 SDValue WidenVecOp_SETCC(SDNode* N);
740750
741751 SDValue WidenVecOp_Convert(SDNode *N);
775785
776786 /// Modifies a vector input (widen or narrows) to a vector of NVT. The
777787 /// input vector must have the same element type as NVT.
778 SDValue ModifyToType(SDValue InOp, EVT WidenVT);
779
788 /// When FillWithZeroes is "on" the vector will be widened with
789 /// zeroes.
790 /// By default, the vector will be widened with undefined values.
791 SDValue ModifyToType(SDValue InOp, EVT NVT, bool FillWithZeroes = false);
780792
781793 //===--------------------------------------------------------------------===//
782794 // Generic Splitting: LegalizeTypesGeneric.cpp
234234 N->isInvariant(), N->getOriginalAlignment(),
235235 N->getAAInfo());
236236
237 // Legalized the chain result - switch anything that used the old chain to
237 // Legalize the chain result - switch anything that used the old chain to
238238 // use the new one.
239239 ReplaceValueWith(SDValue(N, 1), Result.getValue(1));
240240 return Result;
10191019 Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
10201020 Hi.getValue(1));
10211021
1022 // Legalized the chain result - switch anything that used the old chain to
1022 // Legalize the chain result - switch anything that used the old chain to
10231023 // use the new one.
10241024 ReplaceValueWith(SDValue(LD, 1), Ch);
10251025 }
10331033 SDValue Ch = MLD->getChain();
10341034 SDValue Ptr = MLD->getBasePtr();
10351035 SDValue Mask = MLD->getMask();
1036 SDValue Src0 = MLD->getSrc0();
10361037 unsigned Alignment = MLD->getOriginalAlignment();
10371038 ISD::LoadExtType ExtType = MLD->getExtensionType();
10381039
10421043 (Alignment == MLD->getValueType(0).getSizeInBits()/8) ?
10431044 Alignment/2 : Alignment;
10441045
1046 // Split Mask operand
10451047 SDValue MaskLo, MaskHi;
1046 std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
1048 if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
1049 GetSplitVector(Mask, MaskLo, MaskHi);
1050 else
1051 std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
10471052
10481053 EVT MemoryVT = MLD->getMemoryVT();
10491054 EVT LoMemVT, HiMemVT;
10501055 std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
10511056
1052 SDValue Src0 = MLD->getSrc0();
10531057 SDValue Src0Lo, Src0Hi;
1054 std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, dl);
1058 if (getTypeAction(Src0.getValueType()) == TargetLowering::TypeSplitVector)
1059 GetSplitVector(Src0, Src0Lo, Src0Hi);
1060 else
1061 std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, dl);
10551062
10561063 MachineMemOperand *MMO = DAG.getMachineFunction().
10571064 getMachineMemOperand(MLD->getPointerInfo(),
10791086 Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
10801087 Hi.getValue(1));
10811088
1082 // Legalized the chain result - switch anything that used the old chain to
1089 // Legalize the chain result - switch anything that used the old chain to
10831090 // use the new one.
10841091 ReplaceValueWith(SDValue(MLD, 1), Ch);
10851092
10941101 SDValue Ch = MGT->getChain();
10951102 SDValue Ptr = MGT->getBasePtr();
10961103 SDValue Mask = MGT->getMask();
1104 SDValue Src0 = MGT->getValue();
1105 SDValue Index = MGT->getIndex();
10971106 unsigned Alignment = MGT->getOriginalAlignment();
10981107
1108 // Split Mask operand
10991109 SDValue MaskLo, MaskHi;
1100 std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
1110 if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
1111 GetSplitVector(Mask, MaskLo, MaskHi);
1112 else
1113 std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
11011114
11021115 EVT MemoryVT = MGT->getMemoryVT();
11031116 EVT LoMemVT, HiMemVT;
1117 // Split MemoryVT
11041118 std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
11051119
11061120 SDValue Src0Lo, Src0Hi;
1107 std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(MGT->getValue(), dl);
1121 if (getTypeAction(Src0.getValueType()) == TargetLowering::TypeSplitVector)
1122 GetSplitVector(Src0, Src0Lo, Src0Hi);
1123 else
1124 std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, dl);
11081125
11091126 SDValue IndexHi, IndexLo;
1110 std::tie(IndexLo, IndexHi) = DAG.SplitVector(MGT->getIndex(), dl);
1127 if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector)
1128 GetSplitVector(Index, IndexLo, IndexHi);
1129 else
1130 std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, dl);
11111131
11121132 MachineMemOperand *MMO = DAG.getMachineFunction().
11131133 getMachineMemOperand(MGT->getPointerInfo(),
11271147 Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
11281148 Hi.getValue(1));
11291149
1130 // Legalized the chain result - switch anything that used the old chain to
1150 // Legalize the chain result - switch anything that used the old chain to
11311151 // use the new one.
11321152 ReplaceValueWith(SDValue(MGT, 1), Ch);
11331153 }
15981618 SDValue Ptr = MGT->getBasePtr();
15991619 SDValue Index = MGT->getIndex();
16001620 SDValue Mask = MGT->getMask();
1621 SDValue Src0 = MGT->getValue();
16011622 unsigned Alignment = MGT->getOriginalAlignment();
16021623
16031624 SDValue MaskLo, MaskHi;
1604 std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
1625 if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
1626 // Split Mask operand
1627 GetSplitVector(Mask, MaskLo, MaskHi);
1628 else
1629 std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
16051630
16061631 EVT MemoryVT = MGT->getMemoryVT();
16071632 EVT LoMemVT, HiMemVT;
16081633 std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
16091634
16101635 SDValue Src0Lo, Src0Hi;
1611 std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(MGT->getValue(), dl);
1636 if (getTypeAction(Src0.getValueType()) == TargetLowering::TypeSplitVector)
1637 GetSplitVector(Src0, Src0Lo, Src0Hi);
1638 else
1639 std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, dl);
16121640
16131641 SDValue IndexHi, IndexLo;
1614 if (Index.getNode())
1642 if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector)
1643 GetSplitVector(Index, IndexLo, IndexHi);
1644 else
16151645 std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, dl);
1616 else
1617 IndexLo = IndexHi = Index;
16181646
16191647 MachineMemOperand *MMO = DAG.getMachineFunction().
16201648 getMachineMemOperand(MGT->getPointerInfo(),
16401668 Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
16411669 Hi.getValue(1));
16421670
1643 // Legalized the chain result - switch anything that used the old chain to
1671 // Legalize the chain result - switch anything that used the old chain to
16441672 // use the new one.
16451673 ReplaceValueWith(SDValue(MGT, 1), Ch);
16461674
16641692 std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
16651693
16661694 SDValue DataLo, DataHi;
1667 GetSplitVector(Data, DataLo, DataHi);
1695 if (getTypeAction(Data.getValueType()) == TargetLowering::TypeSplitVector)
1696 // Split Data operand
1697 GetSplitVector(Data, DataLo, DataHi);
1698 else
1699 std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);
1700
16681701 SDValue MaskLo, MaskHi;
1669 GetSplitVector(Mask, MaskLo, MaskHi);
1702 if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
1703 // Split Mask operand
1704 GetSplitVector(Mask, MaskLo, MaskHi);
1705 else
1706 std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
1707
1708 MaskLo = PromoteTargetBoolean(MaskLo, DataLo.getValueType());
1709 MaskHi = PromoteTargetBoolean(MaskHi, DataHi.getValueType());
16701710
16711711 // if Alignment is equal to the vector size,
16721712 // take the half of it for the second part
17111751 unsigned Alignment = N->getOriginalAlignment();
17121752 SDLoc DL(N);
17131753
1754 // Split all operands
17141755 EVT LoMemVT, HiMemVT;
17151756 std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
17161757
17171758 SDValue DataLo, DataHi;
1718 GetSplitVector(Data, DataLo, DataHi);
1759 if (getTypeAction(Data.getValueType()) == TargetLowering::TypeSplitVector)
1760 // Split Data operand
1761 GetSplitVector(Data, DataLo, DataHi);
1762 else
1763 std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);
1764
17191765 SDValue MaskLo, MaskHi;
1720 GetSplitVector(Mask, MaskLo, MaskHi);
1721
1722 SDValue PtrLo, PtrHi;
1723 if (Ptr.getValueType().isVector()) // gather form vector of pointers
1724 std::tie(PtrLo, PtrHi) = DAG.SplitVector(Ptr, DL);
1766 if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
1767 // Split Mask operand
1768 GetSplitVector(Mask, MaskLo, MaskHi);
17251769 else
1726 PtrLo = PtrHi = Ptr;
1770 std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
17271771
17281772 SDValue IndexHi, IndexLo;
1729 if (Index.getNode())
1773 if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector)
1774 GetSplitVector(Index, IndexLo, IndexHi);
1775 else
17301776 std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, DL);
1731 else
1732 IndexLo = IndexHi = Index;
17331777
17341778 SDValue Lo, Hi;
17351779 MachineMemOperand *MMO = DAG.getMachineFunction().
17371781 MachineMemOperand::MOStore, LoMemVT.getStoreSize(),
17381782 Alignment, N->getAAInfo(), N->getRanges());
17391783
1740 SDValue OpsLo[] = {Ch, DataLo, MaskLo, PtrLo, IndexLo};
1784 SDValue OpsLo[] = {Ch, DataLo, MaskLo, Ptr, IndexLo};
17411785 Lo = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataLo.getValueType(),
17421786 DL, OpsLo, MMO);
17431787
17461790 MachineMemOperand::MOStore, HiMemVT.getStoreSize(),
17471791 Alignment, N->getAAInfo(), N->getRanges());
17481792
1749 SDValue OpsHi[] = {Ch, DataHi, MaskHi, PtrHi, IndexHi};
1793 SDValue OpsHi[] = {Ch, DataHi, MaskHi, Ptr, IndexHi};
17501794 Hi = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataHi.getValueType(),
17511795 DL, OpsHi, MMO);
17521796
19742018 case ISD::MLOAD:
19752019 Res = WidenVecRes_MLOAD(cast<MaskedLoadSDNode>(N));
19762020 break;
2021 case ISD::MGATHER:
2022 Res = WidenVecRes_MGATHER(cast<MaskedGatherSDNode>(N));
2023 break;
19772024
19782025 case ISD::ADD:
19792026 case ISD::AND:
27272774 SDValue Res = DAG.getMaskedLoad(WidenVT, dl, N->getChain(), N->getBasePtr(),
27282775 Mask, Src0, N->getMemoryVT(),
27292776 N->getMemOperand(), ExtType);
2730 // Legalized the chain result - switch anything that used the old chain to
2777 // Legalize the chain result - switch anything that used the old chain to
2778 // use the new one.
2779 ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
2780 return Res;
2781 }
2782
2783 SDValue DAGTypeLegalizer::WidenVecRes_MGATHER(MaskedGatherSDNode *N) {
2784
2785 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
2786 SDValue Mask = N->getMask();
2787 SDValue Src0 = GetWidenedVector(N->getValue());
2788 unsigned NumElts = WideVT.getVectorNumElements();
2789 SDLoc dl(N);
2790
2791 // The mask should be widened as well
2792 Mask = WidenTargetBoolean(Mask, WideVT, true);
2793
2794 // Widen the Index operand
2795 SDValue Index = N->getIndex();
2796 EVT WideIndexVT = EVT::getVectorVT(*DAG.getContext(),
2797 Index.getValueType().getScalarType(),
2798 NumElts);
2799 Index = ModifyToType(Index, WideIndexVT);
2800 SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
2801 SDValue Res = DAG.getMaskedGather(DAG.getVTList(WideVT, MVT::Other),
2802 N->getMemoryVT(), dl, Ops,
2803 N->getMemOperand());
2804
2805 // Legalize the chain result - switch anything that used the old chain to
27312806 // use the new one.
27322807 ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
27332808 return Res;
28892964 case ISD::EXTRACT_VECTOR_ELT: Res = WidenVecOp_EXTRACT_VECTOR_ELT(N); break;
28902965 case ISD::STORE: Res = WidenVecOp_STORE(N); break;
28912966 case ISD::MSTORE: Res = WidenVecOp_MSTORE(N, OpNo); break;
2967 case ISD::MSCATTER: Res = WidenVecOp_MSCATTER(N, OpNo); break;
28922968 case ISD::SETCC: Res = WidenVecOp_SETCC(N); break;
28932969 case ISD::FCOPYSIGN: Res = WidenVecOp_FCOPYSIGN(N); break;
28942970
31343210 return DAG.getMaskedStore(MST->getChain(), dl, WideVal, MST->getBasePtr(),
31353211 Mask, MST->getMemoryVT(), MST->getMemOperand(),
31363212 false);
3213 }
3214
3215 SDValue DAGTypeLegalizer::WidenVecOp_MSCATTER(SDNode *N, unsigned OpNo) {
3216 assert(OpNo == 1 && "Can widen only data operand of mscatter");
3217 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(N);
3218 SDValue DataOp = MSC->getValue();
3219 SDValue Mask = MSC->getMask();
3220
3221 // Widen the value
3222 SDValue WideVal = GetWidenedVector(DataOp);
3223 EVT WideVT = WideVal.getValueType();
3224 unsigned NumElts = WideVal.getValueType().getVectorNumElements();
3225 SDLoc dl(N);
3226
3227 // The mask should be widened as well
3228 Mask = WidenTargetBoolean(Mask, WideVT, true);
3229
3230 // Widen index
3231 SDValue Index = MSC->getIndex();
3232 EVT WideIndexVT = EVT::getVectorVT(*DAG.getContext(),
3233 Index.getValueType().getScalarType(),
3234 NumElts);
3235 Index = ModifyToType(Index, WideIndexVT);
3236
3237 SDValue Ops[] = {MSC->getChain(), WideVal, Mask, MSC->getBasePtr(), Index};
3238 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other),
3239 MSC->getMemoryVT(), dl, Ops,
3240 MSC->getMemOperand());
31373241 }
31383242
31393243 SDValue DAGTypeLegalizer::WidenVecOp_SETCC(SDNode *N) {
35993703
36003704 /// Modifies a vector input (widen or narrows) to a vector of NVT. The
36013705 /// input vector must have the same element type as NVT.
3602 SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT) {
3706 /// FillWithZeroes specifies that the vector should be widened with zeroes.
3707 SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT,
3708 bool FillWithZeroes) {
36033709 // Note that InOp might have been widened so it might already have
36043710 // the right width or it might need be narrowed.
36053711 EVT InVT = InOp.getValueType();
36163722 if (WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0) {
36173723 unsigned NumConcat = WidenNumElts / InNumElts;
36183724 SmallVector<SDValue, 16> Ops(NumConcat);
3619 SDValue UndefVal = DAG.getUNDEF(InVT);
3725 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, InVT) :
3726 DAG.getUNDEF(InVT);
36203727 Ops[0] = InOp;
36213728 for (unsigned i = 1; i != NumConcat; ++i)
3622 Ops[i] = UndefVal;
3729 Ops[i] = FillVal;
36233730
36243731 return DAG.getNode(ISD::CONCAT_VECTORS, dl, NVT, Ops);
36253732 }
36393746 ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
36403747 DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
36413748
3642 SDValue UndefVal = DAG.getUNDEF(EltVT);
3749 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
3750 DAG.getUNDEF(EltVT);
36433751 for ( ; Idx < WidenNumElts; ++Idx)
3644 Ops[Idx] = UndefVal;
3752 Ops[Idx] = FillVal;
36453753 return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Ops);
36463754 }
15781578 setOperationAction(ISD::OR, VT, Legal);
15791579 setOperationAction(ISD::XOR, VT, Legal);
15801580 }
1581 if (EltSize >= 32 && VT.getSizeInBits() <= 512) {
1581 if ((VT.is128BitVector() || VT.is256BitVector()) && EltSize >= 32) {
15821582 setOperationAction(ISD::MGATHER, VT, Custom);
15831583 setOperationAction(ISD::MSCATTER, VT, Custom);
15841584 }
16041604 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
16051605 setOperationAction(ISD::MLOAD, VT, Legal);
16061606 setOperationAction(ISD::MSTORE, VT, Legal);
1607 setOperationAction(ISD::MGATHER, VT, Legal);
1608 setOperationAction(ISD::MSCATTER, VT, Custom);
16071609 }
16081610 }
16091611 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
18121814 setTargetDAGCombine(ISD::BUILD_VECTOR);
18131815 setTargetDAGCombine(ISD::MUL);
18141816 setTargetDAGCombine(ISD::XOR);
1817 setTargetDAGCombine(ISD::MSCATTER);
1818 setTargetDAGCombine(ISD::MGATHER);
18151819
18161820 computeRegisterProperties(Subtarget->getRegisterInfo());
18171821
1975919763 EVT EltVT = NVT.getVectorElementType();
1976019764
1976119765 SDLoc dl(InOp);
19766 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
19767 InOp.getNumOperands() == 2) {
19768 SDValue N1 = InOp.getOperand(1);
19769 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
19770 N1.isUndef()) {
19771 InOp = InOp.getOperand(0);
19772 InVT = InOp.getSimpleValueType();
19773 InNumElts = InVT.getVectorNumElements();
19774 }
19775 }
1976219776 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
1976319777 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
1976419778 SmallVector<SDValue, 16> Ops;
1978219796 assert(Subtarget->hasAVX512() &&
1978319797 "MGATHER/MSCATTER are supported on AVX-512 arch only");
1978419798
19799 // X86 scatter kills mask register, so its type should be added to
19800 // the list of return values.
19801 // If the "scatter" has 2 return values, it is already handled.
19802 if (Op.getNode()->getNumValues() == 2)
19803 return Op;
19804
1978519805 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
19786 MVT VT = N->getValue().getSimpleValueType();
19806 SDValue Src = N->getValue();
19807 MVT VT = Src.getSimpleValueType();
1978719808 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
1978819809 SDLoc dl(Op);
1978919810
19790 // X86 scatter kills mask register, so its type should be added to
19791 // the list of return values
19792 if (N->getNumValues() == 1) {
19793 SDValue Index = N->getIndex();
19794 if (!Subtarget->hasVLX() && !VT.is512BitVector() &&
19795 !Index.getSimpleValueType().is512BitVector())
19811 SDValue NewScatter;
19812 SDValue Index = N->getIndex();
19813 SDValue Mask = N->getMask();
19814 SDValue Chain = N->getChain();
19815 SDValue BasePtr = N->getBasePtr();
19816 MVT MemVT = N->getMemoryVT().getSimpleVT();
19817 MVT IndexVT = Index.getSimpleValueType();
19818 MVT MaskVT = Mask.getSimpleValueType();
19819
19820 if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
19821 // The v2i32 value was promoted to v2i64.
19822 // Now we "redo" the type legalizer's work and widen the original
19823 // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
19824 // with a shuffle.
19825 assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
19826 "Unexpected memory type");
19827 int ShuffleMask[] = {0, 2, -1, -1};
19828 Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
19829 DAG.getUNDEF(MVT::v4i32), ShuffleMask);
19830 // Now we have 4 elements instead of 2.
19831 // Expand the index.
19832 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
19833 Index = ExtendToType(Index, NewIndexVT, DAG);
19834
19835 // Expand the mask with zeroes
19836 // Mask may be <2 x i64> or <2 x i1> at this moment
19837 assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
19838 "Unexpected mask type");
19839 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
19840 Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
19841 VT = MVT::v4i32;
19842 }
19843
19844 unsigned NumElts = VT.getVectorNumElements();
19845 if (!Subtarget->hasVLX() && !VT.is512BitVector() &&
19846 !Index.getSimpleValueType().is512BitVector()) {
19847 // AVX512F supports only 512-bit vectors. Or data or index should
19848 // be 512 bit wide. If now the both index and data are 256-bit, but
19849 // the vector contains 8 elements, we just sign-extend the index
19850 if (IndexVT == MVT::v8i32)
19851 // Just extend index
1979619852 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
19797
19798 SDVTList VTs = DAG.getVTList(N->getMask().getValueType(), MVT::Other);
19799 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
19800 N->getOperand(3), Index };
19801
19802 SDValue NewScatter = DAG.getMaskedScatter(VTs, VT, dl, Ops, N->getMemOperand());
19803 DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
19804 return SDValue(NewScatter.getNode(), 0);
19805 }
19806 return Op;
19853 else {
19854 // The minimal number of elts in scatter is 8
19855 NumElts = 8;
19856 // Index
19857 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
19858 // Use original index here, do not modify the index twice
19859 Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
19860 if (IndexVT.getScalarType() == MVT::i32)
19861 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
19862
19863 // Mask
19864 // At this point we have promoted mask operand
19865 assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
19866 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
19867 // Use the original mask here, do not modify the mask twice
19868 Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
19869
19870 // The value that should be stored
19871 MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
19872 Src = ExtendToType(Src, NewVT, DAG);
19873 }
19874 }
19875 // If the mask is "wide" at this point - truncate it to i1 vector
19876 MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
19877 Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);
19878
19879 // The mask is killed by scatter, add it to the values
19880 SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
19881 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
19882 NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops,
19883 N->getMemOperand());
19884 DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
19885 return SDValue(NewScatter.getNode(), 0);
1980719886 }
1980819887
1980919888 static SDValue LowerMLOAD(SDValue Op, const X86Subtarget *Subtarget,
1986819947 "MGATHER/MSCATTER are supported on AVX-512 arch only");
1986919948
1986919949 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
19950 SDLoc dl(Op);
1987119951 MVT VT = Op.getSimpleValueType();
19952 SDValue Index = N->getIndex();
19953 SDValue Mask = N->getMask();
19954 SDValue Src0 = N->getValue();
19955 MVT IndexVT = Index.getSimpleValueType();
19956 MVT MaskVT = Mask.getSimpleValueType();
19957
19958 unsigned NumElts = VT.getVectorNumElements();
1987219959 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
19873 SDLoc dl(Op);
19874
19875 SDValue Index = N->getIndex();
19960
1987619961 if (!Subtarget->hasVLX() && !VT.is512BitVector() &&
1987719962 !Index.getSimpleValueType().is512BitVector()) {
19878 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
19879 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
19880 N->getOperand(3), Index };
19881 DAG.UpdateNodeOperands(N, Ops);
19963 // AVX512F supports only 512-bit vectors. Or data or index should
19964 // be 512 bit wide. If now the both index and data are 256-bit, but
19965 // the vector contains 8 elements, we just sign-extend the index
19966 if (NumElts == 8) {
19967 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
19968 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
19969 N->getOperand(3), Index };
19970 DAG.UpdateNodeOperands(N, Ops);
19971 return Op;
19972 }
19973
19974 // Minimal number of elements in Gather
19975 NumElts = 8;
19976 // Index
19977 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
19978 Index = ExtendToType(Index, NewIndexVT, DAG);
19979 if (IndexVT.getScalarType() == MVT::i32)
19980 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
19981
19982 // Mask
19983 MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
19984 // At this point we have promoted mask operand
19985 assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
19986 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
19987 Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
19988 Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
19989
19990 // The pass-thru value
19991 MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
19992 Src0 = ExtendToType(Src0, NewVT, DAG);
19993
19994 SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
19995 SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other),
19996 N->getMemoryVT(), dl, Ops,
19997 N->getMemOperand());
19998 SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
19999 NewGather.getValue(0),
20000 DAG.getIntPtrConstant(0, dl));
20001 SDValue RetOps[] = {Exract, NewGather.getValue(1)};
20002 return DAG.getMergeValues(RetOps, dl);
1988220003 }
1988320004 return Op;
1988420005 }
2690627027 return SDValue();
2690727028 }
2690827029
27030 static SDValue PerformGatherScatterCombine(SDNode *N, SelectionDAG &DAG) {
27031 SDLoc DL(N);
27032 // Gather and Scatter instructions use k-registers for masks. The type of
27033 // the masks is v*i1. So the mask will be truncated anyway.
27034 // The SIGN_EXTEND_INREG may be dropped.
27035 SDValue Mask = N->getOperand(2);
27036 if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
27037 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
27038 NewOps[2] = Mask.getOperand(0);
27039 DAG.UpdateNodeOperands(N, NewOps);
27040 }
27041 return SDValue();
27042 }
27043
2690927044 // Helper function of PerformSETCCCombine. It is to materialize "setb reg"
2691027045 // as "sbb reg,reg", since it can be extended without zext and produces
2691127046 // an all-ones bit which is more useful than 0/1 in some cases.
2734727482 case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
2734827483 case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget);
2734927484 case X86ISD::BLENDI: return PerformBLENDICombine(N, DAG);
27485 case ISD::MGATHER:
27486 case ISD::MSCATTER: return PerformGatherScatterCombine(N, DAG);
2735027487 }
2735127488
2735227489 return SDValue();
21752175 (EXTRACT_SUBREG
21762176 (AND32ri (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)),
21772177 sub_16bit)>;
2178 def : Pat<(v16i1 (scalar_to_vector VK1:$src)),
2179 (COPY_TO_REGCLASS VK1:$src, VK16)>;
2180 def : Pat<(v8i1 (scalar_to_vector VK1:$src)),
2181 (COPY_TO_REGCLASS VK1:$src, VK8)>;
2182 }
2183 let Predicates = [HasBWI] in {
2184 def : Pat<(v32i1 (scalar_to_vector VK1:$src)),
2185 (COPY_TO_REGCLASS VK1:$src, VK32)>;
2186 def : Pat<(v64i1 (scalar_to_vector VK1:$src)),
2187 (COPY_TO_REGCLASS VK1:$src, VK64)>;
2188 }
2178 }
2179 def : Pat<(v16i1 (scalar_to_vector VK1:$src)),
2180 (COPY_TO_REGCLASS VK1:$src, VK16)>;
2181 def : Pat<(v8i1 (scalar_to_vector VK1:$src)),
2182 (COPY_TO_REGCLASS VK1:$src, VK8)>;
2183 def : Pat<(v4i1 (scalar_to_vector VK1:$src)),
2184 (COPY_TO_REGCLASS VK1:$src, VK4)>;
2185 def : Pat<(v2i1 (scalar_to_vector VK1:$src)),
2186 (COPY_TO_REGCLASS VK1:$src, VK2)>;
2187 def : Pat<(v32i1 (scalar_to_vector VK1:$src)),
2188 (COPY_TO_REGCLASS VK1:$src, VK32)>;
2189 def : Pat<(v64i1 (scalar_to_vector VK1:$src)),
2190 (COPY_TO_REGCLASS VK1:$src, VK64)>;
21892191
21902192
21912193 // With AVX-512 only, 8-bit mask is promoted to 16-bit mask.
24882490 def : Pat<(v16i1 (extract_subvector (v32i1 VK32:$src), (iPTR 0))),
24892491 (v16i1 (COPY_TO_REGCLASS VK32:$src, VK16))>;
24902492
2493 def : Pat<(v16i1 (extract_subvector (v32i1 VK32:$src), (iPTR 16))),
2494 (v16i1 (COPY_TO_REGCLASS (KSHIFTRDri VK32:$src, (i8 16)), VK16))>;
2495
24912496 def : Pat<(v32i1 (extract_subvector (v64i1 VK64:$src), (iPTR 0))),
24922497 (v32i1 (COPY_TO_REGCLASS VK64:$src, VK32))>;
24932498
24962501
24972502 def : Pat<(v4i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))),
24982503 (v4i1 (COPY_TO_REGCLASS VK8:$src, VK4))>;
2504
24992505 def : Pat<(v2i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))),
25002506 (v2i1 (COPY_TO_REGCLASS VK8:$src, VK2))>;
25012507
None ; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=knl < %s | FileCheck %s -check-prefix=KNL
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=KNL_64
2 ; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=KNL_32
3 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=SKX
4 ; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=SKX_32
15 ; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR
26
37
48 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
59 target triple = "x86_64-unknown-linux-gnu"
610
7 ; KNL-LABEL: test1
8 ; KNL: kxnorw %k1, %k1, %k1
9 ; KNL: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
1011
1112 ; SCALAR-LABEL: test1
12 ; SCALAR: extractelement <16 x float*>
13 ; SCALAR: extractelement <16 x float*>
1314 ; SCALAR-NEXT: load float
1415 ; SCALAR-NEXT: insertelement <16 x float>
1516 ; SCALAR-NEXT: extractelement <16 x float*>
1617 ; SCALAR-NEXT: load float
1718
1819 define <16 x float> @test1(float* %base, <16 x i32> %ind) {
20 ; KNL_64-LABEL: test1:
21 ; KNL_64: # BB#0:
22 ; KNL_64-NEXT: kxnorw %k1, %k1, %k1
23 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
24 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
25 ; KNL_64-NEXT: retq
26 ;
27 ; KNL_32-LABEL: test1:
28 ; KNL_32: # BB#0:
29 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
30 ; KNL_32-NEXT: kxnorw %k1, %k1, %k1
31 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
32 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
33 ; KNL_32-NEXT: retl
34 ;
35 ; SKX-LABEL: test1:
36 ; SKX: # BB#0:
37 ; SKX-NEXT: kxnorw %k1, %k1, %k1
38 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
39 ; SKX-NEXT: vmovaps %zmm1, %zmm0
40 ; SKX-NEXT: retq
1941
2042 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
2143 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
2244
2345 %sext_ind = sext <16 x i32> %ind to <16 x i64>
2446 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
25
47
2648 %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
2749 ret <16 x float>%res
2850 }
3052 declare <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*>, i32, <16 x i1>, <16 x i32>)
3153 declare <16 x float> @llvm.masked.gather.v16f32(<16 x float*>, i32, <16 x i1>, <16 x float>)
3254 declare <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> , i32, <8 x i1> , <8 x i32> )
33
34 ; KNL-LABEL: test2
35 ; KNL: kmovw %esi, %k1
36 ; KNL: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
55
3756
3857 ; SCALAR-LABEL: test2
39 ; SCALAR: extractelement <16 x float*>
58 ; SCALAR: extractelement <16 x float*>
4059 ; SCALAR-NEXT: load float
4160 ; SCALAR-NEXT: insertelement <16 x float>
4261 ; SCALAR-NEXT: br label %else
4362 ; SCALAR: else:
44 ; SCALAR-NEXT: %res.phi.else = phi
63 ; SCALAR-NEXT: %res.phi.else = phi
4564 ; SCALAR-NEXT: %Mask1 = extractelement <16 x i1> %imask, i32 1
4665 ; SCALAR-NEXT: %ToLoad1 = icmp eq i1 %Mask1, true
4766 ; SCALAR-NEXT: br i1 %ToLoad1, label %cond.load1, label %else2
4867
4968 define <16 x float> @test2(float* %base, <16 x i32> %ind, i16 %mask) {
69 ; KNL_64-LABEL: test2:
70 ; KNL_64: # BB#0:
71 ; KNL_64-NEXT: kmovw %esi, %k1
72 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
73 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
74 ; KNL_64-NEXT: retq
75 ;
76 ; KNL_32-LABEL: test2:
77 ; KNL_32: # BB#0:
78 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
79 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
80 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
81 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
82 ; KNL_32-NEXT: retl
83 ;
84 ; SKX-LABEL: test2:
85 ; SKX: # BB#0:
86 ; SKX-NEXT: kmovw %esi, %k1
87 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
88 ; SKX-NEXT: vmovaps %zmm1, %zmm0
89 ; SKX-NEXT: retq
5090
5191 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
5292 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
5898 ret <16 x float> %res
5999 }
60100
61 ; KNL-LABEL: test3
62 ; KNL: kmovw %esi, %k1
63 ; KNL: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
64101 define <16 x i32> @test3(i32* %base, <16 x i32> %ind, i16 %mask) {
102 ; KNL_64-LABEL: test3:
103 ; KNL_64: # BB#0:
104 ; KNL_64-NEXT: kmovw %esi, %k1
105 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
106 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
107 ; KNL_64-NEXT: retq
108 ;
109 ; KNL_32-LABEL: test3:
110 ; KNL_32: # BB#0:
111 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
112 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
113 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
114 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
115 ; KNL_32-NEXT: retl
116 ;
117 ; SKX-LABEL: test3:
118 ; SKX: # BB#0:
119 ; SKX-NEXT: kmovw %esi, %k1
120 ; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
121 ; SKX-NEXT: vmovaps %zmm1, %zmm0
122 ; SKX-NEXT: retq
65123
66124 %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
67125 %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
73131 ret <16 x i32> %res
74132 }
75133
76 ; KNL-LABEL: test4
77 ; KNL: kmovw %esi, %k1
78 ; KNL: kmovw
79 ; KNL: vpgatherdd
80 ; KNL: vpgatherdd
81134
82135 define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) {
136 ; KNL_64-LABEL: test4:
137 ; KNL_64: # BB#0:
138 ; KNL_64-NEXT: kmovw %esi, %k1
139 ; KNL_64-NEXT: kmovw %k1, %k2
140 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
141 ; KNL_64-NEXT: vmovaps %zmm1, %zmm2
142 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
143 ; KNL_64-NEXT: vpaddd %zmm2, %zmm1, %zmm0
144 ; KNL_64-NEXT: retq
145 ;
146 ; KNL_32-LABEL: test4:
147 ; KNL_32: # BB#0:
148 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
149 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
150 ; KNL_32-NEXT: kmovw %k1, %k2
151 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
152 ; KNL_32-NEXT: vmovaps %zmm1, %zmm2
153 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
154 ; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
155 ; KNL_32-NEXT: retl
156 ;
157 ; SKX-LABEL: test4:
158 ; SKX: # BB#0:
159 ; SKX-NEXT: kmovw %esi, %k1
160 ; SKX-NEXT: kmovw %k1, %k2
161 ; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
162 ; SKX-NEXT: vmovaps %zmm1, %zmm2
163 ; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
164 ; SKX-NEXT: vpaddd %zmm2, %zmm1, %zmm0
165 ; SKX-NEXT: retq
83166
84167 %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
85168 %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
92175 ret <16 x i32> %res
93176 }
94177
95 ; KNL-LABEL: test5
96 ; KNL: kmovw %k1, %k2
97 ; KNL: vpscatterdd {{.*}}%k2
98 ; KNL: vpscatterdd {{.*}}%k1
99178
100179 ; SCALAR-LABEL: test5
101180 ; SCALAR: %Mask0 = extractelement <16 x i1> %imask, i32 0
112191 ; SCALAR-NEXT: br i1 %ToStore1, label %cond.store1, label %else2
113192
114193 define void @test5(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
194 ; KNL_64-LABEL: test5:
195 ; KNL_64: # BB#0:
196 ; KNL_64-NEXT: kmovw %esi, %k1
197 ; KNL_64-NEXT: kmovw %k1, %k2
198 ; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
199 ; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
200 ; KNL_64-NEXT: retq
201 ;
202 ; KNL_32-LABEL: test5:
203 ; KNL_32: # BB#0:
204 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
205 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
206 ; KNL_32-NEXT: kmovw %k1, %k2
207 ; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2}
208 ; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
209 ; KNL_32-NEXT: retl
210 ;
211 ; SKX-LABEL: test5:
212 ; SKX: # BB#0:
213 ; SKX-NEXT: kmovw %esi, %k1
214 ; SKX-NEXT: kmovw %k1, %k2
215 ; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
216 ; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
217 ; SKX-NEXT: retq
115218
116219 %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
117220 %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
126229 declare void @llvm.masked.scatter.v8i32(<8 x i32> , <8 x i32*> , i32 , <8 x i1> )
127230 declare void @llvm.masked.scatter.v16i32(<16 x i32> , <16 x i32*> , i32 , <16 x i1> )
128231
129 ; KNL-LABEL: test6
130 ; KNL: kxnorw %k1, %k1, %k1
131 ; KNL: kxnorw %k2, %k2, %k2
132 ; KNL: vpgatherqd (,%zmm{{.*}}), %ymm{{.*}} {%k2}
133 ; KNL: vpscatterqd %ymm{{.*}}, (,%zmm{{.*}}) {%k1}
134232
135233 ; SCALAR-LABEL: test6
136234 ; SCALAR: store i32 %Elt0, i32* %Ptr01, align 4
142240 ; SCALAR-NEXT: store i32 %Elt2, i32* %Ptr23, align 4
143241
144242 define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) {
243 ; KNL_64-LABEL: test6:
244 ; KNL_64: # BB#0:
245 ; KNL_64-NEXT: kxnorw %k1, %k1, %k1
246 ; KNL_64-NEXT: kxnorw %k2, %k2, %k2
247 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
248 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
249 ; KNL_64-NEXT: vmovaps %zmm2, %zmm0
250 ; KNL_64-NEXT: retq
251 ;
252 ; KNL_32-LABEL: test6:
253 ; KNL_32: # BB#0:
254 ; KNL_32-NEXT: kxnorw %k1, %k1, %k1
255 ; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm2
256 ; KNL_32-NEXT: kxnorw %k2, %k2, %k2
257 ; KNL_32-NEXT: vpgatherqd (,%zmm2), %ymm1 {%k2}
258 ; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm2) {%k1}
259 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
260 ; KNL_32-NEXT: retl
261 ;
262 ; SKX-LABEL: test6:
263 ; SKX: # BB#0:
264 ; SKX-NEXT: kxnorw %k1, %k1, %k1
265 ; SKX-NEXT: kxnorw %k2, %k2, %k2
266 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
267 ; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
268 ; SKX-NEXT: vmovaps %zmm2, %zmm0
269 ; SKX-NEXT: retq
145270
146271 %a = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
147272
149274 ret <8 x i32>%a
150275 }
151276
152 ; In this case the index should be promoted to <8 x i64> for KNL
153 ; KNL-LABEL: test7
154 ; KNL: vpmovsxdq %ymm0, %zmm0
155 ; KNL: kmovw %k1, %k2
156 ; KNL: vpgatherqd {{.*}} {%k2}
157 ; KNL: vpgatherqd {{.*}} {%k1}
158277 define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {
278 ;
279 ; KNL_64-LABEL: test7:
280 ; KNL_64: # BB#0:
281 ; KNL_64-NEXT: movzbl %sil, %eax
282 ; KNL_64-NEXT: kmovw %eax, %k1
283 ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
284 ; KNL_64-NEXT: kmovw %k1, %k2
285 ; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm1 {%k2}
286 ; KNL_64-NEXT: vmovaps %zmm1, %zmm2
287 ; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm2 {%k1}
288 ; KNL_64-NEXT: vpaddd %ymm2, %ymm1, %ymm0
289 ; KNL_64-NEXT: retq
290 ;
291 ; KNL_32-LABEL: test7:
292 ; KNL_32: # BB#0:
293 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
294 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
295 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
296 ; KNL_32-NEXT: kmovw %k1, %k2
297 ; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm1 {%k2}
298 ; KNL_32-NEXT: vmovaps %zmm1, %zmm2
299 ; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm2 {%k1}
300 ; KNL_32-NEXT: vpaddd %ymm2, %ymm1, %ymm0
301 ; KNL_32-NEXT: retl
302 ;
303 ; SKX-LABEL: test7:
304 ; SKX: # BB#0:
305 ; SKX-NEXT: kmovb %esi, %k1
306 ; SKX-NEXT: kmovw %k1, %k2
307 ; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm1 {%k2}
308 ; SKX-NEXT: vmovaps %zmm1, %zmm2
309 ; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm2 {%k1}
310 ; SKX-NEXT: vpaddd %ymm2, %ymm1, %ymm0
311 ; SKX-NEXT: retq
159312
160313 %broadcast.splatinsert = insertelement <8 x i32*> undef, i32* %base, i32 0
161314 %broadcast.splat = shufflevector <8 x i32*> %broadcast.splatinsert, <8 x i32*> undef, <8 x i32> zeroinitializer
170323
171324 ; No uniform base in this case, index <8 x i64> contains addresses,
172325 ; each gather call will be split into two
173 ; KNL-LABEL: test8
174 ; KNL: kshiftrw $8, %k1, %k2
175 ; KNL: vpgatherqd
176 ; KNL: vpgatherqd
177 ; KNL: vinserti64x4
178 ; KNL: vpgatherqd
179 ; KNL: vpgatherqd
180 ; KNL: vinserti64x4
181326 define <16 x i32> @test8(<16 x i32*> %ptr.random, <16 x i32> %ind, i16 %mask) {
327 ; KNL_64-LABEL: test8:
328 ; KNL_64: # BB#0:
329 ; KNL_64-NEXT: kmovw %edi, %k1
330 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
331 ; KNL_64-NEXT: kmovw %k2, %k3
332 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3}
333 ; KNL_64-NEXT: kmovw %k1, %k3
334 ; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3}
335 ; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm4
336 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
337 ; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
338 ; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
339 ; KNL_64-NEXT: vpaddd %zmm0, %zmm4, %zmm0
340 ; KNL_64-NEXT: retq
341 ;
342 ; KNL_32-LABEL: test8:
343 ; KNL_32: # BB#0:
344 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
345 ; KNL_32-NEXT: kmovw %k1, %k2
346 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2}
347 ; KNL_32-NEXT: vmovaps %zmm1, %zmm2
348 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
349 ; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
350 ; KNL_32-NEXT: retl
351 ;
352 ; SKX-LABEL: test8:
353 ; SKX: # BB#0:
354 ; SKX-NEXT: kmovw %edi, %k1
355 ; SKX-NEXT: kshiftrw $8, %k1, %k2
356 ; SKX-NEXT: kmovw %k2, %k3
357 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3}
358 ; SKX-NEXT: kmovw %k1, %k3
359 ; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3}
360 ; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm4
361 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
362 ; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
363 ; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm0
364 ; SKX-NEXT: vpaddd %zmm0, %zmm4, %zmm0
365 ; SKX-NEXT: retq
366 ;
367 ; SKX_32-LABEL: test8:
368 ; SKX_32: # BB#0:
369 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
370 ; SKX_32-NEXT: kmovw %k1, %k2
371 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2}
372 ; SKX_32-NEXT: vmovaps %zmm1, %zmm2
373 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
374 ; SKX_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
375 ; SKX_32-NEXT: retl
376
182377 %imask = bitcast i16 %mask to <16 x i1>
183378 %gt1 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
184379 %gt2 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
192387 ; Masked gather for agregate types
193388 ; Test9 and Test10 should give the same result (scalar and vector indices in GEP)
194389
195 ; KNL-LABEL: test9
196 ; KNL: vpbroadcastq %rdi, %zmm
197 ; KNL: vpmovsxdq
198 ; KNL: vpbroadcastq
199 ; KNL: vpmuludq
200 ; KNL: vpaddq
201 ; KNL: vpaddq
202 ; KNL: vpaddq
203 ; KNL: vpaddq
204 ; KNL: vpgatherqd (,%zmm
205390
206391 define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
392 ; KNL_64-LABEL: test9:
393 ; KNL_64: # BB#0: # %entry
394 ; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2
395 ; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
396 ; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3
397 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm4
398 ; KNL_64-NEXT: vpsrlq $32, %zmm1, %zmm1
399 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm1
400 ; KNL_64-NEXT: vpsllq $32, %zmm1, %zmm1
401 ; KNL_64-NEXT: vpaddq %zmm1, %zmm4, %zmm1
402 ; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3
403 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4
404 ; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
405 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
406 ; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
407 ; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
408 ; KNL_64-NEXT: vpaddq %zmm0, %zmm2, %zmm0
409 ; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
410 ; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
411 ; KNL_64-NEXT: kxnorw %k1, %k1, %k1
412 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
413 ; KNL_64-NEXT: retq
414 ;
415 ; KNL_32-LABEL: test9:
416 ; KNL_32: # BB#0: # %entry
417 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2
418 ; KNL_32-NEXT: vpbroadcastd .LCPI8_0, %ymm3
419 ; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1
420 ; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
421 ; KNL_32-NEXT: vpbroadcastd .LCPI8_1, %ymm3
422 ; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
423 ; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0
424 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
425 ; KNL_32-NEXT: vpbroadcastd .LCPI8_2, %ymm1
426 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
427 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
428 ; KNL_32-NEXT: kxnorw %k1, %k1, %k1
429 ; KNL_32-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
430 ; KNL_32-NEXT: retl
431 ;
432 ; SKX-LABEL: test9:
433 ; SKX: # BB#0: # %entry
434 ; SKX-NEXT: vpbroadcastq %rdi, %zmm2
435 ; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
436 ; SKX-NEXT: vpaddq %zmm0, %zmm2, %zmm0
437 ; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
438 ; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
439 ; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
440 ; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
441 ; SKX-NEXT: kxnorw %k1, %k1, %k1
442 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
443 ; SKX-NEXT: retq
207444 entry:
208445 %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
209446 %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer
213450 ret <8 x i32> %res
214451 }
215452
216 ; KNL-LABEL: test10
217 ; KNL: vpbroadcastq %rdi, %zmm
218 ; KNL: vpmovsxdq
219 ; KNL: vpbroadcastq
220 ; KNL: vpmuludq
221 ; KNL: vpaddq
222 ; KNL: vpaddq
223 ; KNL: vpaddq
224 ; KNL: vpaddq
225 ; KNL: vpgatherqd (,%zmm
226453 define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
454 ; KNL_64-LABEL: test10:
455 ; KNL_64: # BB#0: # %entry
456 ; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2
457 ; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
458 ; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3
459 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm4
460 ; KNL_64-NEXT: vpsrlq $32, %zmm1, %zmm1
461 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm1
462 ; KNL_64-NEXT: vpsllq $32, %zmm1, %zmm1
463 ; KNL_64-NEXT: vpaddq %zmm1, %zmm4, %zmm1
464 ; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3
465 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4
466 ; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
467 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
468 ; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
469 ; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
470 ; KNL_64-NEXT: vpaddq %zmm0, %zmm2, %zmm0
471 ; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
472 ; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
473 ; KNL_64-NEXT: kxnorw %k1, %k1, %k1
474 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
475 ; KNL_64-NEXT: retq
476 ;
477 ; KNL_32-LABEL: test10:
478 ; KNL_32: # BB#0: # %entry
479 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2
480 ; KNL_32-NEXT: vpbroadcastd .LCPI9_0, %ymm3
481 ; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1
482 ; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
483 ; KNL_32-NEXT: vpbroadcastd .LCPI9_1, %ymm3
484 ; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
485 ; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0
486 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
487 ; KNL_32-NEXT: vpbroadcastd .LCPI9_2, %ymm1
488 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
489 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
490 ; KNL_32-NEXT: kxnorw %k1, %k1, %k1
491 ; KNL_32-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
492 ; KNL_32-NEXT: retl
493 ;
494 ; SKX-LABEL: test10:
495 ; SKX: # BB#0: # %entry
496 ; SKX-NEXT: vpbroadcastq %rdi, %zmm2
497 ; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
498 ; SKX-NEXT: vpaddq %zmm0, %zmm2, %zmm0
499 ; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
500 ; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
501 ; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
502 ; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
503 ; SKX-NEXT: kxnorw %k1, %k1, %k1
504 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
505 ; SKX-NEXT: retq
227506 entry:
228507 %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
229508 %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer
234513 }
235514
236515 ; Splat index in GEP, requires broadcast
237 ; KNL-LABEL: test11
238 ; KNL: vpbroadcastd %esi, %zmm
239 ; KNL: vgatherdps (%rdi,%zmm
240516 define <16 x float> @test11(float* %base, i32 %ind) {
517 ; KNL_64-LABEL: test11:
518 ; KNL_64: # BB#0:
519 ; KNL_64-NEXT: vpbroadcastd %esi, %zmm1
520 ; KNL_64-NEXT: kxnorw %k1, %k1, %k1
521 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
522 ; KNL_64-NEXT: retq
523 ;
524 ; KNL_32-LABEL: test11:
525 ; KNL_32: # BB#0:
526 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
527 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %zmm1
528 ; KNL_32-NEXT: kxnorw %k1, %k1, %k1
529 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
530 ; KNL_32-NEXT: retl
531 ;
532 ; SKX-LABEL: test11:
533 ; SKX: # BB#0:
534 ; SKX-NEXT: vpbroadcastd %esi, %zmm1
535 ; SKX-NEXT: kxnorw %k1, %k1, %k1
536 ; SKX-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
537 ; SKX-NEXT: retq
241538
242539 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
243540 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
249546 }
250547
251548 ; We are checking the uniform base here. It is taken directly from the input to vgatherdps
252 ; KNL-LABEL: test12
253 ; KNL: kxnorw %k1, %k1, %k1
254 ; KNL: vgatherdps (%rdi,%zmm
255549 define <16 x float> @test12(float* %base, <16 x i32> %ind) {
550 ; KNL_64-LABEL: test12:
551 ; KNL_64: # BB#0:
552 ; KNL_64-NEXT: kxnorw %k1, %k1, %k1
553 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
554 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
555 ; KNL_64-NEXT: retq
556 ;
557 ; KNL_32-LABEL: test12:
558 ; KNL_32: # BB#0:
559 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
560 ; KNL_32-NEXT: kxnorw %k1, %k1, %k1
561 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
562 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
563 ; KNL_32-NEXT: retl
564 ;
565 ; SKX-LABEL: test12:
566 ; SKX: # BB#0:
567 ; SKX-NEXT: kxnorw %k1, %k1, %k1
568 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
569 ; SKX-NEXT: vmovaps %zmm1, %zmm0
570 ; SKX-NEXT: retq
256571
257572 %sext_ind = sext <16 x i32> %ind to <16 x i64>
258573 %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
262577 }
263578
264579 ; The same as the previous, but the mask is undefined
265 ; KNL-LABEL: test13
266 ; KNL-NOT: kxnorw
267 ; KNL: vgatherdps (%rdi,%zmm
268580 define <16 x float> @test13(float* %base, <16 x i32> %ind) {
581 ; KNL_64-LABEL: test13:
582 ; KNL_64: # BB#0:
583 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
584 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
585 ; KNL_64-NEXT: retq
586 ;
587 ; KNL_32-LABEL: test13:
588 ; KNL_32: # BB#0:
589 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
590 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
591 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
592 ; KNL_32-NEXT: retl
593 ;
594 ; SKX-LABEL: test13:
595 ; SKX: # BB#0:
596 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
597 ; SKX-NEXT: vmovaps %zmm1, %zmm0
598 ; SKX-NEXT: retq
269599
270600 %sext_ind = sext <16 x i32> %ind to <16 x i64>
271601 %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
275605 }
276606
277607 ; The base pointer is not splat, can't find a uniform base
278 ; KNL-LABEL: test14
279 ; KNL: vgatherqps (,%zmm0)
280 ; KNL: vgatherqps (,%zmm0)
281608 define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
609 ; KNL_64-LABEL: test14:
610 ; KNL_64: # BB#0:
611 ; KNL_64-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1
612 ; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
613 ; KNL_64-NEXT: vpbroadcastq %xmm0, %zmm0
614 ; KNL_64-NEXT: vmovd %esi, %xmm1
615 ; KNL_64-NEXT: vpbroadcastd %xmm1, %ymm1
616 ; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
617 ; KNL_64-NEXT: vpsllq $2, %zmm1, %zmm1
618 ; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
619 ; KNL_64-NEXT: kshiftrw $8, %k0, %k1
620 ; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm1 {%k1}
621 ; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm2 {%k1}
622 ; KNL_64-NEXT: vinsertf64x4 $1, %ymm1, %zmm2, %zmm0
623 ; KNL_64-NEXT: retq
624 ;
625 ; KNL_32-LABEL: test14:
626 ; KNL_32: # BB#0:
627 ; KNL_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm1
628 ; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
629 ; KNL_32-NEXT: vpbroadcastd %xmm0, %zmm0
630 ; KNL_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
631 ; KNL_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1
632 ; KNL_32-NEXT: vgatherdps (,%zmm1), %zmm0 {%k1}
633 ; KNL_32-NEXT: retl
634 ;
635 ; SKX-LABEL: test14:
636 ; SKX: # BB#0:
637 ; SKX-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1
638 ; SKX-NEXT: vinserti64x2 $0, %xmm1, %zmm0, %zmm0
639 ; SKX-NEXT: vpbroadcastq %xmm0, %zmm0
640 ; SKX-NEXT: vmovd %esi, %xmm1
641 ; SKX-NEXT: vpbroadcastd %xmm1, %ymm1
642 ; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
643 ; SKX-NEXT: vpsllq $2, %zmm1, %zmm1
644 ; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
645 ; SKX-NEXT: kshiftrw $8, %k0, %k1
646 ; SKX-NEXT: vgatherqps (,%zmm0), %ymm1 {%k1}
647 ; SKX-NEXT: vgatherqps (,%zmm0), %ymm2 {%k1}
648 ; SKX-NEXT: vinsertf32x8 $1, %ymm1, %zmm2, %zmm0
649 ; SKX-NEXT: retq
650 ;
651 ; SKX_32-LABEL: test14:
652 ; SKX_32: # BB#0:
653 ; SKX_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm1
654 ; SKX_32-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
655 ; SKX_32-NEXT: vpbroadcastd %xmm0, %zmm0
656 ; SKX_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
657 ; SKX_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1
658 ; SKX_32-NEXT: vgatherdps (,%zmm1), %zmm0 {%k1}
659 ; SKX_32-NEXT: retl
282660
283661 %broadcast.splatinsert = insertelement <16 x float*> %vec, float* %base, i32 1
284662 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
289667 ret <16 x float>%res
290668 }
291669
292
293 ; KNL-LABEL: test15
294 ; KNL: kmovw %eax, %k1
295 ; KNL: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
296
297 ; SCALAR-LABEL: test15
298 ; SCALAR: extractelement <16 x float*>
670 declare <4 x float> @llvm.masked.gather.v4f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
671 declare <4 x double> @llvm.masked.gather.v4f64(<4 x double*>, i32, <4 x i1>, <4 x double>)
672 declare <2 x double> @llvm.masked.gather.v2f64(<2 x double*>, i32, <2 x i1>, <2 x double>)
673
674 ; Gather smaller than existing instruction
675 define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) {
676 ;
677 ; KNL_64-LABEL: test15:
678 ; KNL_64: # BB#0:
679 ; KNL_64-NEXT: vpxor %ymm2, %ymm2, %ymm2
680 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
681 ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm2
682 ; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm0
683 ; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
684 ; KNL_64-NEXT: vptestmq %zmm0, %zmm0, %k1
685 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm2,4), %ymm0 {%k1}
686 ; KNL_64-NEXT: retq
687 ;
688 ; KNL_32-LABEL: test15:
689 ; KNL_32: # BB#0:
690 ; KNL_32-NEXT: vpxor %ymm2, %ymm2, %ymm2
691 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
692 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
693 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm2
694 ; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm0
695 ; KNL_32-NEXT: vpandq .LCPI14_0, %zmm0, %zmm0
696 ; KNL_32-NEXT: vptestmq %zmm0, %zmm0, %k1
697 ; KNL_32-NEXT: vgatherqps (%eax,%zmm2,4), %ymm0 {%k1}
698 ; KNL_32-NEXT: retl
699 ;
700 ; SKX-LABEL: test15:
701 ; SKX: # BB#0:
702 ; SKX-NEXT: vpmovd2m %xmm1, %k1
703 ; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1}
704 ; SKX-NEXT: vmovaps %zmm1, %zmm0
705 ; SKX-NEXT: retq
706
707 %sext_ind = sext <4 x i32> %ind to <4 x i64>
708 %gep.random = getelementptr float, float* %base, <4 x i64> %sext_ind
709 %res = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.random, i32 4, <4 x i1> %mask, <4 x float> undef)
710 ret <4 x float>%res
711 }
712
713 ; Gather smaller than existing instruction
714 define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x double> %src0) {
715 ;
716 ; KNL_64-LABEL: test16:
717 ; KNL_64: # BB#0:
718 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
719 ; KNL_64-NEXT: vpsrad $31, %xmm1, %xmm1
720 ; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
721 ; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
722 ; KNL_64-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
723 ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
724 ; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
725 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
726 ; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
727 ; KNL_64-NEXT: vmovaps %zmm2, %zmm0
728 ; KNL_64-NEXT: retq
729 ;
730 ; KNL_32-LABEL: test16:
731 ; KNL_32: # BB#0:
732 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
733 ; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1
734 ; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1
735 ; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
736 ; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
737 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
738 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
739 ; KNL_32-NEXT: vpandq .LCPI15_0, %zmm1, %zmm1
740 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
741 ; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
742 ; KNL_32-NEXT: vmovaps %zmm2, %zmm0
743 ; KNL_32-NEXT: retl
744 ;
745 ; SKX-LABEL: test16:
746 ; SKX: # BB#0:
747 ; SKX-NEXT: vpmovd2m %xmm1, %k1
748 ; SKX-NEXT: vgatherdpd (%rdi,%xmm0,8), %ymm2 {%k1}
749 ; SKX-NEXT: vmovaps %zmm2, %zmm0
750 ; SKX-NEXT: retq
751
752 %sext_ind = sext <4 x i32> %ind to <4 x i64>
753 %gep.random = getelementptr double, double* %base, <4 x i64> %sext_ind
754 %res = call <4 x double> @llvm.masked.gather.v4f64(<4 x double*> %gep.random, i32 4, <4 x i1> %mask, <4 x double> %src0)
755 ret <4 x double>%res
756 }
757
758 define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x double> %src0) {
759 ;
760 ; KNL_64-LABEL: test17:
761 ; KNL_64: # BB#0:
762 ; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
763 ; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
764 ; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
765 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
766 ; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
767 ; KNL_64-NEXT: vmovaps %zmm2, %zmm0
768 ; KNL_64-NEXT: retq
769 ;
770 ; KNL_32-LABEL: test17:
771 ; KNL_32: # BB#0:
772 ; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
773 ; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
774 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
775 ; KNL_32-NEXT: vpandq .LCPI16_0, %zmm1, %zmm1
776 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
777 ; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
778 ; KNL_32-NEXT: vmovaps %zmm2, %zmm0
779 ; KNL_32-NEXT: retl
780 ;
781 ; SKX-LABEL: test17:
782 ; SKX: # BB#0:
783 ; SKX-NEXT: vpmovq2m %xmm1, %k1
784 ; SKX-NEXT: vgatherqpd (%rdi,%xmm0,8), %xmm2 {%k1}
785 ; SKX-NEXT: vmovaps %zmm2, %zmm0
786 ; SKX-NEXT: retq
787
788 %sext_ind = sext <2 x i32> %ind to <2 x i64>
789 %gep.random = getelementptr double, double* %base, <2 x i64> %sext_ind
790 %res = call <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %gep.random, i32 4, <2 x i1> %mask, <2 x double> %src0)
791 ret <2 x double>%res
792 }
793
794 declare void @llvm.masked.scatter.v4i32(<4 x i32> , <4 x i32*> , i32 , <4 x i1> )
795 declare void @llvm.masked.scatter.v4f64(<4 x double> , <4 x double*> , i32 , <4 x i1> )
796 declare void @llvm.masked.scatter.v2i64(<2 x i64> , <2 x i64*> , i32 , <2 x i1> )
797 declare void @llvm.masked.scatter.v2i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> )
798 declare void @llvm.masked.scatter.v2f32(<2 x float> , <2 x float*> , i32 , <2 x i1> )
799
800 define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
801 ;
802 ; KNL_64-LABEL: test18:
803 ; KNL_64: # BB#0:
804 ; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3
805 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
806 ; KNL_64-NEXT: vpmovsxdq %ymm2, %zmm2
807 ; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
808 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1
809 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
810 ; KNL_64-NEXT: retq
811 ;
812 ; KNL_32-LABEL: test18:
813 ; KNL_32: # BB#0:
814 ; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3
815 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
816 ; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1
817 ; KNL_32-NEXT: vpmovsxdq %ymm2, %zmm2
818 ; KNL_32-NEXT: vpandq .LCPI17_0, %zmm2, %zmm2
819 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
820 ; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
821 ; KNL_32-NEXT: retl
822 ;
823 ; SKX-LABEL: test18:
824 ; SKX: # BB#0:
825 ; SKX-NEXT: vpmovd2m %xmm2, %k1
826 ; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
827 ; SKX-NEXT: retq
828 call void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
829 ret void
830 }
831
832 define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind) {
833 ;
834 ; KNL_64-LABEL: test19:
835 ; KNL_64: # BB#0:
836 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
837 ; KNL_64-NEXT: vpsrad $31, %xmm1, %xmm1
838 ; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
839 ; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
840 ; KNL_64-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
841 ; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
842 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
843 ; KNL_64-NEXT: vscatterqpd %zmm0, (%rdi,%zmm2,8) {%k1}
844 ; KNL_64-NEXT: retq
845 ;
846 ; KNL_32-LABEL: test19:
847 ; KNL_32: # BB#0:
848 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
849 ; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1
850 ; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1
851 ; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
852 ; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
853 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
854 ; KNL_32-NEXT: vpandq .LCPI18_0, %zmm1, %zmm1
855 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
856 ; KNL_32-NEXT: vscatterqpd %zmm0, (%eax,%zmm2,8) {%k1}
857 ; KNL_32-NEXT: retl
858 ;
859 ; SKX-LABEL: test19:
860 ; SKX: # BB#0:
861 ; SKX-NEXT: vpmovd2m %xmm1, %k1
862 ; SKX-NEXT: vscatterqpd %ymm0, (%rdi,%ymm2,8) {%k1}
863 ; SKX-NEXT: retq
864 ;
865 ; SKX_32-LABEL: test19:
866 ; SKX_32: # BB#0:
867 ; SKX_32-NEXT: vpmovd2m %xmm1, %k1
868 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
869 ; SKX_32-NEXT: vscatterqpd %ymm0, (%eax,%ymm2,8) {%k1}
870 ; SKX_32-NEXT: retl
871 %gep = getelementptr double, double* %ptr, <4 x i64> %ind
872 call void @llvm.masked.scatter.v4f64(<4 x double> %a1, <4 x double*> %gep, i32 8, <4 x i1> %mask)
873 ret void
874 }
875
876 ; Data type requires widening
877 define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) {
878 ;
879 ; KNL_64-LABEL: test20:
880 ; KNL_64: # BB#0:
881 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
882 ; KNL_64-NEXT: vmovq %xmm2, %xmm2
883 ; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3
884 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
885 ; KNL_64-NEXT: vpmovsxdq %ymm2, %zmm2
886 ; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
887 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1
888 ; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1}
889 ; KNL_64-NEXT: retq
890 ;
891 ; KNL_32-LABEL: test20:
892 ; KNL_32: # BB#0:
893 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
894 ; KNL_32-NEXT: vmovq %xmm2, %xmm2
895 ; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3
896 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
897 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
898 ; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1
899 ; KNL_32-NEXT: vpmovsxdq %ymm2, %zmm2
900 ; KNL_32-NEXT: vpandq .LCPI19_0, %zmm2, %zmm2
901 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
902 ; KNL_32-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1}
903 ; KNL_32-NEXT: retl
904 ;
905 ; SKX-LABEL: test20:
906 ; SKX: # BB#0:
907 ; SKX-NEXT: vpmovq2m %xmm2, %k0
908 ; SKX-NEXT: kshiftlw $2, %k0, %k0
909 ; SKX-NEXT: kshiftrw $2, %k0, %k1
910 ; SKX-NEXT: vscatterqps %xmm0, (,%ymm1) {%k1}
911 ; SKX-NEXT: retq
912 call void @llvm.masked.scatter.v2f32(<2 x float> %a1, <2 x float*> %ptr, i32 4, <2 x i1> %mask)
913 ret void
914 }
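; Illustrative sketch only (not part of the checked-in tests): conceptually, widening
; legalization turns the <2 x float> scatter in test20 into a <4 x float> scatter whose
; two extra mask lanes are explicitly false, so the padding lanes are never stored; the
; extra mask-register shuffling (kshiftlw/kshiftrw) in the SKX output above is a
; by-product of building that widened mask. The v4f32 scatter declaration is assumed
; to follow the same naming scheme as the v4i32/v4f64 declarations in this file.
declare void @llvm.masked.scatter.v4f32(<4 x float>, <4 x float*>, i32, <4 x i1>)

define void @test20_widened_sketch(<2 x float> %a1, <2 x float*> %ptr, <2 x i1> %mask) {
; Widen the data and pointer vectors with undef upper lanes.
  %wide.data = shufflevector <2 x float> %a1, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %wide.ptrs = shufflevector <2 x float*> %ptr, <2 x float*> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
; Widen the mask with explicit false lanes so the padding elements stay inactive.
  %wide.mask = shufflevector <2 x i1> %mask, <2 x i1> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @llvm.masked.scatter.v4f32(<4 x float> %wide.data, <4 x float*> %wide.ptrs, i32 4, <4 x i1> %wide.mask)
  ret void
}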
915
916 ; Data type requires promotion
917 define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
918 ;
919 ; KNL_64-LABEL: test21:
920 ; KNL_64: # BB#0:
921 ; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
922 ; KNL_64-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm2
923 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
924 ; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
925 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1
926 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
927 ; KNL_64-NEXT: retq
928 ;
929 ; KNL_32-LABEL: test21:
930 ; KNL_32: # BB#0:
931 ; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
932 ; KNL_32-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm2
933 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
934 ; KNL_32-NEXT: vpandq .LCPI20_0, %zmm2, %zmm2
935 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
936 ; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
937 ; KNL_32-NEXT: retl
938 ;
939 ; SKX-LABEL: test21:
940 ; SKX: # BB#0:
941 ; SKX-NEXT: vpmovq2m %xmm2, %k0
942 ; SKX-NEXT: kshiftlw $2, %k0, %k0
943 ; SKX-NEXT: kshiftrw $2, %k0, %k1
944 ; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
945 ; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
946 ; SKX-NEXT: retq
947 call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask)
948 ret void
949 }
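; Illustrative sketch only (not part of the checked-in tests): <2 x i32> is not a legal
; type here, so the legalizer promotes each element to i64 and only the low 32 bits of
; every promoted lane are actually stored; the vpshufd xmm0[0,2,2,3] in the checks above
; packs those low dwords together before vpscatterqd. Written by hand, the relationship
; looks roughly like this (test21_promoted_sketch is a hypothetical helper, not an
; existing test):
define void @test21_promoted_sketch(<2 x i64> %promoted, <2 x i32*> %ptr, <2 x i1> %mask) {
; Keep only the low 32 bits of each promoted lane; these are the values that get scattered.
  %data = trunc <2 x i64> %promoted to <2 x i32>
  call void @llvm.masked.scatter.v2i32(<2 x i32> %data, <2 x i32*> %ptr, i32 4, <2 x i1> %mask)
  ret void
}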
950
951 ; The result type requires widening
952 declare <2 x float> @llvm.masked.gather.v2f32(<2 x float*>, i32, <2 x i1>, <2 x float>)
953
954 define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float> %src0) {
955 ;
956 ;
957 ; KNL_64-LABEL: test22:
958 ; KNL_64: # BB#0:
959 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
960 ; KNL_64-NEXT: vmovq %xmm1, %xmm1
961 ; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3
962 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
963 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
964 ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
965 ; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
966 ; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
967 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
968 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1}
969 ; KNL_64-NEXT: vmovaps %zmm2, %zmm0
970 ; KNL_64-NEXT: retq
971 ;
972 ; KNL_32-LABEL: test22:
973 ; KNL_32: # BB#0:
974 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
975 ; KNL_32-NEXT: vmovq %xmm1, %xmm1
976 ; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3
977 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
978 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
979 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
980 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
981 ; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1
982 ; KNL_32-NEXT: vpandq .LCPI21_0, %zmm1, %zmm1
983 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
984 ; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k1}
985 ; KNL_32-NEXT: vmovaps %zmm2, %zmm0
986 ; KNL_32-NEXT: retl
987 ;
988 ; SKX-LABEL: test22:
989 ; SKX: # BB#0:
990 ; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
991 ; SKX-NEXT: vpmovq2m %xmm1, %k0
992 ; SKX-NEXT: kshiftlw $2, %k0, %k0
993 ; SKX-NEXT: kshiftrw $2, %k0, %k1
994 ; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm2 {%k1}
995 ; SKX-NEXT: vmovaps %zmm2, %zmm0
996 ; SKX-NEXT: retq
997 %sext_ind = sext <2 x i32> %ind to <2 x i64>
998 %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
999 %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
1000 ret <2 x float>%res
1001 }
1002
1003 declare <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
1004 declare <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*>, i32, <2 x i1>, <2 x i64>)
1005
1006 define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %src0) {
1007 ;
1008 ; KNL_64-LABEL: test23:
1009 ; KNL_64: # BB#0:
1010 ; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
1011 ; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
1012 ; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
1013 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
1014 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
1015 ; KNL_64-NEXT: vmovaps %zmm2, %zmm0
1016 ; KNL_64-NEXT: retq
1017 ;
1018 ; KNL_32-LABEL: test23:
1019 ; KNL_32: # BB#0:
1020 ; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
1021 ; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
1022 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1023 ; KNL_32-NEXT: vpandq .LCPI22_0, %zmm1, %zmm1
1024 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
1025 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
1026 ; KNL_32-NEXT: vmovaps %zmm2, %zmm0
1027 ; KNL_32-NEXT: retl
1028 ;
1029 ; SKX-LABEL: test23:
1030 ; SKX: # BB#0:
1031 ; SKX-NEXT: vpmovq2m %xmm1, %k1
1032 ; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1}
1033 ; SKX-NEXT: vmovaps %zmm2, %zmm0
1034 ; SKX-NEXT: retq
1035 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1036 %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
1037 %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
1038 ret <2 x i32>%res
1039 }
1040
1041 define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
1042 ;
1043 ;
1044 ; KNL_64-LABEL: test24:
1045 ; KNL_64: # BB#0:
1046 ; KNL_64-NEXT: movb $3, %al
1047 ; KNL_64-NEXT: movzbl %al, %eax
1048 ; KNL_64-NEXT: kmovw %eax, %k1
1049 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
1050 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
1051 ; KNL_64-NEXT: retq
1052 ;
1053 ; KNL_32-LABEL: test24:
1054 ; KNL_32: # BB#0:
1055 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1056 ; KNL_32-NEXT: vpxord %zmm1, %zmm1, %zmm1
1057 ; KNL_32-NEXT: vinserti32x4 $0, .LCPI23_0, %zmm1, %zmm1
1058 ; KNL_32-NEXT: vpandq .LCPI23_1, %zmm1, %zmm1
1059 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
1060 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
1061 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
1062 ; KNL_32-NEXT: retl
1063 ;
1064 ; SKX-LABEL: test24:
1065 ; SKX: # BB#0:
1066 ; SKX-NEXT: kxnorw %k1, %k1, %k1
1067 ; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1}
1068 ; SKX-NEXT: vmovaps %zmm1, %zmm0
1069 ; SKX-NEXT: retq
1070 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1071 %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
1072 %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
1073 ret <2 x i32>%res
1074 }
1075
1076 define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %src0) {
1077 ;
1078 ; KNL_64-LABEL: test25:
1079 ; KNL_64: # BB#0:
1080 ; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
1081 ; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
1082 ; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
1083 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
1084 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
1085 ; KNL_64-NEXT: vmovaps %zmm2, %zmm0
1086 ; KNL_64-NEXT: retq
1087 ;
1088 ; KNL_32-LABEL: test25:
1089 ; KNL_32: # BB#0:
1090 ; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
1091 ; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
1092 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1093 ; KNL_32-NEXT: vpandq .LCPI24_0, %zmm1, %zmm1
1094 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
1095 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
1096 ; KNL_32-NEXT: vmovaps %zmm2, %zmm0
1097 ; KNL_32-NEXT: retl
1098 ;
1099 ; SKX-LABEL: test25:
1100 ; SKX: # BB#0:
1101 ; SKX-NEXT: vpmovq2m %xmm1, %k1
1102 ; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1}
1103 ; SKX-NEXT: vmovaps %zmm2, %zmm0
1104 ; SKX-NEXT: retq
1105 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1106 %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
1107 %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %gep.random, i32 8, <2 x i1> %mask, <2 x i64> %src0)
1108 ret <2 x i64>%res
1109 }
1110
1111 define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
1112 ;
1113 ; KNL_64-LABEL: test26:
1114 ; KNL_64: # BB#0:
1115 ; KNL_64-NEXT: movb $3, %al
1116 ; KNL_64-NEXT: movzbl %al, %eax
1117 ; KNL_64-NEXT: kmovw %eax, %k1
1118 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
1119 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
1120 ; KNL_64-NEXT: retq
1121 ;
1122 ; KNL_32-LABEL: test26:
1123 ; KNL_32: # BB#0:
1124 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1125 ; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2
1126 ; KNL_32-NEXT: vinserti32x4 $0, .LCPI25_0, %zmm2, %zmm2
1127 ; KNL_32-NEXT: vpandq .LCPI25_1, %zmm2, %zmm2
1128 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
1129 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
1130 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
1131 ; KNL_32-NEXT: retl
1132 ;
1133 ; SKX-LABEL: test26:
1134 ; SKX: # BB#0:
1135 ; SKX-NEXT: kxnorw %k1, %k1, %k1
1136 ; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1}
1137 ; SKX-NEXT: vmovaps %zmm1, %zmm0
1138 ; SKX-NEXT: retq
1139 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1140 %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
1141 %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %gep.random, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> %src0)
1142 ret <2 x i64>%res
1143 }
1144
1145 ; Result type requires widening; all-ones mask
1146 define <2 x float> @test27(float* %base, <2 x i32> %ind) {
1147 ;
1148 ; KNL_64-LABEL: test27:
1149 ; KNL_64: # BB#0:
1150 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1151 ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm1
1152 ; KNL_64-NEXT: movb $3, %al
1153 ; KNL_64-NEXT: movzbl %al, %eax
1154 ; KNL_64-NEXT: kmovw %eax, %k1
1155 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
1156 ; KNL_64-NEXT: retq
1157 ;
1158 ; KNL_32-LABEL: test27:
1159 ; KNL_32: # BB#0:
1160 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1161 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1162 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
1163 ; KNL_32-NEXT: movb $3, %cl
1164 ; KNL_32-NEXT: movzbl %cl, %ecx
1165 ; KNL_32-NEXT: kmovw %ecx, %k1
1166 ; KNL_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
1167 ; KNL_32-NEXT: retl
1168 ;
1169 ; SKX-LABEL: test27:
1170 ; SKX: # BB#0:
1171 ; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
1172 ; SKX-NEXT: movb $3, %al
1173 ; SKX-NEXT: kmovb %eax, %k1
1174 ; SKX-NEXT: vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1}
1175 ; SKX-NEXT: retq
1176 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1177 %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
1178 %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x float> undef)
1179 ret <2 x float>%res
1180 }
1181
1182 ; Data type requires promotion, mask is all-ones
1183 define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {
1184 ;
1185 ;
1186 ; KNL_64-LABEL: test28:
1187 ; KNL_64: # BB#0:
1188 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1189 ; KNL_64-NEXT: movb $3, %al
1190 ; KNL_64-NEXT: movzbl %al, %eax
1191 ; KNL_64-NEXT: kmovw %eax, %k1
1192 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
1193 ; KNL_64-NEXT: retq
1194 ;
1195 ; KNL_32-LABEL: test28:
1196 ; KNL_32: # BB#0:
1197 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1198 ; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2
1199 ; KNL_32-NEXT: vinserti32x4 $0, .LCPI27_0, %zmm2, %zmm2
1200 ; KNL_32-NEXT: vpandq .LCPI27_1, %zmm2, %zmm2
1201 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
1202 ; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
1203 ; KNL_32-NEXT: retl
1204 ;
1205 ; SKX-LABEL: test28:
1206 ; SKX: # BB#0:
1207 ; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1208 ; SKX-NEXT: movb $3, %al
1209 ; SKX-NEXT: kmovb %eax, %k1
1210 ; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
1211 ; SKX-NEXT: retq
1212 call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> <i1 true, i1 true>)
1213 ret void
1214 }
1215
1216
1217 ; SCALAR-LABEL: test29
1218 ; SCALAR: extractelement <16 x float*>
2991219 ; SCALAR-NEXT: load float
3001220 ; SCALAR-NEXT: insertelement <16 x float>
3011221 ; SCALAR-NEXT: extractelement <16 x float*>
3021222 ; SCALAR-NEXT: load float
3031223
304 define <16 x float> @test15(float* %base, <16 x i32> %ind) {
1224 define <16 x float> @test29(float* %base, <16 x i32> %ind) {
1225 ; KNL_64-LABEL: test29:
1226 ; KNL_64: # BB#0:
1227 ; KNL_64-NEXT: movw $44, %ax
1228 ; KNL_64-NEXT: kmovw %eax, %k1
1229 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
1230 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
1231 ; KNL_64-NEXT: retq
1232 ;
1233 ; KNL_32-LABEL: test29:
1234 ; KNL_32: # BB#0:
1235 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1236 ; KNL_32-NEXT: movw $44, %cx
1237 ; KNL_32-NEXT: kmovw %ecx, %k1
1238 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
1239 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
1240 ; KNL_32-NEXT: retl
1241 ;
1242 ; SKX-LABEL: test29:
1243 ; SKX: # BB#0:
1244 ; SKX-NEXT: movw $44, %ax
1245 ; SKX-NEXT: kmovw %eax, %k1
1246 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
1247 ; SKX-NEXT: vmovaps %zmm1, %zmm0
1248 ; SKX-NEXT: retq
3051249
3061250 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
3071251 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
3151259
3161260 ; Check non-power-of-2 case. It should be scalarized.
3171261 declare <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*>, i32, <3 x i1>, <3 x i32>)
318 ; KNL-LABEL: test16
319 ; KNL: testb
320 ; KNL: je
321 ; KNL: testb
322 ; KNL: je
323 ; KNL: testb
324 ; KNL: je
325 define <3 x i32> @test16(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) {
1262 define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) {
1263 ; KNL_64-LABEL: test30:
1264 ; KNL_64: # BB#0:
1265 ; KNL_64-NEXT: andl $1, %edx
1266 ; KNL_64-NEXT: kmovw %edx, %k1
1267 ; KNL_64-NEXT: andl $1, %esi
1268 ; KNL_64-NEXT: kmovw %esi, %k2
1269 ; KNL_64-NEXT: movl %edi, %eax
1270 ; KNL_64-NEXT: andl $1, %eax
1271 ; KNL_64-NEXT: kmovw %eax, %k0
1272 ; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
1273 ; KNL_64-NEXT: vpsllq $2, %ymm1, %ymm1
1274 ; KNL_64-NEXT: vpaddq %ymm1, %ymm0, %ymm1
1275 ; KNL_64-NEXT: # implicit-def: %XMM0
1276 ; KNL_64-NEXT: testb $1, %dil
1277 ; KNL_64-NEXT: je .LBB29_2
1278 ; KNL_64-NEXT: # BB#1: # %cond.load
1279 ; KNL_64-NEXT: vmovq %xmm1, %rax
1280 ; KNL_64-NEXT: vmovd (%rax), %xmm0
1281 ; KNL_64-NEXT: .LBB29_2: # %else
1282 ; KNL_64-NEXT: kmovw %k2, %eax
1283 ; KNL_64-NEXT: movl %eax, %ecx
1284 ; KNL_64-NEXT: andl $1, %ecx
1285 ; KNL_64-NEXT: testb %cl, %cl
1286 ; KNL_64-NEXT: je .LBB29_4
1287 ; KNL_64-NEXT: # BB#3: # %cond.load1
1288 ; KNL_64-NEXT: vpextrq $1, %xmm1, %rcx
1289 ; KNL_64-NEXT: vpinsrd $1, (%rcx), %xmm0, %xmm0
1290 ; KNL_64-NEXT: .LBB29_4: # %else2
1291 ; KNL_64-NEXT: kmovw %k1, %ecx
1292 ; KNL_64-NEXT: movl %ecx, %edx
1293 ; KNL_64-NEXT: andl $1, %edx
1294 ; KNL_64-NEXT: testb %dl, %dl
1295 ; KNL_64-NEXT: je .LBB29_6
1296 ; KNL_64-NEXT: # BB#5: # %cond.load4
1297 ; KNL_64-NEXT: vextracti128 $1, %ymm1, %xmm1
1298 ; KNL_64-NEXT: vmovq %xmm1, %rdx
1299 ; KNL_64-NEXT: vpinsrd $2, (%rdx), %xmm0, %xmm0
1300 ; KNL_64-NEXT: .LBB29_6: # %else5
1301 ; KNL_64-NEXT: kmovw %k0, %edx
1302 ; KNL_64-NEXT: vmovd %edx, %xmm1
1303 ; KNL_64-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
1304 ; KNL_64-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
1305 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
1306 ; KNL_64-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0
1307 ; KNL_64-NEXT: retq
1308 ;
1309 ; KNL_32-LABEL: test30:
1310 ; KNL_32: # BB#0:
1311 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1312 ; KNL_32-NEXT: andl $1, %eax
1313 ; KNL_32-NEXT: kmovw %eax, %k1
1314 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1315 ; KNL_32-NEXT: andl $1, %eax
1316 ; KNL_32-NEXT: kmovw %eax, %k2
1317 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1318 ; KNL_32-NEXT: movl %eax, %ecx
1319 ; KNL_32-NEXT: andl $1, %ecx
1320 ; KNL_32-NEXT: kmovw %ecx, %k0
1321 ; KNL_32-NEXT: vpslld $2, %xmm1, %xmm1
1322 ; KNL_32-NEXT: vpaddd %xmm1, %xmm0, %xmm1
1323 ; KNL_32-NEXT: # implicit-def: %XMM0
1324 ; KNL_32-NEXT: testb $1, %al
1325 ; KNL_32-NEXT: je .LBB29_2
1326 ; KNL_32-NEXT: # BB#1: # %cond.load
1327 ; KNL_32-NEXT: vmovd %xmm1, %eax
1328 ; KNL_32-NEXT: vmovd (%eax), %xmm0
1329 ; KNL_32-NEXT: .LBB29_2: # %else
1330 ; KNL_32-NEXT: kmovw %k2, %eax
1331 ; KNL_32-NEXT: movl %eax, %ecx
1332 ; KNL_32-NEXT: andl $1, %ecx
1333 ; KNL_32-NEXT: testb %cl, %cl
1334 ; KNL_32-NEXT: je .LBB29_4
1335 ; KNL_32-NEXT: # BB#3: # %cond.load1
1336 ; KNL_32-NEXT: vpextrd $1, %xmm1, %ecx
1337 ; KNL_32-NEXT: vpinsrd $1, (%ecx), %xmm0, %xmm0
1338 ; KNL_32-NEXT: .LBB29_4: # %else2
1339 ; KNL_32-NEXT: kmovw %k1, %ecx
1340 ; KNL_32-NEXT: movl %ecx, %edx
1341 ; KNL_32-NEXT: andl $1, %edx
1342 ; KNL_32-NEXT: testb %dl, %dl
1343 ; KNL_32-NEXT: je .LBB29_6
1344 ; KNL_32-NEXT: # BB#5: # %cond.load4
1345 ; KNL_32-NEXT: vpextrd $2, %xmm1, %edx
1346 ; KNL_32-NEXT: vpinsrd $2, (%edx), %xmm0, %xmm0
1347 ; KNL_32-NEXT: .LBB29_6: # %else5
1348 ; KNL_32-NEXT: kmovw %k0, %edx
1349 ; KNL_32-NEXT: vmovd %edx, %xmm1
1350 ; KNL_32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
1351 ; KNL_32-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
1352 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
1353 ; KNL_32-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0
1354 ; KNL_32-NEXT: retl
1355 ;
1356 ; SKX-LABEL: test30:
1357 ; SKX: # BB#0:
1358 ; SKX-NEXT: vpmovd2m %xmm2, %k1
1359 ; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp)
1360 ; SKX-NEXT: vpmovsxdq %xmm1, %ymm1
1361 ; SKX-NEXT: vpsllq $2, %ymm1, %ymm1
1362 ; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm1
1363 ; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al
1364 ; SKX-NEXT: # implicit-def: %XMM0
1365 ; SKX-NEXT: andb $1, %al
1366 ; SKX-NEXT: je .LBB29_2
1367 ; SKX-NEXT: # BB#1: # %cond.load
1368 ; SKX-NEXT: vmovq %xmm1, %rax
1369 ; SKX-NEXT: vmovd (%rax), %xmm0
1370 ; SKX-NEXT: .LBB29_2: # %else
1371 ; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp)
1372 ; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al
1373 ; SKX-NEXT: andb $1, %al
1374 ; SKX-NEXT: je .LBB29_4
1375 ; SKX-NEXT: # BB#3: # %cond.load1
1376 ; SKX-NEXT: vpextrq $1, %xmm1, %rax
1377 ; SKX-NEXT: vpinsrd $1, (%rax), %xmm0, %xmm0
1378 ; SKX-NEXT: .LBB29_4: # %else2
1379 ; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp)
1380 ; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al
1381 ; SKX-NEXT: andb $1, %al
1382 ; SKX-NEXT: je .LBB29_6
1383 ; SKX-NEXT: # BB#5: # %cond.load4
1384 ; SKX-NEXT: vextracti128 $1, %ymm1, %xmm1
1385 ; SKX-NEXT: vmovq %xmm1, %rax
1386 ; SKX-NEXT: vpinsrd $2, (%rax), %xmm0, %xmm0
1387 ; SKX-NEXT: .LBB29_6: # %else5
1388 ; SKX-NEXT: vmovdqa32 %xmm0, %xmm3 {%k1}
1389 ; SKX-NEXT: vmovaps %zmm3, %zmm0
1390 ; SKX-NEXT: retq
1391
3261392 %sext_ind = sext <3 x i32> %ind to <3 x i64>
3271393 %gep.random = getelementptr i32, <3 x i32*> %base, <3 x i64> %sext_ind
3281394 %res = call <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*> %gep.random, i32 4, <3 x i1> %mask, <3 x i32> %src0)
3311397
3321398 declare <16 x float*> @llvm.masked.gather.v16p0f32(<16 x float**>, i32, <16 x i1>, <16 x float*>)
3331399
334 ; KNL-LABEL: test17
1400 ; KNL-LABEL: test31
3351401 ; KNL: vpgatherqq
3361402 ; KNL: vpgatherqq
337 define <16 x float*> @test17(<16 x float**> %ptrs) {
1403 define <16 x float*> @test31(<16 x float**> %ptrs) {
1404 ; KNL_64-LABEL: test31:
1405 ; KNL_64: # BB#0:
1406 ; KNL_64-NEXT: kxnorw %k1, %k1, %k1
1407 ; KNL_64-NEXT: kxnorw %k2, %k2, %k2
1408 ; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2}
1409 ; KNL_64-NEXT: kshiftrw $8, %k1, %k1
1410 ; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1}
1411 ; KNL_64-NEXT: vmovaps %zmm2, %zmm0
1412 ; KNL_64-NEXT: vmovaps %zmm3, %zmm1
1413 ; KNL_64-NEXT: retq
1414 ;
1415 ; KNL_32-LABEL: test31:
1416 ; KNL_32: # BB#0:
1417 ; KNL_32-NEXT: kxnorw %k1, %k1, %k1
1418 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1}
1419 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
1420 ; KNL_32-NEXT: retl
1421 ;
1422 ; SKX-LABEL: test31:
1423 ; SKX: # BB#0:
1424 ; SKX-NEXT: kxnorw %k1, %k1, %k1
1425 ; SKX-NEXT: kxnorw %k2, %k2, %k2
1426 ; SKX-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2}
1427 ; SKX-NEXT: kshiftrw $8, %k1, %k1
1428 ; SKX-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1}
1429 ; SKX-NEXT: vmovaps %zmm2, %zmm0
1430 ; SKX-NEXT: vmovaps %zmm3, %zmm1
1431 ; SKX-NEXT: retq
1432 ;
1433 ; SKX_32-LABEL: test31:
1434 ; SKX_32: # BB#0:
1435 ; SKX_32-NEXT: kxnorw %k1, %k1, %k1
1436 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1}
1437 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
1438 ; SKX_32-NEXT: retl
3381439
3391440 %res = call <16 x float*> @llvm.masked.gather.v16p0f32(<16 x float**> %ptrs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float*> undef)
3401441 ret <16 x float*>%res
3411442 }
1443
1444 define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0) {
1445 ; KNL_64-LABEL: test_gather_16i32:
1446 ; KNL_64: # BB#0:
1447 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
1448 ; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1449 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1450 ; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm2
1451 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
1452 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
1453 ; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
1454 ; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
1455 ; KNL_64-NEXT: retq
1456 ;
1457 ; KNL_32-LABEL: test_gather_16i32:
1458 ; KNL_32: # BB#0:
1459 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
1460 ; KNL_32-NEXT: vpandd .LCPI31_0{1to16}, %zmm1, %zmm1
1461 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1462 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
1463 ; KNL_32-NEXT: vmovaps %zmm2, %zmm0
1464 ; KNL_32-NEXT: retl
1465 ;
1466 ; SKX-LABEL: test_gather_16i32:
1467 ; SKX: # BB#0:
1468 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
1469 ; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1470 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
1471 ; SKX-NEXT: vextracti32x8 $1, %zmm3, %ymm2
1472 ; SKX-NEXT: kshiftrw $8, %k1, %k2
1473 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
1474 ; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
1475 ; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm0
1476 ; SKX-NEXT: retq
1477 ;
1478 ; SKX_32-LABEL: test_gather_16i32:
1479 ; SKX_32: # BB#0:
1480 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
1481 ; SKX_32-NEXT: vpandd .LCPI31_0{1to16}, %zmm1, %zmm1
1482 ; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1483 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
1484 ; SKX_32-NEXT: vmovaps %zmm2, %zmm0
1485 ; SKX_32-NEXT: retl
1486 %res = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptrs, i32 4, <16 x i1> %mask, <16 x i32> %src0)
1487 ret <16 x i32> %res
1488 }
1489 define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
1490 ; KNL_64-LABEL: test_gather_16i64:
1491 ; KNL_64: # BB#0:
1492 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
1493 ; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1494 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1495 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
1496 ; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1}
1497 ; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2}
1498 ; KNL_64-NEXT: vmovaps %zmm3, %zmm0
1499 ; KNL_64-NEXT: vmovaps %zmm4, %zmm1
1500 ; KNL_64-NEXT: retq
1501 ;
1502 ; KNL_32-LABEL: test_gather_16i64:
1503 ; KNL_32: # BB#0:
1504 ; KNL_32-NEXT: pushl %ebp
1505 ; KNL_32-NEXT: .Ltmp0:
1506 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
1507 ; KNL_32-NEXT: .Ltmp1:
1508 ; KNL_32-NEXT: .cfi_offset %ebp, -8
1509 ; KNL_32-NEXT: movl %esp, %ebp
1510 ; KNL_32-NEXT: .Ltmp2:
1511 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
1512 ; KNL_32-NEXT: andl $-64, %esp
1513 ; KNL_32-NEXT: subl $64, %esp
1514 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
1515 ; KNL_32-NEXT: vpandd .LCPI32_0{1to16}, %zmm1, %zmm1
1516 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1517 ; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1
1518 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2
1519 ; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm2 {%k1}
1520 ; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1521 ; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k2}
1522 ; KNL_32-NEXT: vmovaps %zmm2, %zmm0
1523 ; KNL_32-NEXT: movl %ebp, %esp
1524 ; KNL_32-NEXT: popl %ebp
1525 ; KNL_32-NEXT: retl
1526 ;
1527 ; SKX-LABEL: test_gather_16i64:
1528 ; SKX: # BB#0:
1529 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
1530 ; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1531 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
1532 ; SKX-NEXT: kshiftrw $8, %k1, %k2
1533 ; SKX-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1}
1534 ; SKX-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2}
1535 ; SKX-NEXT: vmovaps %zmm3, %zmm0
1536 ; SKX-NEXT: vmovaps %zmm4, %zmm1
1537 ; SKX-NEXT: retq
1538 %res = call <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
1539 ret <16 x i64> %res
1540 }
1541 declare <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)
1542 define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0) {
1543 ; KNL_64-LABEL: test_gather_16f32:
1544 ; KNL_64: # BB#0:
1545 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
1546 ; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1547 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1548 ; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm2
1549 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
1550 ; KNL_64-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2}
1551 ; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1}
1552 ; KNL_64-NEXT: vinsertf64x4 $1, %ymm2, %zmm3, %zmm0
1553 ; KNL_64-NEXT: retq
1554 ;
1555 ; KNL_32-LABEL: test_gather_16f32:
1556 ; KNL_32: # BB#0:
1557 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
1558 ; KNL_32-NEXT: vpandd .LCPI33_0{1to16}, %zmm1, %zmm1
1559 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1560 ; KNL_32-NEXT: vgatherdps (,%zmm0), %zmm2 {%k1}
1561 ; KNL_32-NEXT: vmovaps %zmm2, %zmm0
1562 ; KNL_32-NEXT: retl
1563 ;
1564 ; SKX-LABEL: test_gather_16f32:
1565 ; SKX: # BB#0:
1566 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
1567 ; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1568 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
1569 ; SKX-NEXT: vextractf32x8 $1, %zmm3, %ymm2
1570 ; SKX-NEXT: kshiftrw $8, %k1, %k2
1571 ; SKX-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2}
1572 ; SKX-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1}
1573 ; SKX-NEXT: vinsertf32x8 $1, %ymm2, %zmm3, %zmm0
1574 ; SKX-NEXT: retq
1575 %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
1576 ret <16 x float> %res
1577 }
1578 define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0) {
1579 ; KNL_64-LABEL: test_gather_16f64:
1580 ; KNL_64: # BB#0:
1581 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
1582 ; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1583 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1584 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
1585 ; KNL_64-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1}
1586 ; KNL_64-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2}
1587 ; KNL_64-NEXT: vmovaps %zmm3, %zmm0
1588 ; KNL_64-NEXT: vmovaps %zmm4, %zmm1
1589 ; KNL_64-NEXT: retq
1590 ;
1591 ; KNL_32-LABEL: test_gather_16f64:
1592 ; KNL_32: # BB#0:
1593 ; KNL_32-NEXT: pushl %ebp
1594 ; KNL_32-NEXT: .Ltmp3:
1595 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
1596 ; KNL_32-NEXT: .Ltmp4:
1597 ; KNL_32-NEXT: .cfi_offset %ebp, -8
1598 ; KNL_32-NEXT: movl %esp, %ebp
1599 ; KNL_32-NEXT: .Ltmp5:
1600 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
1601 ; KNL_32-NEXT: andl $-64, %esp
1602 ; KNL_32-NEXT: subl $64, %esp
1603 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
1604 ; KNL_32-NEXT: vpandd .LCPI34_0{1to16}, %zmm1, %zmm1
1605 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1606 ; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1
1607 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2
1608 ; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1}
1609 ; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1610 ; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm1 {%k2}
1611 ; KNL_32-NEXT: vmovaps %zmm2, %zmm0
1612 ; KNL_32-NEXT: movl %ebp, %esp
1613 ; KNL_32-NEXT: popl %ebp
1614 ; KNL_32-NEXT: retl
1615 ;
1616 ; SKX-LABEL: test_gather_16f64:
1617 ; SKX: # BB#0:
1618 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
1619 ; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1620 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
1621 ; SKX-NEXT: kshiftrw $8, %k1, %k2
1622 ; SKX-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1}
1623 ; SKX-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2}
1624 ; SKX-NEXT: vmovaps %zmm3, %zmm0
1625 ; SKX-NEXT: vmovaps %zmm4, %zmm1
1626 ; SKX-NEXT: retq
1627 %res = call <16 x double> @llvm.masked.gather.v16f64(<16 x double*> %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
1628 ret <16 x double> %res
1629 }
1630 declare <16 x double> @llvm.masked.gather.v16f64(<16 x double*> %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)
1631 define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0) {
1632 ; KNL_64-LABEL: test_scatter_16i32:
1633 ; KNL_64: # BB#0:
1634 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
1635 ; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1636 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1637 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
1638 ; KNL_64-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
1639 ; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm0
1640 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2}
1641 ; KNL_64-NEXT: retq
1642 ;
1643 ; KNL_32-LABEL: test_scatter_16i32:
1644 ; KNL_32: # BB#0:
1645 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
1646 ; KNL_32-NEXT: vpandd .LCPI35_0{1to16}, %zmm1, %zmm1
1647 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1648 ; KNL_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
1649 ; KNL_32-NEXT: retl
1650 ;
1651 ; SKX-LABEL: test_scatter_16i32:
1652 ; SKX: # BB#0:
1653 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
1654 ; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1655 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
1656 ; SKX-NEXT: kshiftrw $8, %k1, %k2
1657 ; SKX-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
1658 ; SKX-NEXT: vextracti32x8 $1, %zmm3, %ymm0
1659 ; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2}
1660 ; SKX-NEXT: retq
1661 ;
1662 ; SKX_32-LABEL: test_scatter_16i32:
1663 ; SKX_32: # BB#0:
1664 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
1665 ; SKX_32-NEXT: vpandd .LCPI35_0{1to16}, %zmm1, %zmm1
1666 ; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1667 ; SKX_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
1668 ; SKX_32-NEXT: retl
1669 call void @llvm.masked.scatter.v16i32(<16 x i32> %src0, <16 x i32*> %ptrs, i32 4, <16 x i1> %mask)
1670 ret void
1671 }
1672 define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
1673 ; KNL_64-LABEL: test_scatter_16i64:
1674 ; KNL_64: # BB#0:
1675 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
1676 ; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1677 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1678 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
1679 ; KNL_64-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1}
1680 ; KNL_64-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2}
1681 ; KNL_64-NEXT: retq
1682 ;
1683 ; KNL_32-LABEL: test_scatter_16i64:
1684 ; KNL_32: # BB#0:
1685 ; KNL_32-NEXT: pushl %ebp
1686 ; KNL_32-NEXT: .Ltmp6:
1687 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
1688 ; KNL_32-NEXT: .Ltmp7:
1689 ; KNL_32-NEXT: .cfi_offset %ebp, -8
1690 ; KNL_32-NEXT: movl %esp, %ebp
1691 ; KNL_32-NEXT: .Ltmp8:
1692 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
1693 ; KNL_32-NEXT: andl $-64, %esp
1694 ; KNL_32-NEXT: subl $64, %esp
1695 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
1696 ; KNL_32-NEXT: vpandd .LCPI36_0{1to16}, %zmm1, %zmm1
1697 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1698 ; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1
1699 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2
1700 ; KNL_32-NEXT: vpscatterdq %zmm2, (,%ymm0) {%k1}
1701 ; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1702 ; KNL_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2}
1703 ; KNL_32-NEXT: movl %ebp, %esp
1704 ; KNL_32-NEXT: popl %ebp
1705 ; KNL_32-NEXT: retl
1706 ;
1707 ; SKX-LABEL: test_scatter_16i64:
1708 ; SKX: # BB#0:
1709 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
1710 ; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1711 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
1712 ; SKX-NEXT: kshiftrw $8, %k1, %k2
1713 ; SKX-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1}
1714 ; SKX-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2}
1715 ; SKX-NEXT: retq
1716 call void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32 4, <16 x i1> %mask)
1717 ret void
1718 }
1719 declare void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32, <16 x i1> %mask)
1720 define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0) {
1721 ; KNL_64-LABEL: test_scatter_16f32:
1722 ; KNL_64: # BB#0:
1723 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
1724 ; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1725 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1726 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
1727 ; KNL_64-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
1728 ; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm0
1729 ; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2}
1730 ; KNL_64-NEXT: retq
1731 ;
1732 ; KNL_32-LABEL: test_scatter_16f32:
1733 ; KNL_32: # BB#0:
1734 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
1735 ; KNL_32-NEXT: vpandd .LCPI37_0{1to16}, %zmm1, %zmm1
1736 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1737 ; KNL_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1}
1738 ; KNL_32-NEXT: retl
1739 ;
1740 ; SKX-LABEL: test_scatter_16f32:
1741 ; SKX: # BB#0:
1742 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
1743 ; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1744 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
1745 ; SKX-NEXT: kshiftrw $8, %k1, %k2
1746 ; SKX-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
1747 ; SKX-NEXT: vextractf32x8 $1, %zmm3, %ymm0
1748 ; SKX-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2}
1749 ; SKX-NEXT: retq
1750 call void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32 4, <16 x i1> %mask)
1751 ret void
1752 }
1753 declare void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32, <16 x i1> %mask)
1754 define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0) {
1755 ; KNL_64-LABEL: test_scatter_16f64:
1756 ; KNL_64: # BB#0:
1757 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
1758 ; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1759 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1760 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
1761 ; KNL_64-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1}
1762 ; KNL_64-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2}
1763 ; KNL_64-NEXT: retq
1764 ;
1765 ; KNL_32-LABEL: test_scatter_16f64:
1766 ; KNL_32: # BB#0:
1767 ; KNL_32-NEXT: pushl %ebp
1768 ; KNL_32-NEXT: .Ltmp9:
1769 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
1770 ; KNL_32-NEXT: .Ltmp10:
1771 ; KNL_32-NEXT: .cfi_offset %ebp, -8
1772 ; KNL_32-NEXT: movl %esp, %ebp
1773 ; KNL_32-NEXT: .Ltmp11:
1774 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
1775 ; KNL_32-NEXT: andl $-64, %esp
1776 ; KNL_32-NEXT: subl $64, %esp
1777 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
1778 ; KNL_32-NEXT: vpandd .LCPI38_0{1to16}, %zmm1, %zmm1
1779 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1780 ; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1
1781 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2
1782 ; KNL_32-NEXT: vscatterdpd %zmm2, (,%ymm0) {%k1}
1783 ; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1784 ; KNL_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2}
1785 ; KNL_32-NEXT: movl %ebp, %esp
1786 ; KNL_32-NEXT: popl %ebp
1787 ; KNL_32-NEXT: retl
1788 ;
1789 ; SKX-LABEL: test_scatter_16f64:
1790 ; SKX: # BB#0:
1791 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
1792 ; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
1793 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
1794 ; SKX-NEXT: kshiftrw $8, %k1, %k2
1795 ; SKX-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1}
1796 ; SKX-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2}
1797 ; SKX-NEXT: retq
1798 call void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32 4, <16 x i1> %mask)
1799 ret void
1800 }
1801 declare void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32, <16 x i1> %mask)
0 ; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=knl < %s | FileCheck %s -check-prefix=AVX512
1 ; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX2
2 ; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=AVX_SCALAR
3 ; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=skx < %s | FileCheck %s -check-prefix=SKX
0 ; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=knl < %s | FileCheck %s --check-prefix=AVX512
1 ; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=core-avx2 < %s | FileCheck %s --check-prefix=AVX2
2 ; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s --check-prefix=AVX_SCALAR
3 ; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=skx < %s | FileCheck %s --check-prefix=SKX
44
55 ; AVX512-LABEL: test1
66 ; AVX512: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
273273 ; AVX2-NOT: blend
274274 ; AVX2: ret
275275 define <2 x float> @test18(<2 x i32> %trigger, <2 x float>* %addr) {
276 ; SKX-LABEL: test18:
277 ; SKX: ## BB#0:
278 ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
279 ; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
280 ; SKX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
281 ; SKX-NEXT: kshiftlw $2, %k0, %k0
282 ; SKX-NEXT: kshiftrw $2, %k0, %k1
283 ; SKX-NEXT: vmovups (%rdi), %xmm0 {%k1} {z}
284 ; SKX-NEXT: retq
276285 %mask = icmp eq <2 x i32> %trigger, zeroinitializer
277286 %res = call <2 x float> @llvm.masked.load.v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>undef)
278287 ret <2 x float> %res
362371 %res = call <16 x %mystruct*> @llvm.masked.load.v16p0mystruct(<16 x %mystruct*>* %addr, i32 4, <16 x i1>%mask, <16 x %mystruct*>zeroinitializer)
363372 ret <16 x %mystruct*> %res
364373 }
374
375 define void @test_store_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
376 ; SKX-LABEL: test_store_16i64:
377 ; SKX: ## BB#0:
378 ; SKX-NEXT: vpmovb2m %xmm0, %k1
379 ; SKX-NEXT: vmovdqu64 %zmm1, (%rdi) {%k1}
380 ; SKX-NEXT: kshiftrw $8, %k1, %k1
381 ; SKX-NEXT: vmovdqu64 %zmm2, 64(%rdi) {%k1}
382 ; SKX-NEXT: retq
383 call void @llvm.masked.store.v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32 4, <16 x i1> %mask)
384 ret void
385 }
386 declare void @llvm.masked.store.v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32, <16 x i1> %mask)
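As in the scatter test, the 16-element store is split in two: vpmovb2m converts the incoming mask bytes straight into a mask register on SKX, the first vmovdqu64 stores elements 0-7 at (%rdi), kshiftrw $8 moves mask bits 8-15 into the low half of %k1, and the second vmovdqu64 stores elements 8-15 at 64(%rdi) (8 x i64 is 64 bytes). test_store_16f64 below is the same pattern with vmovupd.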
387 define void @test_store_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x double> %src0) {
388 ; SKX-LABEL: test_store_16f64:
389 ; SKX: ## BB#0:
390 ; SKX-NEXT: vpmovb2m %xmm0, %k1
391 ; SKX-NEXT: vmovupd %zmm1, (%rdi) {%k1}
392 ; SKX-NEXT: kshiftrw $8, %k1, %k1
393 ; SKX-NEXT: vmovupd %zmm2, 64(%rdi) {%k1}
394 ; SKX-NEXT: retq
395 call void @llvm.masked.store.v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32 4, <16 x i1> %mask)
396 ret void
397 }
398 declare void @llvm.masked.store.v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32, <16 x i1> %mask)
399 define <16 x i64> @test_load_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
400 ; SKX-LABEL: test_load_16i64:
401 ; SKX: ## BB#0:
402 ; SKX-NEXT: vpmovb2m %xmm0, %k1
403 ; SKX-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1}
404 ; SKX-NEXT: kshiftrw $8, %k1, %k1
405 ; SKX-NEXT: vmovdqu64 64(%rdi), %zmm2 {%k1}
406 ; SKX-NEXT: vmovaps %zmm1, %zmm0
407 ; SKX-NEXT: vmovaps %zmm2, %zmm1
408 ; SKX-NEXT: retq
409 %res = call <16 x i64> @llvm.masked.load.v16i64(<16 x i64>* %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
410 ret <16 x i64> %res
411 }
412 declare <16 x i64> @llvm.masked.load.v16i64(<16 x i64>* %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)
413 define <16 x double> @test_load_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x double> %src0) {
414 ; SKX-LABEL: test_load_16f64:
415 ; SKX: ## BB#0:
416 ; SKX-NEXT: vpmovb2m %xmm0, %k1
417 ; SKX-NEXT: vmovupd (%rdi), %zmm1 {%k1}
418 ; SKX-NEXT: kshiftrw $8, %k1, %k1
419 ; SKX-NEXT: vmovupd 64(%rdi), %zmm2 {%k1}
420 ; SKX-NEXT: vmovaps %zmm1, %zmm0
421 ; SKX-NEXT: vmovaps %zmm2, %zmm1
422 ; SKX-NEXT: retq
423 %res = call <16 x double> @llvm.masked.load.v16f64(<16 x double>* %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
424 ret <16 x double> %res
425 }
426 declare <16 x double> @llvm.masked.load.v16f64(<16 x double>* %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)
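The 16-element loads legalize exactly like the stores: two 8-element masked loads from (%rdi) and 64(%rdi), with kshiftrw $8 supplying the mask for the upper half. Since %zmm1 and %zmm2 hold the pass-through value %src0 on entry, the merging loads leave masked-off lanes untouched, and the trailing vmovaps copies merely move the two halves into the %zmm0/%zmm1 return registers.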
427
428 define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32 x double> %src0) {
429 ; SKX-LABEL: test_load_32f64:
430 ; SKX: ## BB#0:
431 ; SKX-NEXT: vpmovb2m %ymm0, %k1
432 ; SKX-NEXT: vmovupd (%rdi), %zmm1 {%k1}
433 ; SKX-NEXT: kshiftrd $16, %k1, %k2
434 ; SKX-NEXT: vmovupd 128(%rdi), %zmm3 {%k2}
435 ; SKX-NEXT: kshiftrw $8, %k1, %k1
436 ; SKX-NEXT: vmovupd 64(%rdi), %zmm2 {%k1}
437 ; SKX-NEXT: kshiftrw $8, %k2, %k1
438 ; SKX-NEXT: vmovupd 192(%rdi), %zmm4 {%k1}
439 ; SKX-NEXT: vmovaps %zmm1, %zmm0
440 ; SKX-NEXT: vmovaps %zmm2, %zmm1
441 ; SKX-NEXT: vmovaps %zmm3, %zmm2
442 ; SKX-NEXT: vmovaps %zmm4, %zmm3
443 ; SKX-NEXT: retq
444 %res = call <32 x double> @llvm.masked.load.v32f64(<32 x double>* %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0)
445 ret <32 x double> %res
446 }
447 declare <32 x double> @llvm.masked.load.v32f64(<32 x double>* %ptrs, i32, <32 x i1> %mask, <32 x double> %src0)
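test_load_32f64 is split twice, ending up as four <8 x double> masked loads at offsets 0, 64, 128 and 192 bytes. The <32 x i1> mask occupies 32 bits of a mask register (vpmovb2m %ymm0), so kshiftrd $16 extracts elements 16-31 into %k2, and the two kshiftrw $8 shifts then expose elements 8-15 and 24-31 for the second load of each 16-element half; the final vmovaps copies place the four quarters into the %zmm0-%zmm3 return registers.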