llvm.org GIT mirror - llvm / commit 1693ef3

[LV][X86] Support of AVX2 Gathers code generation and update the LV with this

This patch depends on: https://reviews.llvm.org/D35348

Support of pattern selection of masked gathers of AVX2 (X86\AVX2 code gen).
Update LoopVectorize to generate gathers for AVX2 processors.

Reviewers: delena, zvi, RKSimon, craig.topper, aaboud, igorb
Reviewed By: delena, RKSimon
Differential Revision: https://reviews.llvm.org/D35772

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@318641 91177308-0d34-0410-b5e6-96231b3b80d8
Author: Mohammed Agabaria, 2 years ago

7 changed file(s) with 347 addition(s) and 782 deletion(s).
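To make the intent of the change concrete, here is a minimal sketch (distilled from the updated avx2-masked-gather.ll test further down, not taken verbatim from the patch; the function name is illustrative) of the kind of IR this commit affects on an AVX2-only target such as -mcpu=core-avx2 or skylake:

  declare <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %ptrs, i32 %align, <2 x i1> %masks, <2 x i64> %passthro)

  ; With this patch, the gather below is selected to a single vpgatherqq on
  ; x86-64 AVX2 instead of being scalarized into a chain of vpextrb/test/branch
  ; blocks (compare the old and new CHECK lines of masked_gather_v2i64 below).
  define <2 x i64> @gather_v2i64(<2 x i64*>* %ptr, <2 x i1> %masks, <2 x i64> %passthro) {
  entry:
    %ld = load <2 x i64*>, <2 x i64*>* %ptr
    %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %ld, i32 0, <2 x i1> %masks, <2 x i64> %passthro)
    ret <2 x i64> %res
  }

On top of the instruction selection work, the cost-model and legality hooks are updated so that LoopVectorize will actually form such gathers when targeting Skylake-class AVX2 processors.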
968968 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
969969 setOperationAction(ISD::BITREVERSE, VT, Custom);
970970 }
971
972 // Special handling for masked gather of 2 elements
973 if (Subtarget.hasAVX2() && !Subtarget.hasAVX512())
974 setOperationAction(ISD::MGATHER, MVT::v2i64, Custom);
971975
972976 if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
973977 bool HasInt256 = Subtarget.hasInt256();
2430024304
2430124305 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
2430224306 SelectionDAG &DAG) {
24303 assert(Subtarget.hasAVX512() &&
24304 "MGATHER/MSCATTER are supported on AVX-512 arch only");
24307 assert(Subtarget.hasAVX2() &&
24308 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
2430524309
2430624310 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
2430724311 SDLoc dl(Op);
2431524319 unsigned NumElts = VT.getVectorNumElements();
2431624320 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
2431724321
24318 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
24322 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
2431924323 !Index.getSimpleValueType().is512BitVector()) {
2432024324 // AVX512F supports only 512-bit vectors. Either the data or the index should
2432124325 // be 512 bits wide. If both the index and data are 256-bit, but
2435824362 SDValue RetOps[] = {Extract, NewGather.getValue(1)};
2435924363 return DAG.getMergeValues(RetOps, dl);
2436024364 }
24361 if (N->getMemoryVT() == MVT::v2i32 && Subtarget.hasVLX()) {
24365 if (N->getMemoryVT() == MVT::v2i32) {
2436224366 // There is a special case when the return type v2i32 is illegal and
2436324367 // the type legalizer extended it to v2i64. Without this conversion we end up
2436424368 // with VPGATHERQQ (reading q-words from the memory) instead of VPGATHERQD.
2436624370 // with index v2i64 and value type v4i32.
2436724371 assert(VT == MVT::v2i64 && Src0.getValueType() == MVT::v2i64 &&
2436824372 "Unexpected type in masked gather");
24369 Src0 = DAG.getVectorShuffle(MVT::v4i32, dl,
24370 DAG.getBitcast(MVT::v4i32, Src0),
24371 DAG.getUNDEF(MVT::v4i32), { 0, 2, -1, -1 });
24373 Src0 =
24374 DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src0),
24375 DAG.getUNDEF(MVT::v4i32), { 0, 2, -1, -1 });
2437224376 // The mask should match the destination type. Extending mask with zeroes
2437324377 // is not necessary since instruction itself reads only two values from
2437424378 // memory.
24379 SDVTList VTList;
24380 if (Subtarget.hasVLX()) {
24381 Mask = ExtendToType(Mask, MVT::v4i1, DAG, false);
24382 VTList = DAG.getVTList(MVT::v4i32, MVT::v2i1, MVT::Other);
24383 }
24384 else {
24385 Mask =
24386 DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Mask),
24387 DAG.getUNDEF(MVT::v4i32), {0, 2, -1, -1});
24388 VTList = DAG.getVTList(MVT::v4i32, MVT::Other);
24389 }
2437524390 SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
2437624391 SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
24377 DAG.getVTList(MVT::v4i32, MVT::v2i1, MVT::Other), Ops, dl,
24378 N->getMemoryVT(), N->getMemOperand());
24392 VTList, Ops, dl, N->getMemoryVT(), N->getMemOperand());
2437924393
2438024394 SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, MVT::v2i64,
2438124395 NewGather.getValue(0), DAG);
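As a hedged illustration of the v2i32 special case handled above (mirroring the masked_gather_v2i32 test below rather than quoting the patch; the function name is illustrative): the type legalizer widens a <2 x i32> gather result to v2i64, and without the conversion above the node would be selected as VPGATHERQQ and read q-words. With it, the lowering shuffles the pass-through value, and (without VLX) the mask, into v4i32 form so that VPGATHERQD is used and the two loaded d-words are sign-extended back:

  declare <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %ptrs, i32 %align, <2 x i1> %masks, <2 x i32> %passthro)

  define <2 x i32> @gather_v2i32(<2 x i32*>* %ptr, <2 x i1> %masks, <2 x i32> %passthro) {
  entry:
    %ld = load <2 x i32*>, <2 x i32*>* %ptr
    %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %ld, i32 0, <2 x i1> %masks, <2 x i32> %passthro)
    ret <2 x i32> %res
  }

On x86-64 with AVX2 this now produces a vpgatherqd followed by vpmovsxdq, as the updated masked_gather_v2i32 CHECK lines show.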
11001100 (X86MTruncUSStore node:$src1, node:$src2, node:$src3), [{
11011101 return cast(N)->getMemoryVT().getScalarType() == MVT::i32;
11021102 }]>;
1103
1104 // AVX2 special nodes
1105 // masked gather of AVX2 where mask elements are i32
1106 def avx2_x86_masked_gather_32 : SDNode<"X86ISD::MGATHER",
1107 SDTypeProfile<2, 3, [
1108 SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<1, 3>,
1109 SDTCisPtrTy<4>, SDTCVecEltisVT<1, i32>, SDTCisSameNumEltsAs<0, 1>]>,
1110 [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
1111
1112 def avx2_masked_gather_32 : SDNode<"ISD::MGATHER",
1113 SDTypeProfile<2, 3, [
1114 SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<1, 3>,
1115 SDTCisPtrTy<4>, SDTCVecEltisVT<1, i32>, SDTCisSameNumEltsAs<0, 1>]>,
1116 [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
1117
1118 // masked gather of AVX2 where mask elements are i64
1119 def avx2_masked_gather_64 : SDNode<"ISD::MGATHER",
1120 SDTypeProfile<2, 3, [
1121 SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<1, 3>,
1122 SDTCisPtrTy<4>, SDTCVecEltisVT<1, i64>, SDTCisSameNumEltsAs<0, 1>]>,
1123 [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
1124
1125 // dword gathers
1126 def avx2_mvpgatherdd_ps_xmm : PatFrag<(ops node:$src1, node:$src2, node:$src3),
1127 (avx2_masked_gather_32 node:$src1, node:$src2, node:$src3) , [{
1128 if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
1129 return (Mgt->getIndex().getValueType() == MVT::v4i32 ||
1130 Mgt->getBasePtr().getValueType() == MVT::v4i32);
1131 return false;
1132 }]>;
1133
1134 def avx2_mvpgatherqd_ps_xmm : PatFrag<(ops node:$src1, node:$src2, node:$src3),
1135 (avx2_x86_masked_gather_32 node:$src1, node:$src2, node:$src3) , [{
1136 if (X86MaskedGatherSDNode *Mgt = dyn_cast<X86MaskedGatherSDNode>(N))
1137 return (Mgt->getIndex().getValueType() == MVT::v2i64 ||
1138 Mgt->getBasePtr().getValueType() == MVT::v2i64);
1139 return false;
1140 }]>;
1141
1142 def avx2_mvpgatherdd_ps_ymm : PatFrag<(ops node:$src1, node:$src2, node:$src3),
1143 (avx2_masked_gather_32 node:$src1, node:$src2, node:$src3) , [{
1144 if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
1145 return (Mgt->getIndex().getValueType() == MVT::v8i32 ||
1146 Mgt->getBasePtr().getValueType() == MVT::v8i32);
1147 return false;
1148 }]>;
1149
1150 def avx2_mvpgatherqd_ps_ymm : PatFrag<(ops node:$src1, node:$src2, node:$src3),
1151 (avx2_masked_gather_32 node:$src1, node:$src2, node:$src3) , [{
1152 if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
1153 return (Mgt->getIndex().getValueType() == MVT::v4i64 ||
1154 Mgt->getBasePtr().getValueType() == MVT::v4i64);
1155 return false;
1156 }]>;
1157
1158 // qwords
1159 def avx2_mvpgatherdq_pd_xmm : PatFrag<(ops node:$src1, node:$src2, node:$src3),
1160 (avx2_masked_gather_64 node:$src1, node:$src2, node:$src3) , [{
1161 if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
1162 return (Mgt->getIndex().getValueType() == MVT::v2i32 ||
1163 Mgt->getBasePtr().getValueType() == MVT::v2i32);
1164 return false;
1165 }]>;
1166
1167 def avx2_mvpgatherqq_pd_xmm : PatFrag<(ops node:$src1, node:$src2, node:$src3),
1168 (avx2_masked_gather_64 node:$src1, node:$src2, node:$src3) , [{
1169 if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
1170 return (Mgt->getIndex().getValueType() == MVT::v2i64 ||
1171 Mgt->getBasePtr().getValueType() == MVT::v2i64) &&
1172 Mgt->getMemoryVT().is128BitVector();
1173 return false;
1174 }]>;
1175
1176 def avx2_mvpgatherdq_pd_ymm : PatFrag<(ops node:$src1, node:$src2, node:$src3),
1177 (avx2_masked_gather_64 node:$src1, node:$src2, node:$src3) , [{
1178 if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
1179 return (Mgt->getIndex().getValueType() == MVT::v4i32 ||
1180 Mgt->getBasePtr().getValueType() == MVT::v4i32);
1181 return false;
1182 }]>;
1183
1184 def avx2_mvpgatherqq_pd_ymm : PatFrag<(ops node:$src1, node:$src2, node:$src3),
1185 (avx2_masked_gather_64 node:$src1, node:$src2, node:$src3) , [{
1187 if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
1187 return (Mgt->getIndex().getValueType() == MVT::v4i64 ||
1188 Mgt->getBasePtr().getValueType() == MVT::v4i64);
1189 return false;
1190 }]>;
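These fragments all wrap the same ISD::MGATHER node (or, for avx2_x86_masked_gather_32, the X86-specific X86ISD::MGATHER node) and differ only in the index type they accept, which is what steers a given gather to the dd, qd, dq or qq instruction form. A hedged example (the function name is illustrative; the behaviour matches the masked_gather_v4i32 test below):

  declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 %align, <4 x i1> %masks, <4 x i32> %passthro)

  define <4 x i32> @gather_v4i32(<4 x i32*> %ptrs, <4 x i1> %masks, <4 x i32> %passthro) {
  entry:
    %res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 0, <4 x i1> %masks, <4 x i32> %passthro)
    ret <4 x i32> %res
  }

On i386 the pointers, and therefore the index vector, are v4i32, so the dd fragment matches and vpgatherdd is emitted; on x86-64 the index vector is v4i64 and the qd fragment matches instead, giving vpgatherqd with a ymm index register, as the masked_gather_v4i32 checks below confirm.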
83258325
83268326 //===----------------------------------------------------------------------===//
83278327 // VGATHER - GATHER Operations
8328 multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
8328 multiclass avx2_gather<bits<8> opc, string OpcodeStr, ValueType VTx,
8329 ValueType VTy, PatFrag GatherNode128,
8330 PatFrag GatherNode256, RegisterClass RC256,
83298331 X86MemOperand memop128, X86MemOperand memop256> {
83308332 def rm : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb),
83318333 (ins VR128:$src1, memop128:$src2, VR128:$mask),
83328334 !strconcat(OpcodeStr,
83338335 "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
8334 []>, VEX;
8336 [(set (VTx VR128:$dst), VR128:$mask_wb,
8337 (GatherNode128 (VTx VR128:$src1), VR128:$mask,
8338 vectoraddr:$src2))]>, VEX;
83358339 def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
83368340 (ins RC256:$src1, memop256:$src2, RC256:$mask),
83378341 !strconcat(OpcodeStr,
83388342 "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
8339 []>, VEX, VEX_L;
8340 }
8341
8342 let mayLoad = 1, hasSideEffects = 0, Constraints
8343 = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
8344 in {
8345 defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", VR256, vx128mem, vx256mem>, VEX_W;
8346 defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", VR256, vx128mem, vy256mem>, VEX_W;
8347 defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", VR256, vx128mem, vy256mem>;
8348 defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", VR128, vx64mem, vy128mem>;
8349
8350 let ExeDomain = SSEPackedDouble in {
8351 defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx128mem, vx256mem>, VEX_W;
8352 defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx128mem, vy256mem>, VEX_W;
8353 }
8354
8355 let ExeDomain = SSEPackedSingle in {
8356 defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx128mem, vy256mem>;
8357 defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx64mem, vy128mem>;
8343 [(set (VTy RC256:$dst), RC256:$mask_wb,
8344 (GatherNode256 (VTy RC256:$src1), RC256:$mask,
8345 vectoraddr:$src2))]>, VEX, VEX_L;
8346 }
8347
8348 let Predicates = [UseAVX2] in {
8349 let mayLoad = 1, hasSideEffects = 0, Constraints
8350 = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
8351 in {
8352 defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", v2i64, v4i64, avx2_mvpgatherdq_pd_xmm,
8353 avx2_mvpgatherdq_pd_ymm, VR256, vx128mem, vx256mem>, VEX_W;
8354 defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", v2i64, v4i64, avx2_mvpgatherqq_pd_xmm,
8355 avx2_mvpgatherqq_pd_ymm, VR256, vx128mem, vy256mem>, VEX_W;
8356 defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", v4i32, v8i32, avx2_mvpgatherdd_ps_xmm,
8357 avx2_mvpgatherdd_ps_ymm, VR256, vx128mem, vy256mem>;
8358 defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", v4i32, v4i32, avx2_mvpgatherqd_ps_xmm,
8359 avx2_mvpgatherqd_ps_ymm, VR128, vx64mem, vy128mem>;
8360
8361 let ExeDomain = SSEPackedDouble in {
8362 defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", v2f64, v4f64, avx2_mvpgatherdq_pd_xmm,
8363 avx2_mvpgatherdq_pd_ymm, VR256, vx128mem, vx256mem>, VEX_W;
8364 defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", v2f64, v4f64, avx2_mvpgatherqq_pd_xmm,
8365 avx2_mvpgatherqq_pd_ymm, VR256, vx128mem, vy256mem>, VEX_W;
8366 }
8367
8368 let ExeDomain = SSEPackedSingle in {
8369 defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", v4f32, v8f32, avx2_mvpgatherdd_ps_xmm,
8370 avx2_mvpgatherdd_ps_ymm, VR256, vx128mem, vy256mem>;
8371 defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", v4f32, v4f32, avx2_mvpgatherqd_ps_xmm,
8372 avx2_mvpgatherqd_ps_ymm, VR128, vx64mem, vy128mem>;
8373 }
83588374 }
83598375 }
83608376
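The instruction definitions above are now wrapped in Predicates = [UseAVX2], so the new selection patterns only fire for AVX2 subtargets that are not using the AVX-512/VLX forms. As a usage sketch, the updated codegen test below exercises them with RUN lines of the form (copied from that test):

  ; RUN: llc < %s -mcpu=skylake -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck --check-prefix=X64 %s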
5050 } // end namespace PICStyles
5151
5252 class X86Subtarget final : public X86GenSubtargetInfo {
53 protected:
54 enum X86SSEEnum {
55 NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F
56 };
57
58 enum X863DNowEnum {
59 NoThreeDNow, MMX, ThreeDNow, ThreeDNowA
60 };
61
53 public:
6254 enum X86ProcFamilyEnum {
63 Others,
55 Others,
6456 IntelAtom,
6557 IntelSLM,
6658 IntelGLM,
7163 IntelSKX,
7264 IntelCannonlake,
7365 IntelIcelake,
66 };
67
68 protected:
69 enum X86SSEEnum {
70 NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F
71 };
72
73 enum X863DNowEnum {
74 NoThreeDNow, MMX, ThreeDNow, ThreeDNowA
7475 };
7576
7677 /// X86 processor family: Intel Atom, and others
23672367
23682368 // Trying to reduce IndexSize to 32 bits for vector 16.
23692369 // By default the IndexSize is equal to pointer size.
2370 unsigned IndexSize = (VF >= 16) ? getIndexSizeInBits(Ptr, DL) :
2371 DL.getPointerSizeInBits();
2370 unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
2371 ? getIndexSizeInBits(Ptr, DL)
2372 : DL.getPointerSizeInBits();
23722373
23732374 Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(),
23742375 IndexSize), VF);
23842385
23852386 // The gather / scatter cost is given by Intel architects. It is a rough
23862387 // number since we are looking at one instruction at a time.
2387 const int GSOverhead = 2;
2388 const int GSOverhead = (Opcode == Instruction::Load)
2389 ? ST->getGatherOverhead()
2390 : ST->getScatterOverhead();
23882391 return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
23892392 Alignment, AddressSpace);
23902393 }
24552458 // the mask vector will add more instructions. Right now we give the scalar
24562459 // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction
24572460 // is better in the VariableMask case.
2458 if (VF == 2 || (VF == 4 && !ST->hasVLX()))
2461 if (ST->hasAVX512() && (VF == 2 || (VF == 4 && !ST->hasVLX())))
24592462 Scalarize = true;
24602463
24612464 if (Scalarize)
25142517 int DataWidth = isa<PointerType>(ScalarTy) ?
25152518 DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
25162519
2517 // AVX-512 allows gather and scatter
2518 return (DataWidth == 32 || DataWidth == 64) && ST->hasAVX512();
2520 // AVX-512 and Skylake AVX2 allow gather and scatter
2521 return (DataWidth == 32 || DataWidth == 64) && (ST->hasAVX512() ||
2522 ST->getProcFamily() == X86Subtarget::IntelSkylake);
25192523 }
25202524
25212525 bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
2526 // AVX2 doesn't support scatter
2527 if (!ST->hasAVX512())
2528 return false;
25222529 return isLegalMaskedGather(DataType);
25232530 }
25242531
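Taken together, the TTI changes above have two user-visible effects: the gather cost now comes from the subtarget's gather overhead (the cost-model test below expects, for example, a cost of 4 for a <2 x double> gather with -mcpu=skylake versus 7 with plain core-avx2), and isLegalMaskedGather now also returns true for the Skylake client family, so LoopVectorize is allowed to form llvm.masked.gather calls there. A hedged sketch, not part of the patch, of the kind of indirect load this enables when targeting Skylake; whether a gather is actually emitted, and at which VF, is still decided by the cost model:

  define void @indexed_copy(float* %a, i32* %idx, float* %b, i64 %n) {
  entry:
    br label %loop
  loop:
    %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
    %idx.addr = getelementptr inbounds i32, i32* %idx, i64 %i
    %j = load i32, i32* %idx.addr
    %j.ext = sext i32 %j to i64
    %a.addr = getelementptr inbounds float, float* %a, i64 %j.ext
    ; this indirect load is the candidate for an llvm.masked.gather / vgatherdps
    %v = load float, float* %a.addr
    %b.addr = getelementptr inbounds float, float* %b, i64 %i
    store float %v, float* %b.addr
    %i.next = add nuw nsw i64 %i, 1
    %cond = icmp eq i64 %i.next, %n
    br i1 %cond, label %exit, label %loop
  exit:
    ret void
  }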
0 ; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -cost-model -analyze < %s | FileCheck %s --check-prefix=AVX2
1 ; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=skylake -cost-model -analyze < %s | FileCheck %s --check-prefix=SKL
12 ; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=knl -cost-model -analyze < %s | FileCheck %s --check-prefix=KNL
23 ; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=skx -cost-model -analyze < %s | FileCheck %s --check-prefix=SKX
34
7172 ; AVX2-LABEL: test_gather_2f64
7273 ; AVX2: Found an estimated cost of 7 {{.*}}.gather
7374
75 ; SKL-LABEL: test_gather_2f64
76 ; SKL: Found an estimated cost of 4 {{.*}}.gather
77
7478 ; KNL-LABEL: test_gather_2f64
7579 ; KNL: Found an estimated cost of 7 {{.*}}.gather
7680
8791 ; AVX2-LABEL: test_gather_4i32
8892 ; AVX2: Found an estimated cost of 16 {{.*}}.gather
8993
94 ; SKL-LABEL: test_gather_4i32
95 ; SKL: Found an estimated cost of 6 {{.*}}.gather
96
9097 ; KNL-LABEL: test_gather_4i32
9198 ; KNL: Found an estimated cost of 16 {{.*}}.gather
9299
102109 ; AVX2-LABEL: test_gather_4i32_const_mask
103110 ; AVX2: Found an estimated cost of 8 {{.*}}.gather
104111
112 ; SKL-LABEL: test_gather_4i32_const_mask
113 ; SKL: Found an estimated cost of 6 {{.*}}.gather
114
105115 ; KNL-LABEL: test_gather_4i32_const_mask
106116 ; KNL: Found an estimated cost of 8 {{.*}}.gather
107117
118128 ; AVX2-LABEL: test_gather_16f32_const_mask
119129 ; AVX2: Found an estimated cost of 30 {{.*}}.gather
120130
131 ; SKL-LABEL: test_gather_16f32_const_mask
132 ; SKL: Found an estimated cost of 24 {{.*}}.gather
133
121134 ; KNL-LABEL: test_gather_16f32_const_mask
122135 ; KNL: Found an estimated cost of 18 {{.*}}.gather
123136
136149 ; AVX2-LABEL: test_gather_16f32_var_mask
137150 ; AVX2: Found an estimated cost of 62 {{.*}}.gather
138151
152 ; SKL-LABEL: test_gather_16f32_var_mask
153 ; SKL: Found an estimated cost of 24 {{.*}}.gather
154
139155 ; KNL-LABEL: test_gather_16f32_var_mask
140156 ; KNL: Found an estimated cost of 18 {{.*}}.gather
141157
154170 ; AVX2-LABEL: test_gather_16f32_ra_var_mask
155171 ; AVX2: Found an estimated cost of 62 {{.*}}.gather
156172
173 ; SKL-LABEL: test_gather_16f32_ra_var_mask
174 ; SKL: Found an estimated cost of 24 {{.*}}.gather
175
157176 ; KNL-LABEL: test_gather_16f32_ra_var_mask
158177 ; KNL: Found an estimated cost of 20 {{.*}}.gather
159178
171190
172191 ; AVX2-LABEL: test_gather_16f32_const_mask2
173192 ; AVX2: Found an estimated cost of 30 {{.*}}.gather
193
194 ; SKL-LABEL: test_gather_16f32_const_mask2
195 ; SKL: Found an estimated cost of 24 {{.*}}.gather
174196
175197 ; KNL-LABEL: test_gather_16f32_const_mask2
176198 ; KNL: Found an estimated cost of 18 {{.*}}.gather
192214 ; AVX2-LABEL: test_scatter_16i32
193215 ; AVX2: Found an estimated cost of 64 {{.*}}.scatter
194216
217 ; SKL-LABEL: test_scatter_16i32
218 ; SKL: Found an estimated cost of 64 {{.*}}.scatter
219
195220 ; KNL-LABEL: test_scatter_16i32
196221 ; KNL: Found an estimated cost of 18 {{.*}}.scatter
197222
211236 ; AVX2-LABEL: test_scatter_8i32
212237 ; AVX2: Found an estimated cost of 32 {{.*}}.scatter
213238
239 ; SKL-LABEL: test_scatter_8i32
240 ; SKL: Found an estimated cost of 32 {{.*}}.scatter
241
214242 ; KNL-LABEL: test_scatter_8i32
215243 ; KNL: Found an estimated cost of 10 {{.*}}.scatter
216244
227255 ; AVX2-LABEL: test_scatter_4i32
228256 ; AVX2: Found an estimated cost of 16 {{.*}}.scatter
229257
258 ; SKL-LABEL: test_scatter_4i32
259 ; SKL: Found an estimated cost of 16 {{.*}}.scatter
260
230261 ; KNL-LABEL: test_scatter_4i32
231262 ; KNL: Found an estimated cost of 16 {{.*}}.scatter
232263
242273 ; AVX2-LABEL: test_gather_4f32
243274 ; AVX2: Found an estimated cost of 15 {{.*}}.gather
244275
276 ; SKL-LABEL: test_gather_4f32
277 ; SKL: Found an estimated cost of 6 {{.*}}.gather
278
245279 ; KNL-LABEL: test_gather_4f32
246280 ; KNL: Found an estimated cost of 15 {{.*}}.gather
247281
259293
260294 ; AVX2-LABEL: test_gather_4f32_const_mask
261295 ; AVX2: Found an estimated cost of 7 {{.*}}.gather
296
297 ; SKL-LABEL: test_gather_4f32_const_mask
298 ; SKL: Found an estimated cost of 6 {{.*}}.gather
262299
263300 ; KNL-LABEL: test_gather_4f32_const_mask
264301 ; KNL: Found an estimated cost of 7 {{.*}}.gather
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1 ; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+avx2 | FileCheck --check-prefix=X86 %s
2 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck --check-prefix=X64 %s
1 ; RUN: llc < %s -mcpu=skylake -mtriple=i386-unknown-linux-gnu -mattr=+avx2 | FileCheck --check-prefix=X86 %s
2 ; RUN: llc < %s -mcpu=skylake -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck --check-prefix=X64 %s
33
44 declare <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %ptrs, i32 %align, <2 x i1> %masks, <2 x i32> %passthro)
55
77 ; X86-LABEL: masked_gather_v2i32:
88 ; X86: # BB#0: # %entry
99 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
10 ; X86-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero
11 ; X86-NEXT: vpextrb $0, %xmm0, %eax
12 ; X86-NEXT: testb $1, %al
13 ; X86-NEXT: # implicit-def: %XMM2
14 ; X86-NEXT: je .LBB0_2
15 ; X86-NEXT: # BB#1: # %cond.load
16 ; X86-NEXT: vmovd %xmm3, %eax
17 ; X86-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
18 ; X86-NEXT: .LBB0_2: # %else
19 ; X86-NEXT: vpextrb $8, %xmm0, %eax
20 ; X86-NEXT: testb $1, %al
21 ; X86-NEXT: je .LBB0_4
22 ; X86-NEXT: # BB#3: # %cond.load1
23 ; X86-NEXT: vpextrd $2, %xmm3, %eax
24 ; X86-NEXT: vpinsrd $2, (%eax), %xmm2, %xmm2
25 ; X86-NEXT: .LBB0_4: # %else2
26 ; X86-NEXT: vpsllq $63, %xmm0, %xmm0
27 ; X86-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
10 ; X86-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
11 ; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
12 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
13 ; X86-NEXT: vpgatherqd %xmm0, (,%xmm2), %xmm1
14 ; X86-NEXT: vpmovsxdq %xmm1, %xmm0
2815 ; X86-NEXT: retl
2916 ;
3017 ; X64-LABEL: masked_gather_v2i32:
3118 ; X64: # BB#0: # %entry
32 ; X64-NEXT: vmovdqa (%rdi), %xmm3
33 ; X64-NEXT: vpextrb $0, %xmm0, %eax
34 ; X64-NEXT: testb $1, %al
35 ; X64-NEXT: # implicit-def: %XMM2
36 ; X64-NEXT: je .LBB0_2
37 ; X64-NEXT: # BB#1: # %cond.load
38 ; X64-NEXT: vmovq %xmm3, %rax
39 ; X64-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
40 ; X64-NEXT: .LBB0_2: # %else
41 ; X64-NEXT: vpextrb $8, %xmm0, %eax
42 ; X64-NEXT: testb $1, %al
43 ; X64-NEXT: je .LBB0_4
44 ; X64-NEXT: # BB#3: # %cond.load1
45 ; X64-NEXT: vpextrq $1, %xmm3, %rax
46 ; X64-NEXT: movl (%rax), %eax
47 ; X64-NEXT: vpinsrq $1, %rax, %xmm2, %xmm2
48 ; X64-NEXT: .LBB0_4: # %else2
49 ; X64-NEXT: vpsllq $63, %xmm0, %xmm0
50 ; X64-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
19 ; X64-NEXT: vmovdqa (%rdi), %xmm2
20 ; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
21 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
22 ; X64-NEXT: vpgatherqd %xmm0, (,%xmm2), %xmm1
23 ; X64-NEXT: vpmovsxdq %xmm1, %xmm0
5124 ; X64-NEXT: retq
5225 entry:
5326 %ld = load <2 x i32*>, <2 x i32*>* %ptr
5528 ret <2 x i32> %res
5629 }
5730
31 define <4 x i32> @masked_gather_v2i32_concat(<2 x i32*>* %ptr, <2 x i1> %masks, <2 x i32> %passthro) {
32 ; X86-LABEL: masked_gather_v2i32_concat:
33 ; X86: # BB#0: # %entry
34 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
35 ; X86-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
36 ; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
37 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
38 ; X86-NEXT: vpgatherqd %xmm0, (,%xmm2), %xmm1
39 ; X86-NEXT: vpmovsxdq %xmm1, %xmm0
40 ; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
41 ; X86-NEXT: retl
42 ;
43 ; X64-LABEL: masked_gather_v2i32_concat:
44 ; X64: # BB#0: # %entry
45 ; X64-NEXT: vmovdqa (%rdi), %xmm2
46 ; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
47 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
48 ; X64-NEXT: vpgatherqd %xmm0, (,%xmm2), %xmm1
49 ; X64-NEXT: vpmovsxdq %xmm1, %xmm0
50 ; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
51 ; X64-NEXT: retq
52 entry:
53 %ld = load <2 x i32*>, <2 x i32*>* %ptr
54 %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %ld, i32 0, <2 x i1> %masks, <2 x i32> %passthro)
55 %res2 = shufflevector <2 x i32> %res, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
56 ret <4 x i32> %res2
57 }
58
5859 declare <2 x float> @llvm.masked.gather.v2float(<2 x float*> %ptrs, i32 %align, <2 x i1> %masks, <2 x float> %passthro)
5960
6061 define <2 x float> @masked_gather_v2float(<2 x float*>* %ptr, <2 x i1> %masks, <2 x float> %passthro) {
6162 ; X86-LABEL: masked_gather_v2float:
6263 ; X86: # BB#0: # %entry
63 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
64 ; X86-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero
65 ; X86-NEXT: vpextrb $0, %xmm0, %eax
66 ; X86-NEXT: testb $1, %al
67 ; X86-NEXT: # implicit-def: %XMM2
68 ; X86-NEXT: je .LBB1_2
69 ; X86-NEXT: # BB#1: # %cond.load
70 ; X86-NEXT: vmovd %xmm3, %eax
71 ; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
72 ; X86-NEXT: .LBB1_2: # %else
73 ; X86-NEXT: vpextrb $8, %xmm0, %eax
74 ; X86-NEXT: testb $1, %al
75 ; X86-NEXT: je .LBB1_4
76 ; X86-NEXT: # BB#3: # %cond.load1
77 ; X86-NEXT: vpextrd $2, %xmm3, %eax
78 ; X86-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
79 ; X86-NEXT: .LBB1_4: # %else2
80 ; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
81 ; X86-NEXT: vpslld $31, %xmm0, %xmm0
82 ; X86-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
64 ; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
65 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
66 ; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
67 ; X86-NEXT: vgatherdps %xmm0, (,%xmm2), %xmm1
68 ; X86-NEXT: vmovaps %xmm1, %xmm0
8369 ; X86-NEXT: retl
8470 ;
8571 ; X64-LABEL: masked_gather_v2float:
8672 ; X64: # BB#0: # %entry
87 ; X64-NEXT: vmovdqa (%rdi), %xmm3
88 ; X64-NEXT: vpextrb $0, %xmm0, %eax
89 ; X64-NEXT: testb $1, %al
90 ; X64-NEXT: # implicit-def: %XMM2
91 ; X64-NEXT: je .LBB1_2
92 ; X64-NEXT: # BB#1: # %cond.load
93 ; X64-NEXT: vmovq %xmm3, %rax
94 ; X64-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
95 ; X64-NEXT: .LBB1_2: # %else
96 ; X64-NEXT: vpextrb $8, %xmm0, %eax
97 ; X64-NEXT: testb $1, %al
98 ; X64-NEXT: je .LBB1_4
99 ; X64-NEXT: # BB#3: # %cond.load1
100 ; X64-NEXT: vpextrq $1, %xmm3, %rax
101 ; X64-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
102 ; X64-NEXT: .LBB1_4: # %else2
103 ; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
104 ; X64-NEXT: vpslld $31, %xmm0, %xmm0
105 ; X64-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
73 ; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
74 ; X64-NEXT: vmovaps (%rdi), %xmm2
75 ; X64-NEXT: vgatherqps %xmm0, (,%ymm2), %xmm1
76 ; X64-NEXT: vmovaps %xmm1, %xmm0
77 ; X64-NEXT: vzeroupper
10678 ; X64-NEXT: retq
10779 entry:
10880 %ld = load <2 x float*>, <2 x float*>* %ptr
11082 ret <2 x float> %res
11183 }
11284
85 define <4 x float> @masked_gather_v2float_concat(<2 x float*>* %ptr, <2 x i1> %masks, <2 x float> %passthro) {
86 ; X86-LABEL: masked_gather_v2float_concat:
87 ; X86: # BB#0: # %entry
88 ; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
89 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
90 ; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
91 ; X86-NEXT: vgatherdps %xmm0, (,%xmm2), %xmm1
92 ; X86-NEXT: vmovaps %xmm1, %xmm0
93 ; X86-NEXT: retl
94 ;
95 ; X64-LABEL: masked_gather_v2float_concat:
96 ; X64: # BB#0: # %entry
97 ; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
98 ; X64-NEXT: vmovaps (%rdi), %xmm2
99 ; X64-NEXT: vgatherqps %xmm0, (,%ymm2), %xmm1
100 ; X64-NEXT: vmovaps %xmm1, %xmm0
101 ; X64-NEXT: vzeroupper
102 ; X64-NEXT: retq
103 entry:
104 %ld = load <2 x float*>, <2 x float*>* %ptr
105 %res = call <2 x float> @llvm.masked.gather.v2float(<2 x float*> %ld, i32 0, <2 x i1> %masks, <2 x float> %passthro)
106 %res2 = shufflevector <2 x float> %res, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
107 ret <4 x float> %res2
108 }
109
110
113111 declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 %align, <4 x i1> %masks, <4 x i32> %passthro)
114112
115113 define <4 x i32> @masked_gather_v4i32(<4 x i32*> %ptrs, <4 x i1> %masks, <4 x i32> %passthro) {
116114 ; X86-LABEL: masked_gather_v4i32:
117115 ; X86: # BB#0: # %entry
118 ; X86-NEXT: vpextrb $0, %xmm1, %eax
119 ; X86-NEXT: testb $1, %al
120 ; X86-NEXT: # implicit-def: %XMM3
121 ; X86-NEXT: je .LBB2_2
122 ; X86-NEXT: # BB#1: # %cond.load
123 ; X86-NEXT: vmovd %xmm0, %eax
124 ; X86-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
125 ; X86-NEXT: .LBB2_2: # %else
126 ; X86-NEXT: vpextrb $4, %xmm1, %eax
127 ; X86-NEXT: testb $1, %al
128 ; X86-NEXT: je .LBB2_4
129 ; X86-NEXT: # BB#3: # %cond.load1
130 ; X86-NEXT: vpextrd $1, %xmm0, %eax
131 ; X86-NEXT: vpinsrd $1, (%eax), %xmm3, %xmm3
132 ; X86-NEXT: .LBB2_4: # %else2
133 ; X86-NEXT: vpextrb $8, %xmm1, %eax
134 ; X86-NEXT: testb $1, %al
135 ; X86-NEXT: je .LBB2_6
136 ; X86-NEXT: # BB#5: # %cond.load4
137 ; X86-NEXT: vpextrd $2, %xmm0, %eax
138 ; X86-NEXT: vpinsrd $2, (%eax), %xmm3, %xmm3
139 ; X86-NEXT: .LBB2_6: # %else5
140 ; X86-NEXT: vpextrb $12, %xmm1, %eax
141 ; X86-NEXT: testb $1, %al
142 ; X86-NEXT: je .LBB2_8
143 ; X86-NEXT: # BB#7: # %cond.load7
144 ; X86-NEXT: vpextrd $3, %xmm0, %eax
145 ; X86-NEXT: vpinsrd $3, (%eax), %xmm3, %xmm3
146 ; X86-NEXT: .LBB2_8: # %else8
147 ; X86-NEXT: vpslld $31, %xmm1, %xmm0
148 ; X86-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0
116 ; X86-NEXT: vpgatherdd %xmm1, (,%xmm0), %xmm2
117 ; X86-NEXT: vmovdqa %xmm2, %xmm0
149118 ; X86-NEXT: retl
150119 ;
151120 ; X64-LABEL: masked_gather_v4i32:
152121 ; X64: # BB#0: # %entry
153 ; X64-NEXT: vpextrb $0, %xmm1, %eax
154 ; X64-NEXT: testb $1, %al
155 ; X64-NEXT: # implicit-def: %XMM3
156 ; X64-NEXT: je .LBB2_2
157 ; X64-NEXT: # BB#1: # %cond.load
158 ; X64-NEXT: vmovq %xmm0, %rax
159 ; X64-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
160 ; X64-NEXT: .LBB2_2: # %else
161 ; X64-NEXT: vpextrb $4, %xmm1, %eax
162 ; X64-NEXT: testb $1, %al
163 ; X64-NEXT: je .LBB2_4
164 ; X64-NEXT: # BB#3: # %cond.load1
165 ; X64-NEXT: vpextrq $1, %xmm0, %rax
166 ; X64-NEXT: vpinsrd $1, (%rax), %xmm3, %xmm3
167 ; X64-NEXT: .LBB2_4: # %else2
168 ; X64-NEXT: vpextrb $8, %xmm1, %eax
169 ; X64-NEXT: testb $1, %al
170 ; X64-NEXT: je .LBB2_6
171 ; X64-NEXT: # BB#5: # %cond.load4
172 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm4
173 ; X64-NEXT: vmovq %xmm4, %rax
174 ; X64-NEXT: vpinsrd $2, (%rax), %xmm3, %xmm3
175 ; X64-NEXT: .LBB2_6: # %else5
176 ; X64-NEXT: vpextrb $12, %xmm1, %eax
177 ; X64-NEXT: testb $1, %al
178 ; X64-NEXT: je .LBB2_8
179 ; X64-NEXT: # BB#7: # %cond.load7
180 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm0
181 ; X64-NEXT: vpextrq $1, %xmm0, %rax
182 ; X64-NEXT: vpinsrd $3, (%rax), %xmm3, %xmm3
183 ; X64-NEXT: .LBB2_8: # %else8
184 ; X64-NEXT: vpslld $31, %xmm1, %xmm0
185 ; X64-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0
122 ; X64-NEXT: vpgatherqd %xmm1, (,%ymm0), %xmm2
123 ; X64-NEXT: vmovdqa %xmm2, %xmm0
186124 ; X64-NEXT: vzeroupper
187125 ; X64-NEXT: retq
188126 entry:
195133 define <4 x float> @masked_gather_v4float(<4 x float*> %ptrs, <4 x i1> %masks, <4 x float> %passthro) {
196134 ; X86-LABEL: masked_gather_v4float:
197135 ; X86: # BB#0: # %entry
198 ; X86-NEXT: vpextrb $0, %xmm1, %eax
199 ; X86-NEXT: testb $1, %al
200 ; X86-NEXT: # implicit-def: %XMM3
201 ; X86-NEXT: je .LBB3_2
202 ; X86-NEXT: # BB#1: # %cond.load
203 ; X86-NEXT: vmovd %xmm0, %eax
204 ; X86-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
205 ; X86-NEXT: .LBB3_2: # %else
206 ; X86-NEXT: vpextrb $4, %xmm1, %eax
207 ; X86-NEXT: testb $1, %al
208 ; X86-NEXT: je .LBB3_4
209 ; X86-NEXT: # BB#3: # %cond.load1
210 ; X86-NEXT: vpextrd $1, %xmm0, %eax
211 ; X86-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
212 ; X86-NEXT: .LBB3_4: # %else2
213 ; X86-NEXT: vpextrb $8, %xmm1, %eax
214 ; X86-NEXT: testb $1, %al
215 ; X86-NEXT: je .LBB3_6
216 ; X86-NEXT: # BB#5: # %cond.load4
217 ; X86-NEXT: vpextrd $2, %xmm0, %eax
218 ; X86-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3]
219 ; X86-NEXT: .LBB3_6: # %else5
220 ; X86-NEXT: vpextrb $12, %xmm1, %eax
221 ; X86-NEXT: testb $1, %al
222 ; X86-NEXT: je .LBB3_8
223 ; X86-NEXT: # BB#7: # %cond.load7
224 ; X86-NEXT: vpextrd $3, %xmm0, %eax
225 ; X86-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0]
226 ; X86-NEXT: .LBB3_8: # %else8
227 ; X86-NEXT: vpslld $31, %xmm1, %xmm0
228 ; X86-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0
136 ; X86-NEXT: vgatherdps %xmm1, (,%xmm0), %xmm2
137 ; X86-NEXT: vmovaps %xmm2, %xmm0
229138 ; X86-NEXT: retl
230139 ;
231140 ; X64-LABEL: masked_gather_v4float:
232141 ; X64: # BB#0: # %entry
233 ; X64-NEXT: vpextrb $0, %xmm1, %eax
234 ; X64-NEXT: testb $1, %al
235 ; X64-NEXT: # implicit-def: %XMM3
236 ; X64-NEXT: je .LBB3_2
237 ; X64-NEXT: # BB#1: # %cond.load
238 ; X64-NEXT: vmovq %xmm0, %rax
239 ; X64-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
240 ; X64-NEXT: .LBB3_2: # %else
241 ; X64-NEXT: vpextrb $4, %xmm1, %eax
242 ; X64-NEXT: testb $1, %al
243 ; X64-NEXT: je .LBB3_4
244 ; X64-NEXT: # BB#3: # %cond.load1
245 ; X64-NEXT: vpextrq $1, %xmm0, %rax
246 ; X64-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
247 ; X64-NEXT: .LBB3_4: # %else2
248 ; X64-NEXT: vpextrb $8, %xmm1, %eax
249 ; X64-NEXT: testb $1, %al
250 ; X64-NEXT: je .LBB3_6
251 ; X64-NEXT: # BB#5: # %cond.load4
252 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm4
253 ; X64-NEXT: vmovq %xmm4, %rax
254 ; X64-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3]
255 ; X64-NEXT: .LBB3_6: # %else5
256 ; X64-NEXT: vpextrb $12, %xmm1, %eax
257 ; X64-NEXT: testb $1, %al
258 ; X64-NEXT: je .LBB3_8
259 ; X64-NEXT: # BB#7: # %cond.load7
260 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm0
261 ; X64-NEXT: vpextrq $1, %xmm0, %rax
262 ; X64-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0]
263 ; X64-NEXT: .LBB3_8: # %else8
264 ; X64-NEXT: vpslld $31, %xmm1, %xmm0
265 ; X64-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0
142 ; X64-NEXT: vgatherqps %xmm1, (,%ymm0), %xmm2
143 ; X64-NEXT: vmovaps %xmm2, %xmm0
266144 ; X64-NEXT: vzeroupper
267145 ; X64-NEXT: retq
268146 entry:
275153 define <8 x i32> @masked_gather_v8i32(<8 x i32*>* %ptr, <8 x i1> %masks, <8 x i32> %passthro) {
276154 ; X86-LABEL: masked_gather_v8i32:
277155 ; X86: # BB#0: # %entry
278 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
279 ; X86-NEXT: vmovdqa (%eax), %ymm3
280 ; X86-NEXT: vpextrb $0, %xmm0, %eax
281 ; X86-NEXT: testb $1, %al
282 ; X86-NEXT: # implicit-def: %YMM2
283 ; X86-NEXT: je .LBB4_2
284 ; X86-NEXT: # BB#1: # %cond.load
285 ; X86-NEXT: vmovd %xmm3, %eax
286 ; X86-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
287 ; X86-NEXT: .LBB4_2: # %else
288 ; X86-NEXT: vpextrb $2, %xmm0, %eax
289 ; X86-NEXT: testb $1, %al
290 ; X86-NEXT: je .LBB4_4
291 ; X86-NEXT: # BB#3: # %cond.load1
292 ; X86-NEXT: vpextrd $1, %xmm3, %eax
293 ; X86-NEXT: vpinsrd $1, (%eax), %xmm2, %xmm4
294 ; X86-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
295 ; X86-NEXT: .LBB4_4: # %else2
296 ; X86-NEXT: vpextrb $4, %xmm0, %eax
297 ; X86-NEXT: testb $1, %al
298 ; X86-NEXT: je .LBB4_6
299 ; X86-NEXT: # BB#5: # %cond.load4
300 ; X86-NEXT: vpextrd $2, %xmm3, %eax
301 ; X86-NEXT: vpinsrd $2, (%eax), %xmm2, %xmm4
302 ; X86-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
303 ; X86-NEXT: .LBB4_6: # %else5
304 ; X86-NEXT: vpextrb $6, %xmm0, %eax
305 ; X86-NEXT: testb $1, %al
306 ; X86-NEXT: je .LBB4_8
307 ; X86-NEXT: # BB#7: # %cond.load7
308 ; X86-NEXT: vpextrd $3, %xmm3, %eax
309 ; X86-NEXT: vpinsrd $3, (%eax), %xmm2, %xmm4
310 ; X86-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
311 ; X86-NEXT: .LBB4_8: # %else8
312 ; X86-NEXT: vpextrb $8, %xmm0, %eax
313 ; X86-NEXT: testb $1, %al
314 ; X86-NEXT: je .LBB4_10
315 ; X86-NEXT: # BB#9: # %cond.load10
316 ; X86-NEXT: vextracti128 $1, %ymm3, %xmm4
317 ; X86-NEXT: vmovd %xmm4, %eax
318 ; X86-NEXT: vextracti128 $1, %ymm2, %xmm4
319 ; X86-NEXT: vpinsrd $0, (%eax), %xmm4, %xmm4
320 ; X86-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
321 ; X86-NEXT: .LBB4_10: # %else11
322 ; X86-NEXT: vpextrb $10, %xmm0, %eax
323 ; X86-NEXT: testb $1, %al
324 ; X86-NEXT: je .LBB4_12
325 ; X86-NEXT: # BB#11: # %cond.load13
326 ; X86-NEXT: vextracti128 $1, %ymm3, %xmm4
327 ; X86-NEXT: vpextrd $1, %xmm4, %eax
328 ; X86-NEXT: vextracti128 $1, %ymm2, %xmm4
329 ; X86-NEXT: vpinsrd $1, (%eax), %xmm4, %xmm4
330 ; X86-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
331 ; X86-NEXT: .LBB4_12: # %else14
332 ; X86-NEXT: vpextrb $12, %xmm0, %eax
333 ; X86-NEXT: testb $1, %al
334 ; X86-NEXT: je .LBB4_14
335 ; X86-NEXT: # BB#13: # %cond.load16
336 ; X86-NEXT: vextracti128 $1, %ymm3, %xmm4
337 ; X86-NEXT: vpextrd $2, %xmm4, %eax
338 ; X86-NEXT: vextracti128 $1, %ymm2, %xmm4
339 ; X86-NEXT: vpinsrd $2, (%eax), %xmm4, %xmm4
340 ; X86-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
341 ; X86-NEXT: .LBB4_14: # %else17
342 ; X86-NEXT: vpextrb $14, %xmm0, %eax
343 ; X86-NEXT: testb $1, %al
344 ; X86-NEXT: je .LBB4_16
345 ; X86-NEXT: # BB#15: # %cond.load19
346 ; X86-NEXT: vextracti128 $1, %ymm3, %xmm3
347 ; X86-NEXT: vpextrd $3, %xmm3, %eax
348 ; X86-NEXT: vextracti128 $1, %ymm2, %xmm3
349 ; X86-NEXT: vpinsrd $3, (%eax), %xmm3, %xmm3
350 ; X86-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
351 ; X86-NEXT: .LBB4_16: # %else20
352156 ; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
353 ; X86-NEXT: vpslld $31, %ymm0, %ymm0
354 ; X86-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
157 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
158 ; X86-NEXT: vmovdqa (%eax), %ymm2
159 ; X86-NEXT: vpgatherdd %ymm0, (,%ymm2), %ymm1
160 ; X86-NEXT: vmovdqa %ymm1, %ymm0
355161 ; X86-NEXT: retl
356162 ;
357163 ; X64-LABEL: masked_gather_v8i32:
358164 ; X64: # BB#0: # %entry
359 ; X64-NEXT: vmovdqa (%rdi), %ymm4
360 ; X64-NEXT: vmovdqa 32(%rdi), %ymm3
361 ; X64-NEXT: vpextrb $0, %xmm0, %eax
362 ; X64-NEXT: testb $1, %al
363 ; X64-NEXT: # implicit-def: %YMM2
364 ; X64-NEXT: je .LBB4_2
365 ; X64-NEXT: # BB#1: # %cond.load
366 ; X64-NEXT: vmovq %xmm4, %rax
367 ; X64-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
368 ; X64-NEXT: .LBB4_2: # %else
369 ; X64-NEXT: vpextrb $2, %xmm0, %eax
370 ; X64-NEXT: testb $1, %al
371 ; X64-NEXT: je .LBB4_4
372 ; X64-NEXT: # BB#3: # %cond.load1
373 ; X64-NEXT: vpextrq $1, %xmm4, %rax
374 ; X64-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm5
375 ; X64-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
376 ; X64-NEXT: .LBB4_4: # %else2
377 ; X64-NEXT: vpextrb $4, %xmm0, %eax
378 ; X64-NEXT: testb $1, %al
379 ; X64-NEXT: je .LBB4_6
380 ; X64-NEXT: # BB#5: # %cond.load4
381 ; X64-NEXT: vextracti128 $1, %ymm4, %xmm5
382 ; X64-NEXT: vmovq %xmm5, %rax
383 ; X64-NEXT: vpinsrd $2, (%rax), %xmm2, %xmm5
384 ; X64-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
385 ; X64-NEXT: .LBB4_6: # %else5
386 ; X64-NEXT: vpextrb $6, %xmm0, %eax
387 ; X64-NEXT: testb $1, %al
388 ; X64-NEXT: je .LBB4_8
389 ; X64-NEXT: # BB#7: # %cond.load7
390 ; X64-NEXT: vextracti128 $1, %ymm4, %xmm4
391 ; X64-NEXT: vpextrq $1, %xmm4, %rax
392 ; X64-NEXT: vpinsrd $3, (%rax), %xmm2, %xmm4
393 ; X64-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
394 ; X64-NEXT: .LBB4_8: # %else8
395 ; X64-NEXT: vpextrb $8, %xmm0, %eax
396 ; X64-NEXT: testb $1, %al
397 ; X64-NEXT: je .LBB4_10
398 ; X64-NEXT: # BB#9: # %cond.load10
399 ; X64-NEXT: vmovq %xmm3, %rax
400 ; X64-NEXT: vextracti128 $1, %ymm2, %xmm4
401 ; X64-NEXT: vpinsrd $0, (%rax), %xmm4, %xmm4
402 ; X64-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
403 ; X64-NEXT: .LBB4_10: # %else11
404 ; X64-NEXT: vpextrb $10, %xmm0, %eax
405 ; X64-NEXT: testb $1, %al
406 ; X64-NEXT: je .LBB4_12
407 ; X64-NEXT: # BB#11: # %cond.load13
408 ; X64-NEXT: vpextrq $1, %xmm3, %rax
409 ; X64-NEXT: vextracti128 $1, %ymm2, %xmm4
410 ; X64-NEXT: vpinsrd $1, (%rax), %xmm4, %xmm4
411 ; X64-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
412 ; X64-NEXT: .LBB4_12: # %else14
413 ; X64-NEXT: vpextrb $12, %xmm0, %eax
414 ; X64-NEXT: testb $1, %al
415 ; X64-NEXT: je .LBB4_14
416 ; X64-NEXT: # BB#13: # %cond.load16
417 ; X64-NEXT: vextracti128 $1, %ymm3, %xmm4
418 ; X64-NEXT: vmovq %xmm4, %rax
419 ; X64-NEXT: vextracti128 $1, %ymm2, %xmm4
420 ; X64-NEXT: vpinsrd $2, (%rax), %xmm4, %xmm4
421 ; X64-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
422 ; X64-NEXT: .LBB4_14: # %else17
423 ; X64-NEXT: vpextrb $14, %xmm0, %eax
424 ; X64-NEXT: testb $1, %al
425 ; X64-NEXT: je .LBB4_16
426 ; X64-NEXT: # BB#15: # %cond.load19
427 ; X64-NEXT: vextracti128 $1, %ymm3, %xmm3
428 ; X64-NEXT: vpextrq $1, %xmm3, %rax
429 ; X64-NEXT: vextracti128 $1, %ymm2, %xmm3
430 ; X64-NEXT: vpinsrd $3, (%rax), %xmm3, %xmm3
431 ; X64-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
432 ; X64-NEXT: .LBB4_16: # %else20
433165 ; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
434166 ; X64-NEXT: vpslld $31, %ymm0, %ymm0
435 ; X64-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
167 ; X64-NEXT: vpsrad $31, %ymm0, %ymm0
168 ; X64-NEXT: vmovdqa (%rdi), %ymm2
169 ; X64-NEXT: vmovdqa 32(%rdi), %ymm3
170 ; X64-NEXT: vextracti128 $1, %ymm1, %xmm4
171 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm5
172 ; X64-NEXT: vpgatherqd %xmm5, (,%ymm3), %xmm4
173 ; X64-NEXT: vpgatherqd %xmm0, (,%ymm2), %xmm1
174 ; X64-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm0
436175 ; X64-NEXT: retq
437176 entry:
438177 %ld = load <8 x i32*>, <8 x i32*>* %ptr
445184 define <8 x float> @masked_gather_v8float(<8 x float*>* %ptr, <8 x i1> %masks, <8 x float> %passthro) {
446185 ; X86-LABEL: masked_gather_v8float:
447186 ; X86: # BB#0: # %entry
448 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
449 ; X86-NEXT: vmovdqa (%eax), %ymm3
450 ; X86-NEXT: vpextrb $0, %xmm0, %eax
451 ; X86-NEXT: testb $1, %al
452 ; X86-NEXT: # implicit-def: %YMM2
453 ; X86-NEXT: je .LBB5_2
454 ; X86-NEXT: # BB#1: # %cond.load
455 ; X86-NEXT: vmovd %xmm3, %eax
456 ; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
457 ; X86-NEXT: .LBB5_2: # %else
458 ; X86-NEXT: vpextrb $2, %xmm0, %eax
459 ; X86-NEXT: testb $1, %al
460 ; X86-NEXT: je .LBB5_4
461 ; X86-NEXT: # BB#3: # %cond.load1
462 ; X86-NEXT: vpextrd $1, %xmm3, %eax
463 ; X86-NEXT: vinsertps {{.*#+}} xmm4 = xmm2[0],mem[0],xmm2[2,3]
464 ; X86-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
465 ; X86-NEXT: .LBB5_4: # %else2
466 ; X86-NEXT: vpextrb $4, %xmm0, %eax
467 ; X86-NEXT: testb $1, %al
468 ; X86-NEXT: je .LBB5_6
469 ; X86-NEXT: # BB#5: # %cond.load4
470 ; X86-NEXT: vpextrd $2, %xmm3, %eax
471 ; X86-NEXT: vinsertps {{.*#+}} xmm4 = xmm2[0,1],mem[0],xmm2[3]
472 ; X86-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
473 ; X86-NEXT: .LBB5_6: # %else5
474 ; X86-NEXT: vpextrb $6, %xmm0, %eax
475 ; X86-NEXT: testb $1, %al
476 ; X86-NEXT: je .LBB5_8
477 ; X86-NEXT: # BB#7: # %cond.load7
478 ; X86-NEXT: vpextrd $3, %xmm3, %eax
479 ; X86-NEXT: vinsertps {{.*#+}} xmm4 = xmm2[0,1,2],mem[0]
480 ; X86-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
481 ; X86-NEXT: .LBB5_8: # %else8
482 ; X86-NEXT: vpextrb $8, %xmm0, %eax
483 ; X86-NEXT: testb $1, %al
484 ; X86-NEXT: je .LBB5_10
485 ; X86-NEXT: # BB#9: # %cond.load10
486 ; X86-NEXT: vextracti128 $1, %ymm3, %xmm4
487 ; X86-NEXT: vmovd %xmm4, %eax
488 ; X86-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
489 ; X86-NEXT: vextractf128 $1, %ymm2, %xmm5
490 ; X86-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3]
491 ; X86-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
492 ; X86-NEXT: .LBB5_10: # %else11
493 ; X86-NEXT: vpextrb $10, %xmm0, %eax
494 ; X86-NEXT: testb $1, %al
495 ; X86-NEXT: je .LBB5_12
496 ; X86-NEXT: # BB#11: # %cond.load13
497 ; X86-NEXT: vextracti128 $1, %ymm3, %xmm4
498 ; X86-NEXT: vpextrd $1, %xmm4, %eax
499 ; X86-NEXT: vextractf128 $1, %ymm2, %xmm4
500 ; X86-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[2,3]
501 ; X86-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
502 ; X86-NEXT: .LBB5_12: # %else14
503 ; X86-NEXT: vpextrb $12, %xmm0, %eax
504 ; X86-NEXT: testb $1, %al
505 ; X86-NEXT: je .LBB5_14
506 ; X86-NEXT: # BB#13: # %cond.load16
507 ; X86-NEXT: vextracti128 $1, %ymm3, %xmm4
508 ; X86-NEXT: vpextrd $2, %xmm4, %eax
509 ; X86-NEXT: vextractf128 $1, %ymm2, %xmm4
510 ; X86-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],mem[0],xmm4[3]
511 ; X86-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
512 ; X86-NEXT: .LBB5_14: # %else17
513 ; X86-NEXT: vpextrb $14, %xmm0, %eax
514 ; X86-NEXT: testb $1, %al
515 ; X86-NEXT: je .LBB5_16
516 ; X86-NEXT: # BB#15: # %cond.load19
517 ; X86-NEXT: vextracti128 $1, %ymm3, %xmm3
518 ; X86-NEXT: vpextrd $3, %xmm3, %eax
519 ; X86-NEXT: vextractf128 $1, %ymm2, %xmm3
520 ; X86-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0]
521 ; X86-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
522 ; X86-NEXT: .LBB5_16: # %else20
523187 ; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
524 ; X86-NEXT: vpslld $31, %ymm0, %ymm0
525 ; X86-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
188 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
189 ; X86-NEXT: vmovaps (%eax), %ymm2
190 ; X86-NEXT: vgatherdps %ymm0, (,%ymm2), %ymm1
191 ; X86-NEXT: vmovaps %ymm1, %ymm0
526192 ; X86-NEXT: retl
527193 ;
528194 ; X64-LABEL: masked_gather_v8float:
529195 ; X64: # BB#0: # %entry
530 ; X64-NEXT: vmovdqa (%rdi), %ymm4
531 ; X64-NEXT: vmovdqa 32(%rdi), %ymm3
532 ; X64-NEXT: vpextrb $0, %xmm0, %eax
533 ; X64-NEXT: testb $1, %al
534 ; X64-NEXT: # implicit-def: %YMM2
535 ; X64-NEXT: je .LBB5_2
536 ; X64-NEXT: # BB#1: # %cond.load
537 ; X64-NEXT: vmovq %xmm4, %rax
538 ; X64-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
539 ; X64-NEXT: .LBB5_2: # %else
540 ; X64-NEXT: vpextrb $2, %xmm0, %eax
541 ; X64-NEXT: testb $1, %al
542 ; X64-NEXT: je .LBB5_4
543 ; X64-NEXT: # BB#3: # %cond.load1
544 ; X64-NEXT: vpextrq $1, %xmm4, %rax
545 ; X64-NEXT: vinsertps {{.*#+}} xmm5 = xmm2[0],mem[0],xmm2[2,3]
546 ; X64-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
547 ; X64-NEXT: .LBB5_4: # %else2
548 ; X64-NEXT: vpextrb $4, %xmm0, %eax
549 ; X64-NEXT: testb $1, %al
550 ; X64-NEXT: je .LBB5_6
551 ; X64-NEXT: # BB#5: # %cond.load4
552 ; X64-NEXT: vextracti128 $1, %ymm4, %xmm5
553 ; X64-NEXT: vmovq %xmm5, %rax
554 ; X64-NEXT: vinsertps {{.*#+}} xmm5 = xmm2[0,1],mem[0],xmm2[3]
555 ; X64-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
556 ; X64-NEXT: .LBB5_6: # %else5
557 ; X64-NEXT: vpextrb $6, %xmm0, %eax
558 ; X64-NEXT: testb $1, %al
559 ; X64-NEXT: je .LBB5_8
560 ; X64-NEXT: # BB#7: # %cond.load7
561 ; X64-NEXT: vextracti128 $1, %ymm4, %xmm4
562 ; X64-NEXT: vpextrq $1, %xmm4, %rax
563 ; X64-NEXT: vinsertps {{.*#+}} xmm4 = xmm2[0,1,2],mem[0]
564 ; X64-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
565 ; X64-NEXT: .LBB5_8: # %else8
566 ; X64-NEXT: vpextrb $8, %xmm0, %eax
567 ; X64-NEXT: testb $1, %al
568 ; X64-NEXT: je .LBB5_10
569 ; X64-NEXT: # BB#9: # %cond.load10
570 ; X64-NEXT: vmovq %xmm3, %rax
571 ; X64-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
572 ; X64-NEXT: vextractf128 $1, %ymm2, %xmm5
573 ; X64-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3]
574 ; X64-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
575 ; X64-NEXT: .LBB5_10: # %else11
576 ; X64-NEXT: vpextrb $10, %xmm0, %eax
577 ; X64-NEXT: testb $1, %al
578 ; X64-NEXT: je .LBB5_12
579 ; X64-NEXT: # BB#11: # %cond.load13
580 ; X64-NEXT: vpextrq $1, %xmm3, %rax
581 ; X64-NEXT: vextractf128 $1, %ymm2, %xmm4
582 ; X64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[2,3]
583 ; X64-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
584 ; X64-NEXT: .LBB5_12: # %else14
585 ; X64-NEXT: vpextrb $12, %xmm0, %eax
586 ; X64-NEXT: testb $1, %al
587 ; X64-NEXT: je .LBB5_14
588 ; X64-NEXT: # BB#13: # %cond.load16
589 ; X64-NEXT: vextracti128 $1, %ymm3, %xmm4
590 ; X64-NEXT: vmovq %xmm4, %rax
591 ; X64-NEXT: vextractf128 $1, %ymm2, %xmm4
592 ; X64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],mem[0],xmm4[3]
593 ; X64-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
594 ; X64-NEXT: .LBB5_14: # %else17
595 ; X64-NEXT: vpextrb $14, %xmm0, %eax
596 ; X64-NEXT: testb $1, %al
597 ; X64-NEXT: je .LBB5_16
598 ; X64-NEXT: # BB#15: # %cond.load19
599 ; X64-NEXT: vextracti128 $1, %ymm3, %xmm3
600 ; X64-NEXT: vpextrq $1, %xmm3, %rax
601 ; X64-NEXT: vextractf128 $1, %ymm2, %xmm3
602 ; X64-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0]
603 ; X64-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
604 ; X64-NEXT: .LBB5_16: # %else20
605196 ; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
606197 ; X64-NEXT: vpslld $31, %ymm0, %ymm0
607 ; X64-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
198 ; X64-NEXT: vpsrad $31, %ymm0, %ymm0
199 ; X64-NEXT: vmovaps (%rdi), %ymm2
200 ; X64-NEXT: vmovaps 32(%rdi), %ymm3
201 ; X64-NEXT: vextractf128 $1, %ymm1, %xmm4
202 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm5
203 ; X64-NEXT: vgatherqps %xmm5, (,%ymm3), %xmm4
204 ; X64-NEXT: vgatherqps %xmm0, (,%ymm2), %xmm1
205 ; X64-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm0
608206 ; X64-NEXT: retq
609207 entry:
610208 %ld = load <8 x float*>, <8 x float*>* %ptr
617215 define <4 x i64> @masked_gather_v4i64(<4 x i64*>* %ptr, <4 x i1> %masks, <4 x i64> %passthro) {
618216 ; X86-LABEL: masked_gather_v4i64:
619217 ; X86: # BB#0: # %entry
620 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
621 ; X86-NEXT: vmovdqa (%eax), %xmm3
622 ; X86-NEXT: vpextrb $0, %xmm0, %eax
623 ; X86-NEXT: testb $1, %al
624 ; X86-NEXT: # implicit-def: %YMM2
625 ; X86-NEXT: je .LBB6_2
626 ; X86-NEXT: # BB#1: # %cond.load
627 ; X86-NEXT: vmovd %xmm3, %eax
628 ; X86-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
629 ; X86-NEXT: .LBB6_2: # %else
630 ; X86-NEXT: vpextrb $4, %xmm0, %eax
631 ; X86-NEXT: testb $1, %al
632 ; X86-NEXT: je .LBB6_4
633 ; X86-NEXT: # BB#3: # %cond.load1
634 ; X86-NEXT: vpextrd $1, %xmm3, %eax
635 ; X86-NEXT: vpinsrd $2, (%eax), %xmm2, %xmm4
636 ; X86-NEXT: vpinsrd $3, 4(%eax), %xmm4, %xmm4
637 ; X86-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
638 ; X86-NEXT: .LBB6_4: # %else2
639 ; X86-NEXT: vpextrb $8, %xmm0, %eax
640 ; X86-NEXT: testb $1, %al
641 ; X86-NEXT: je .LBB6_6
642 ; X86-NEXT: # BB#5: # %cond.load4
643 ; X86-NEXT: vpextrd $2, %xmm3, %eax
644 ; X86-NEXT: vextracti128 $1, %ymm2, %xmm4
645 ; X86-NEXT: vpinsrd $0, (%eax), %xmm4, %xmm4
646 ; X86-NEXT: vpinsrd $1, 4(%eax), %xmm4, %xmm4
647 ; X86-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
648 ; X86-NEXT: .LBB6_6: # %else5
649 ; X86-NEXT: vpextrb $12, %xmm0, %eax
650 ; X86-NEXT: testb $1, %al
651 ; X86-NEXT: je .LBB6_8
652 ; X86-NEXT: # BB#7: # %cond.load7
653 ; X86-NEXT: vpextrd $3, %xmm3, %eax
654 ; X86-NEXT: vextracti128 $1, %ymm2, %xmm3
655 ; X86-NEXT: vpinsrd $2, (%eax), %xmm3, %xmm3
656 ; X86-NEXT: vpinsrd $3, 4(%eax), %xmm3, %xmm3
657 ; X86-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
658 ; X86-NEXT: .LBB6_8: # %else8
659218 ; X86-NEXT: vpslld $31, %xmm0, %xmm0
219 ; X86-NEXT: vpsrad $31, %xmm0, %xmm0
660220 ; X86-NEXT: vpmovsxdq %xmm0, %ymm0
661 ; X86-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
221 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
222 ; X86-NEXT: vmovdqa (%eax), %xmm2
223 ; X86-NEXT: vpgatherdq %ymm0, (,%xmm2), %ymm1
224 ; X86-NEXT: vmovdqa %ymm1, %ymm0
662225 ; X86-NEXT: retl
663226 ;
664227 ; X64-LABEL: masked_gather_v4i64:
665228 ; X64: # BB#0: # %entry
666 ; X64-NEXT: vmovdqa (%rdi), %ymm3
667 ; X64-NEXT: vpextrb $0, %xmm0, %eax
668 ; X64-NEXT: testb $1, %al
669 ; X64-NEXT: # implicit-def: %YMM2
670 ; X64-NEXT: je .LBB6_2
671 ; X64-NEXT: # BB#1: # %cond.load
672 ; X64-NEXT: vmovq %xmm3, %rax
673 ; X64-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
674 ; X64-NEXT: .LBB6_2: # %else
675 ; X64-NEXT: vpextrb $4, %xmm0, %eax
676 ; X64-NEXT: testb $1, %al
677 ; X64-NEXT: je .LBB6_4
678 ; X64-NEXT: # BB#3: # %cond.load1
679 ; X64-NEXT: vpextrq $1, %xmm3, %rax
680 ; X64-NEXT: vpinsrq $1, (%rax), %xmm2, %xmm4
681 ; X64-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
682 ; X64-NEXT: .LBB6_4: # %else2
683 ; X64-NEXT: vpextrb $8, %xmm0, %eax
684 ; X64-NEXT: testb $1, %al
685 ; X64-NEXT: je .LBB6_6
686 ; X64-NEXT: # BB#5: # %cond.load4
687 ; X64-NEXT: vextracti128 $1, %ymm3, %xmm4
688 ; X64-NEXT: vmovq %xmm4, %rax
689 ; X64-NEXT: vextracti128 $1, %ymm2, %xmm4
690 ; X64-NEXT: vpinsrq $0, (%rax), %xmm4, %xmm4
691 ; X64-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
692 ; X64-NEXT: .LBB6_6: # %else5
693 ; X64-NEXT: vpextrb $12, %xmm0, %eax
694 ; X64-NEXT: testb $1, %al
695 ; X64-NEXT: je .LBB6_8
696 ; X64-NEXT: # BB#7: # %cond.load7
697 ; X64-NEXT: vextracti128 $1, %ymm3, %xmm3
698 ; X64-NEXT: vpextrq $1, %xmm3, %rax
699 ; X64-NEXT: vextracti128 $1, %ymm2, %xmm3
700 ; X64-NEXT: vpinsrq $1, (%rax), %xmm3, %xmm3
701 ; X64-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
702 ; X64-NEXT: .LBB6_8: # %else8
703229 ; X64-NEXT: vpslld $31, %xmm0, %xmm0
230 ; X64-NEXT: vpsrad $31, %xmm0, %xmm0
704231 ; X64-NEXT: vpmovsxdq %xmm0, %ymm0
705 ; X64-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
232 ; X64-NEXT: vmovdqa (%rdi), %ymm2
233 ; X64-NEXT: vpgatherqq %ymm0, (,%ymm2), %ymm1
234 ; X64-NEXT: vmovdqa %ymm1, %ymm0
706235 ; X64-NEXT: retq
707236 entry:
708237 %ld = load <4 x i64*>, <4 x i64*>* %ptr
715244 define <4 x double> @masked_gather_v4double(<4 x double*>* %ptr, <4 x i1> %masks, <4 x double> %passthro) {
716245 ; X86-LABEL: masked_gather_v4double:
717246 ; X86: # BB#0: # %entry
718 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
719 ; X86-NEXT: vmovdqa (%eax), %xmm3
720 ; X86-NEXT: vpextrb $0, %xmm0, %eax
721 ; X86-NEXT: testb $1, %al
722 ; X86-NEXT: # implicit-def: %YMM2
723 ; X86-NEXT: je .LBB7_2
724 ; X86-NEXT: # BB#1: # %cond.load
725 ; X86-NEXT: vmovd %xmm3, %eax
726 ; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
727 ; X86-NEXT: .LBB7_2: # %else
728 ; X86-NEXT: vpextrb $4, %xmm0, %eax
729 ; X86-NEXT: testb $1, %al
730 ; X86-NEXT: je .LBB7_4
731 ; X86-NEXT: # BB#3: # %cond.load1
732 ; X86-NEXT: vpextrd $1, %xmm3, %eax
733 ; X86-NEXT: vmovhpd {{.*#+}} xmm4 = xmm2[0],mem[0]
734 ; X86-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3]
735 ; X86-NEXT: .LBB7_4: # %else2
736 ; X86-NEXT: vpextrb $8, %xmm0, %eax
737 ; X86-NEXT: testb $1, %al
738 ; X86-NEXT: je .LBB7_6
739 ; X86-NEXT: # BB#5: # %cond.load4
740 ; X86-NEXT: vpextrd $2, %xmm3, %eax
741 ; X86-NEXT: vextractf128 $1, %ymm2, %xmm4
742 ; X86-NEXT: vmovlpd {{.*#+}} xmm4 = mem[0],xmm4[1]
743 ; X86-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
744 ; X86-NEXT: .LBB7_6: # %else5
745 ; X86-NEXT: vpextrb $12, %xmm0, %eax
746 ; X86-NEXT: testb $1, %al
747 ; X86-NEXT: je .LBB7_8
748 ; X86-NEXT: # BB#7: # %cond.load7
749 ; X86-NEXT: vpextrd $3, %xmm3, %eax
750 ; X86-NEXT: vextractf128 $1, %ymm2, %xmm3
751 ; X86-NEXT: vmovhpd {{.*#+}} xmm3 = xmm3[0],mem[0]
752 ; X86-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
753 ; X86-NEXT: .LBB7_8: # %else8
754247 ; X86-NEXT: vpslld $31, %xmm0, %xmm0
248 ; X86-NEXT: vpsrad $31, %xmm0, %xmm0
755249 ; X86-NEXT: vpmovsxdq %xmm0, %ymm0
756 ; X86-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
250 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
251 ; X86-NEXT: vmovapd (%eax), %xmm2
252 ; X86-NEXT: vgatherdpd %ymm0, (,%xmm2), %ymm1
253 ; X86-NEXT: vmovapd %ymm1, %ymm0
757254 ; X86-NEXT: retl
758255 ;
759256 ; X64-LABEL: masked_gather_v4double:
760257 ; X64: # BB#0: # %entry
761 ; X64-NEXT: vmovdqa (%rdi), %ymm3
762 ; X64-NEXT: vpextrb $0, %xmm0, %eax
763 ; X64-NEXT: testb $1, %al
764 ; X64-NEXT: # implicit-def: %YMM2
765 ; X64-NEXT: je .LBB7_2
766 ; X64-NEXT: # BB#1: # %cond.load
767 ; X64-NEXT: vmovq %xmm3, %rax
768 ; X64-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
769 ; X64-NEXT: .LBB7_2: # %else
770 ; X64-NEXT: vpextrb $4, %xmm0, %eax
771 ; X64-NEXT: testb $1, %al
772 ; X64-NEXT: je .LBB7_4
773 ; X64-NEXT: # BB#3: # %cond.load1
774 ; X64-NEXT: vpextrq $1, %xmm3, %rax
775 ; X64-NEXT: vmovhpd {{.*#+}} xmm4 = xmm2[0],mem[0]
776 ; X64-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3]
777 ; X64-NEXT: .LBB7_4: # %else2
778 ; X64-NEXT: vpextrb $8, %xmm0, %eax
779 ; X64-NEXT: testb $1, %al
780 ; X64-NEXT: je .LBB7_6
781 ; X64-NEXT: # BB#5: # %cond.load4
782 ; X64-NEXT: vextracti128 $1, %ymm3, %xmm4
783 ; X64-NEXT: vmovq %xmm4, %rax
784 ; X64-NEXT: vextractf128 $1, %ymm2, %xmm4
785 ; X64-NEXT: vmovlpd {{.*#+}} xmm4 = mem[0],xmm4[1]
786 ; X64-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
787 ; X64-NEXT: .LBB7_6: # %else5
788 ; X64-NEXT: vpextrb $12, %xmm0, %eax
789 ; X64-NEXT: testb $1, %al
790 ; X64-NEXT: je .LBB7_8
791 ; X64-NEXT: # BB#7: # %cond.load7
792 ; X64-NEXT: vextracti128 $1, %ymm3, %xmm3
793 ; X64-NEXT: vpextrq $1, %xmm3, %rax
794 ; X64-NEXT: vextractf128 $1, %ymm2, %xmm3
795 ; X64-NEXT: vmovhpd {{.*#+}} xmm3 = xmm3[0],mem[0]
796 ; X64-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
797 ; X64-NEXT: .LBB7_8: # %else8
798258 ; X64-NEXT: vpslld $31, %xmm0, %xmm0
259 ; X64-NEXT: vpsrad $31, %xmm0, %xmm0
799260 ; X64-NEXT: vpmovsxdq %xmm0, %ymm0
800 ; X64-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
261 ; X64-NEXT: vmovapd (%rdi), %ymm2
262 ; X64-NEXT: vgatherqpd %ymm0, (,%ymm2), %ymm1
263 ; X64-NEXT: vmovapd %ymm1, %ymm0
801264 ; X64-NEXT: retq
802265 entry:
803266 %ld = load <4 x double*>, <4 x double*>* %ptr
811274 ; X86-LABEL: masked_gather_v2i64:
812275 ; X86: # BB#0: # %entry
813276 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
814 ; X86-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero
815 ; X86-NEXT: vpextrb $0, %xmm0, %eax
816 ; X86-NEXT: testb $1, %al
817 ; X86-NEXT: # implicit-def: %XMM2
818 ; X86-NEXT: je .LBB8_2
819 ; X86-NEXT: # BB#1: # %cond.load
820 ; X86-NEXT: vmovd %xmm3, %eax
821 ; X86-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
822 ; X86-NEXT: .LBB8_2: # %else
823 ; X86-NEXT: vpextrb $8, %xmm0, %eax
824 ; X86-NEXT: testb $1, %al
825 ; X86-NEXT: je .LBB8_4
826 ; X86-NEXT: # BB#3: # %cond.load1
827 ; X86-NEXT: vpextrd $2, %xmm3, %eax
828 ; X86-NEXT: vpinsrd $2, (%eax), %xmm2, %xmm2
829 ; X86-NEXT: vpinsrd $3, 4(%eax), %xmm2, %xmm2
830 ; X86-NEXT: .LBB8_4: # %else2
831 ; X86-NEXT: vpsllq $63, %xmm0, %xmm0
832 ; X86-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
277 ; X86-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
278 ; X86-NEXT: vpgatherqq %xmm0, (,%xmm2), %xmm1
279 ; X86-NEXT: vmovdqa %xmm1, %xmm0
833280 ; X86-NEXT: retl
834281 ;
835282 ; X64-LABEL: masked_gather_v2i64:
836283 ; X64: # BB#0: # %entry
837 ; X64-NEXT: vmovdqa (%rdi), %xmm3
838 ; X64-NEXT: vpextrb $0, %xmm0, %eax
839 ; X64-NEXT: testb $1, %al
840 ; X64-NEXT: # implicit-def: %XMM2
841 ; X64-NEXT: je .LBB8_2
842 ; X64-NEXT: # BB#1: # %cond.load
843 ; X64-NEXT: vmovq %xmm3, %rax
844 ; X64-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
845 ; X64-NEXT: .LBB8_2: # %else
846 ; X64-NEXT: vpextrb $8, %xmm0, %eax
847 ; X64-NEXT: testb $1, %al
848 ; X64-NEXT: je .LBB8_4
849 ; X64-NEXT: # BB#3: # %cond.load1
850 ; X64-NEXT: vpextrq $1, %xmm3, %rax
851 ; X64-NEXT: vpinsrq $1, (%rax), %xmm2, %xmm2
852 ; X64-NEXT: .LBB8_4: # %else2
853 ; X64-NEXT: vpsllq $63, %xmm0, %xmm0
854 ; X64-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
284 ; X64-NEXT: vmovdqa (%rdi), %xmm2
285 ; X64-NEXT: vpgatherqq %xmm0, (,%xmm2), %xmm1
286 ; X64-NEXT: vmovdqa %xmm1, %xmm0
855287 ; X64-NEXT: retq
856288 entry:
857289 %ld = load <2 x i64*>, <2 x i64*>* %ptr
865297 ; X86-LABEL: masked_gather_v2double:
866298 ; X86: # BB#0: # %entry
867299 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
868 ; X86-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero
869 ; X86-NEXT: vpextrb $0, %xmm0, %eax
870 ; X86-NEXT: testb $1, %al
871 ; X86-NEXT: # implicit-def: %XMM2
872 ; X86-NEXT: je .LBB9_2
873 ; X86-NEXT: # BB#1: # %cond.load
874 ; X86-NEXT: vmovd %xmm3, %eax
875 ; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
876 ; X86-NEXT: .LBB9_2: # %else
877 ; X86-NEXT: vpextrb $8, %xmm0, %eax
878 ; X86-NEXT: testb $1, %al
879 ; X86-NEXT: je .LBB9_4
880 ; X86-NEXT: # BB#3: # %cond.load1
881 ; X86-NEXT: vpextrd $2, %xmm3, %eax
882 ; X86-NEXT: vmovhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
883 ; X86-NEXT: .LBB9_4: # %else2
884 ; X86-NEXT: vpsllq $63, %xmm0, %xmm0
885 ; X86-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
300 ; X86-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
301 ; X86-NEXT: vgatherqpd %xmm0, (,%xmm2), %xmm1
302 ; X86-NEXT: vmovapd %xmm1, %xmm0
886303 ; X86-NEXT: retl
887304 ;
888305 ; X64-LABEL: masked_gather_v2double:
889306 ; X64: # BB#0: # %entry
890 ; X64-NEXT: vmovdqa (%rdi), %xmm3
891 ; X64-NEXT: vpextrb $0, %xmm0, %eax
892 ; X64-NEXT: testb $1, %al
893 ; X64-NEXT: # implicit-def: %XMM2
894 ; X64-NEXT: je .LBB9_2
895 ; X64-NEXT: # BB#1: # %cond.load
896 ; X64-NEXT: vmovq %xmm3, %rax
897 ; X64-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
898 ; X64-NEXT: .LBB9_2: # %else
899 ; X64-NEXT: vpextrb $8, %xmm0, %eax
900 ; X64-NEXT: testb $1, %al
901 ; X64-NEXT: je .LBB9_4
902 ; X64-NEXT: # BB#3: # %cond.load1
903 ; X64-NEXT: vpextrq $1, %xmm3, %rax
904 ; X64-NEXT: vmovhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
905 ; X64-NEXT: .LBB9_4: # %else2
906 ; X64-NEXT: vpsllq $63, %xmm0, %xmm0
907 ; X64-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
307 ; X64-NEXT: vmovapd (%rdi), %xmm2
308 ; X64-NEXT: vgatherqpd %xmm0, (,%xmm2), %xmm1
309 ; X64-NEXT: vmovapd %xmm1, %xmm0
908310 ; X64-NEXT: retq
909311 entry:
910312 %ld = load <2 x double*>, <2 x double*>* %ptr