llvm.org GIT mirror llvm / 3c0783d
[X86][MMX] Support MMX build vectors to avoid SSE usage (PR29222) 64-bit MMX vector generation usually ends up lowering into SSE instructions before being spilled/reloaded as a MMX type. This patch creates a MMX vector from MMX source values, taking the lowest element from each source and constructing broadcasts/build_vectors with direct calls to the MMX PUNPCKL/PSHUFW intrinsics. We're missing a few consecutive load combines that could be handled in a future patch if that would be useful - my main interest here is just avoiding a lot of the MMX/SSE crossover. Differential Revision: https://reviews.llvm.org/D43618 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@327247 91177308-0d34-0410-b5e6-96231b3b80d8 Simon Pilgrim 2 years ago
5 changed file(s) with 658 addition(s) and 1662 deletion(s). Raw diff Collapse all Expand all
3099030990 return SDValue();
3099130991 }
3099230992
30993 static SDValue createMMXBuildVector(SDValue N, SelectionDAG &DAG,
30994 const X86Subtarget &Subtarget) {
30995 SDLoc DL(N);
30996 unsigned NumElts = N.getNumOperands();
30997
30998 auto *BV = cast(N);
30999 SDValue Splat = BV->getSplatValue();
31000
31001 // Build MMX element from integer GPR or SSE float values.
31002 auto CreateMMXElement = [&](SDValue V) {
31003 if (V.isUndef())
31004 return DAG.getUNDEF(MVT::x86mmx);
31005 if (V.getValueType().isFloatingPoint()) {
31006 if (Subtarget.hasSSE1() && !isa(V)) {
31007 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
31008 V = DAG.getBitcast(MVT::v2i64, V);
31009 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
31010 }
31011 V = DAG.getBitcast(MVT::i32, V);
31012 } else {
31013 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
31014 }
31015 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
31016 };
31017
31018 // Convert build vector ops to MMX data in the bottom elements.
31019 SmallVector Ops;
31020
31021 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
31022 if (Splat) {
31023 if (Splat.isUndef())
31024 return DAG.getUNDEF(MVT::x86mmx);
31025
31026 Splat = CreateMMXElement(Splat);
31027
31028 if (Subtarget.hasSSE1()) {
31029 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
31030 if (NumElts == 8)
31031 Splat = DAG.getNode(
31032 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
31033 DAG.getConstant(Intrinsic::x86_mmx_punpcklbw, DL, MVT::i32), Splat,
31034 Splat);
31035
31036 // Use PSHUFW to repeat 16-bit elements.
31037 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
31038 return DAG.getNode(
31039 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
31040 DAG.getConstant(Intrinsic::x86_sse_pshuf_w, DL, MVT::i32), Splat,
31041 DAG.getConstant(ShufMask, DL, MVT::i8));
31042 }
31043 Ops.append(NumElts, Splat);
31044 } else {
31045 for (unsigned i = 0; i != NumElts; ++i)
31046 Ops.push_back(CreateMMXElement(N.getOperand(i)));
31047 }
31048
31049 // Use tree of PUNPCKLs to build up general MMX vector.
31050 while (Ops.size() > 1) {
31051 unsigned NumOps = Ops.size();
31052 unsigned IntrinOp =
31053 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
31054 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
31055 : Intrinsic::x86_mmx_punpcklbw));
31056 SDValue Intrin = DAG.getConstant(IntrinOp, DL, MVT::i32);
31057 for (unsigned i = 0; i != NumOps; i += 2)
31058 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
31059 Ops[i], Ops[i + 1]);
31060 Ops.resize(NumOps / 2);
31061 }
31062
31063 return Ops[0];
31064 }
31065
3099331066 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
3099431067 TargetLowering::DAGCombinerInfo &DCI,
3099531068 const X86Subtarget &Subtarget) {
3106931142 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
3107031143 }
3107131144 }
31145
31146 // Detect bitcasts of 64-bit build vectors and convert to a
31147 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
31148 // lowest element.
31149 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
31150 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
31151 SrcVT == MVT::v8i8))
31152 return createMMXBuildVector(N0, DAG, Subtarget);
3107231153
3107331154 // Detect bitcasts between element or subvector extraction to x86mmx.
3107431155 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
3434 ; X86-NEXT: pushl %ebp
3535 ; X86-NEXT: movl %esp, %ebp
3636 ; X86-NEXT: andl $-8, %esp
37 ; X86-NEXT: subl $16, %esp
38 ; X86-NEXT: flds 12(%ebp)
39 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
40 ; X86-NEXT: flds 8(%ebp)
41 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
42 ; X86-NEXT: pf2id {{[0-9]+}}(%esp), %mm0
37 ; X86-NEXT: subl $8, %esp
38 ; X86-NEXT: movd 12(%ebp), %mm0
39 ; X86-NEXT: movd 8(%ebp), %mm1
40 ; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
41 ; X86-NEXT: pf2id %mm1, %mm0
4342 ; X86-NEXT: movq %mm0, (%esp)
4443 ; X86-NEXT: movl (%esp), %eax
4544 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
7069 ; X86-NEXT: pushl %ebp
7170 ; X86-NEXT: movl %esp, %ebp
7271 ; X86-NEXT: andl $-8, %esp
73 ; X86-NEXT: subl $24, %esp
74 ; X86-NEXT: flds 12(%ebp)
75 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
76 ; X86-NEXT: flds 8(%ebp)
77 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
78 ; X86-NEXT: flds 20(%ebp)
79 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
80 ; X86-NEXT: flds 16(%ebp)
81 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
82 ; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
83 ; X86-NEXT: pfacc {{[0-9]+}}(%esp), %mm0
84 ; X86-NEXT: movq %mm0, (%esp)
72 ; X86-NEXT: subl $8, %esp
73 ; X86-NEXT: movd 20(%ebp), %mm0
74 ; X86-NEXT: movd 16(%ebp), %mm1
75 ; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
76 ; X86-NEXT: movd 12(%ebp), %mm0
77 ; X86-NEXT: movd 8(%ebp), %mm2
78 ; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
79 ; X86-NEXT: pfacc %mm1, %mm2
80 ; X86-NEXT: movq %mm2, (%esp)
8581 ; X86-NEXT: flds {{[0-9]+}}(%esp)
8682 ; X86-NEXT: flds (%esp)
8783 ; X86-NEXT: movl %ebp, %esp
112108 ; X86-NEXT: pushl %ebp
113109 ; X86-NEXT: movl %esp, %ebp
114110 ; X86-NEXT: andl $-8, %esp
115 ; X86-NEXT: subl $24, %esp
116 ; X86-NEXT: flds 12(%ebp)
117 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
118 ; X86-NEXT: flds 8(%ebp)
119 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
120 ; X86-NEXT: flds 20(%ebp)
121 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
122 ; X86-NEXT: flds 16(%ebp)
123 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
124 ; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
125 ; X86-NEXT: pfadd {{[0-9]+}}(%esp), %mm0
126 ; X86-NEXT: movq %mm0, (%esp)
111 ; X86-NEXT: subl $8, %esp
112 ; X86-NEXT: movd 20(%ebp), %mm0
113 ; X86-NEXT: movd 16(%ebp), %mm1
114 ; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
115 ; X86-NEXT: movd 12(%ebp), %mm0
116 ; X86-NEXT: movd 8(%ebp), %mm2
117 ; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
118 ; X86-NEXT: pfadd %mm1, %mm2
119 ; X86-NEXT: movq %mm2, (%esp)
127120 ; X86-NEXT: flds {{[0-9]+}}(%esp)
128121 ; X86-NEXT: flds (%esp)
129122 ; X86-NEXT: movl %ebp, %esp
154147 ; X86-NEXT: pushl %ebp
155148 ; X86-NEXT: movl %esp, %ebp
156149 ; X86-NEXT: andl $-8, %esp
157 ; X86-NEXT: subl $24, %esp
158 ; X86-NEXT: flds 12(%ebp)
159 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
160 ; X86-NEXT: flds 8(%ebp)
161 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
162 ; X86-NEXT: flds 20(%ebp)
163 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
164 ; X86-NEXT: flds 16(%ebp)
165 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
166 ; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
167 ; X86-NEXT: pfcmpeq {{[0-9]+}}(%esp), %mm0
168 ; X86-NEXT: movq %mm0, (%esp)
150 ; X86-NEXT: subl $8, %esp
151 ; X86-NEXT: movd 20(%ebp), %mm0
152 ; X86-NEXT: movd 16(%ebp), %mm1
153 ; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
154 ; X86-NEXT: movd 12(%ebp), %mm0
155 ; X86-NEXT: movd 8(%ebp), %mm2
156 ; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
157 ; X86-NEXT: pfcmpeq %mm1, %mm2
158 ; X86-NEXT: movq %mm2, (%esp)
169159 ; X86-NEXT: movl (%esp), %eax
170160 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
171161 ; X86-NEXT: movl %ebp, %esp
197187 ; X86-NEXT: pushl %ebp
198188 ; X86-NEXT: movl %esp, %ebp
199189 ; X86-NEXT: andl $-8, %esp
200 ; X86-NEXT: subl $24, %esp
201 ; X86-NEXT: flds 12(%ebp)
202 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
203 ; X86-NEXT: flds 8(%ebp)
204 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
205 ; X86-NEXT: flds 20(%ebp)
206 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
207 ; X86-NEXT: flds 16(%ebp)
208 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
209 ; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
210 ; X86-NEXT: pfcmpge {{[0-9]+}}(%esp), %mm0
211 ; X86-NEXT: movq %mm0, (%esp)
190 ; X86-NEXT: subl $8, %esp
191 ; X86-NEXT: movd 20(%ebp), %mm0
192 ; X86-NEXT: movd 16(%ebp), %mm1
193 ; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
194 ; X86-NEXT: movd 12(%ebp), %mm0
195 ; X86-NEXT: movd 8(%ebp), %mm2
196 ; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
197 ; X86-NEXT: pfcmpge %mm1, %mm2
198 ; X86-NEXT: movq %mm2, (%esp)
212199 ; X86-NEXT: movl (%esp), %eax
213200 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
214201 ; X86-NEXT: movl %ebp, %esp
240227 ; X86-NEXT: pushl %ebp
241228 ; X86-NEXT: movl %esp, %ebp
242229 ; X86-NEXT: andl $-8, %esp
243 ; X86-NEXT: subl $24, %esp
244 ; X86-NEXT: flds 12(%ebp)
245 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
246 ; X86-NEXT: flds 8(%ebp)
247 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
248 ; X86-NEXT: flds 20(%ebp)
249 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
250 ; X86-NEXT: flds 16(%ebp)
251 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
252 ; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
253 ; X86-NEXT: pfcmpgt {{[0-9]+}}(%esp), %mm0
254 ; X86-NEXT: movq %mm0, (%esp)
230 ; X86-NEXT: subl $8, %esp
231 ; X86-NEXT: movd 20(%ebp), %mm0
232 ; X86-NEXT: movd 16(%ebp), %mm1
233 ; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
234 ; X86-NEXT: movd 12(%ebp), %mm0
235 ; X86-NEXT: movd 8(%ebp), %mm2
236 ; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
237 ; X86-NEXT: pfcmpgt %mm1, %mm2
238 ; X86-NEXT: movq %mm2, (%esp)
255239 ; X86-NEXT: movl (%esp), %eax
256240 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
257241 ; X86-NEXT: movl %ebp, %esp
283267 ; X86-NEXT: pushl %ebp
284268 ; X86-NEXT: movl %esp, %ebp
285269 ; X86-NEXT: andl $-8, %esp
286 ; X86-NEXT: subl $24, %esp
287 ; X86-NEXT: flds 12(%ebp)
288 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
289 ; X86-NEXT: flds 8(%ebp)
290 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
291 ; X86-NEXT: flds 20(%ebp)
292 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
293 ; X86-NEXT: flds 16(%ebp)
294 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
295 ; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
296 ; X86-NEXT: pfmax {{[0-9]+}}(%esp), %mm0
297 ; X86-NEXT: movq %mm0, (%esp)
270 ; X86-NEXT: subl $8, %esp
271 ; X86-NEXT: movd 20(%ebp), %mm0
272 ; X86-NEXT: movd 16(%ebp), %mm1
273 ; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
274 ; X86-NEXT: movd 12(%ebp), %mm0
275 ; X86-NEXT: movd 8(%ebp), %mm2
276 ; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
277 ; X86-NEXT: pfmax %mm1, %mm2
278 ; X86-NEXT: movq %mm2, (%esp)
298279 ; X86-NEXT: flds {{[0-9]+}}(%esp)
299280 ; X86-NEXT: flds (%esp)
300281 ; X86-NEXT: movl %ebp, %esp
325306 ; X86-NEXT: pushl %ebp
326307 ; X86-NEXT: movl %esp, %ebp
327308 ; X86-NEXT: andl $-8, %esp
328 ; X86-NEXT: subl $24, %esp
329 ; X86-NEXT: flds 12(%ebp)
330 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
331 ; X86-NEXT: flds 8(%ebp)
332 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
333 ; X86-NEXT: flds 20(%ebp)
334 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
335 ; X86-NEXT: flds 16(%ebp)
336 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
337 ; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
338 ; X86-NEXT: pfmin {{[0-9]+}}(%esp), %mm0
339 ; X86-NEXT: movq %mm0, (%esp)
309 ; X86-NEXT: subl $8, %esp
310 ; X86-NEXT: movd 20(%ebp), %mm0
311 ; X86-NEXT: movd 16(%ebp), %mm1
312 ; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
313 ; X86-NEXT: movd 12(%ebp), %mm0
314 ; X86-NEXT: movd 8(%ebp), %mm2
315 ; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
316 ; X86-NEXT: pfmin %mm1, %mm2
317 ; X86-NEXT: movq %mm2, (%esp)
340318 ; X86-NEXT: flds {{[0-9]+}}(%esp)
341319 ; X86-NEXT: flds (%esp)
342320 ; X86-NEXT: movl %ebp, %esp
367345 ; X86-NEXT: pushl %ebp
368346 ; X86-NEXT: movl %esp, %ebp
369347 ; X86-NEXT: andl $-8, %esp
370 ; X86-NEXT: subl $24, %esp
371 ; X86-NEXT: flds 12(%ebp)
372 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
373 ; X86-NEXT: flds 8(%ebp)
374 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
375 ; X86-NEXT: flds 20(%ebp)
376 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
377 ; X86-NEXT: flds 16(%ebp)
378 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
379 ; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
380 ; X86-NEXT: pfmul {{[0-9]+}}(%esp), %mm0
381 ; X86-NEXT: movq %mm0, (%esp)
348 ; X86-NEXT: subl $8, %esp
349 ; X86-NEXT: movd 20(%ebp), %mm0
350 ; X86-NEXT: movd 16(%ebp), %mm1
351 ; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
352 ; X86-NEXT: movd 12(%ebp), %mm0
353 ; X86-NEXT: movd 8(%ebp), %mm2
354 ; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
355 ; X86-NEXT: pfmul %mm1, %mm2
356 ; X86-NEXT: movq %mm2, (%esp)
382357 ; X86-NEXT: flds {{[0-9]+}}(%esp)
383358 ; X86-NEXT: flds (%esp)
384359 ; X86-NEXT: movl %ebp, %esp
409384 ; X86-NEXT: pushl %ebp
410385 ; X86-NEXT: movl %esp, %ebp
411386 ; X86-NEXT: andl $-8, %esp
412 ; X86-NEXT: subl $16, %esp
413 ; X86-NEXT: flds 12(%ebp)
414 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
415 ; X86-NEXT: flds 8(%ebp)
416 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
417 ; X86-NEXT: pfrcp {{[0-9]+}}(%esp), %mm0
387 ; X86-NEXT: subl $8, %esp
388 ; X86-NEXT: movd 12(%ebp), %mm0
389 ; X86-NEXT: movd 8(%ebp), %mm1
390 ; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
391 ; X86-NEXT: pfrcp %mm1, %mm0
418392 ; X86-NEXT: movq %mm0, (%esp)
419393 ; X86-NEXT: flds {{[0-9]+}}(%esp)
420394 ; X86-NEXT: flds (%esp)
444418 ; X86-NEXT: pushl %ebp
445419 ; X86-NEXT: movl %esp, %ebp
446420 ; X86-NEXT: andl $-8, %esp
447 ; X86-NEXT: subl $24, %esp
448 ; X86-NEXT: flds 12(%ebp)
449 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
450 ; X86-NEXT: flds 8(%ebp)
451 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
452 ; X86-NEXT: flds 20(%ebp)
453 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
454 ; X86-NEXT: flds 16(%ebp)
455 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
456 ; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
457 ; X86-NEXT: pfrcpit1 {{[0-9]+}}(%esp), %mm0
458 ; X86-NEXT: movq %mm0, (%esp)
421 ; X86-NEXT: subl $8, %esp
422 ; X86-NEXT: movd 20(%ebp), %mm0
423 ; X86-NEXT: movd 16(%ebp), %mm1
424 ; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
425 ; X86-NEXT: movd 12(%ebp), %mm0
426 ; X86-NEXT: movd 8(%ebp), %mm2
427 ; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
428 ; X86-NEXT: pfrcpit1 %mm1, %mm2
429 ; X86-NEXT: movq %mm2, (%esp)
459430 ; X86-NEXT: flds {{[0-9]+}}(%esp)
460431 ; X86-NEXT: flds (%esp)
461432 ; X86-NEXT: movl %ebp, %esp
486457 ; X86-NEXT: pushl %ebp
487458 ; X86-NEXT: movl %esp, %ebp
488459 ; X86-NEXT: andl $-8, %esp
489 ; X86-NEXT: subl $24, %esp
490 ; X86-NEXT: flds 12(%ebp)
491 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
492 ; X86-NEXT: flds 8(%ebp)
493 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
494 ; X86-NEXT: flds 20(%ebp)
495 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
496 ; X86-NEXT: flds 16(%ebp)
497 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
498 ; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
499 ; X86-NEXT: pfrcpit2 {{[0-9]+}}(%esp), %mm0
500 ; X86-NEXT: movq %mm0, (%esp)
460 ; X86-NEXT: subl $8, %esp
461 ; X86-NEXT: movd 20(%ebp), %mm0
462 ; X86-NEXT: movd 16(%ebp), %mm1
463 ; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
464 ; X86-NEXT: movd 12(%ebp), %mm0
465 ; X86-NEXT: movd 8(%ebp), %mm2
466 ; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
467 ; X86-NEXT: pfrcpit2 %mm1, %mm2
468 ; X86-NEXT: movq %mm2, (%esp)
501469 ; X86-NEXT: flds {{[0-9]+}}(%esp)
502470 ; X86-NEXT: flds (%esp)
503471 ; X86-NEXT: movl %ebp, %esp
528496 ; X86-NEXT: pushl %ebp
529497 ; X86-NEXT: movl %esp, %ebp
530498 ; X86-NEXT: andl $-8, %esp
531 ; X86-NEXT: subl $16, %esp
532 ; X86-NEXT: flds 12(%ebp)
533 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
534 ; X86-NEXT: flds 8(%ebp)
535 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
536 ; X86-NEXT: pfrsqrt {{[0-9]+}}(%esp), %mm0
499 ; X86-NEXT: subl $8, %esp
500 ; X86-NEXT: movd 12(%ebp), %mm0
501 ; X86-NEXT: movd 8(%ebp), %mm1
502 ; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
503 ; X86-NEXT: pfrsqrt %mm1, %mm0
537504 ; X86-NEXT: movq %mm0, (%esp)
538505 ; X86-NEXT: flds {{[0-9]+}}(%esp)
539506 ; X86-NEXT: flds (%esp)
563530 ; X86-NEXT: pushl %ebp
564531 ; X86-NEXT: movl %esp, %ebp
565532 ; X86-NEXT: andl $-8, %esp
566 ; X86-NEXT: subl $24, %esp
567 ; X86-NEXT: flds 12(%ebp)
568 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
569 ; X86-NEXT: flds 8(%ebp)
570 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
571 ; X86-NEXT: flds 20(%ebp)
572 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
573 ; X86-NEXT: flds 16(%ebp)
574 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
575 ; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
576 ; X86-NEXT: pfrsqit1 {{[0-9]+}}(%esp), %mm0
577 ; X86-NEXT: movq %mm0, (%esp)
533 ; X86-NEXT: subl $8, %esp
534 ; X86-NEXT: movd 20(%ebp), %mm0
535 ; X86-NEXT: movd 16(%ebp), %mm1
536 ; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
537 ; X86-NEXT: movd 12(%ebp), %mm0
538 ; X86-NEXT: movd 8(%ebp), %mm2
539 ; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
540 ; X86-NEXT: pfrsqit1 %mm1, %mm2
541 ; X86-NEXT: movq %mm2, (%esp)
578542 ; X86-NEXT: flds {{[0-9]+}}(%esp)
579543 ; X86-NEXT: flds (%esp)
580544 ; X86-NEXT: movl %ebp, %esp
605569 ; X86-NEXT: pushl %ebp
606570 ; X86-NEXT: movl %esp, %ebp
607571 ; X86-NEXT: andl $-8, %esp
608 ; X86-NEXT: subl $24, %esp
609 ; X86-NEXT: flds 12(%ebp)
610 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
611 ; X86-NEXT: flds 8(%ebp)
612 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
613 ; X86-NEXT: flds 20(%ebp)
614 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
615 ; X86-NEXT: flds 16(%ebp)
616 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
617 ; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
618 ; X86-NEXT: pfsub {{[0-9]+}}(%esp), %mm0
619 ; X86-NEXT: movq %mm0, (%esp)
572 ; X86-NEXT: subl $8, %esp
573 ; X86-NEXT: movd 20(%ebp), %mm0
574 ; X86-NEXT: movd 16(%ebp), %mm1
575 ; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
576 ; X86-NEXT: movd 12(%ebp), %mm0
577 ; X86-NEXT: movd 8(%ebp), %mm2
578 ; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
579 ; X86-NEXT: pfsub %mm1, %mm2
580 ; X86-NEXT: movq %mm2, (%esp)
620581 ; X86-NEXT: flds {{[0-9]+}}(%esp)
621582 ; X86-NEXT: flds (%esp)
622583 ; X86-NEXT: movl %ebp, %esp
647608 ; X86-NEXT: pushl %ebp
648609 ; X86-NEXT: movl %esp, %ebp
649610 ; X86-NEXT: andl $-8, %esp
650 ; X86-NEXT: subl $24, %esp
651 ; X86-NEXT: flds 12(%ebp)
652 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
653 ; X86-NEXT: flds 8(%ebp)
654 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
655 ; X86-NEXT: flds 20(%ebp)
656 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
657 ; X86-NEXT: flds 16(%ebp)
658 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
659 ; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
660 ; X86-NEXT: pfsubr {{[0-9]+}}(%esp), %mm0
661 ; X86-NEXT: movq %mm0, (%esp)
611 ; X86-NEXT: subl $8, %esp
612 ; X86-NEXT: movd 20(%ebp), %mm0
613 ; X86-NEXT: movd 16(%ebp), %mm1
614 ; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
615 ; X86-NEXT: movd 12(%ebp), %mm0
616 ; X86-NEXT: movd 8(%ebp), %mm2
617 ; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
618 ; X86-NEXT: pfsubr %mm1, %mm2
619 ; X86-NEXT: movq %mm2, (%esp)
662620 ; X86-NEXT: flds {{[0-9]+}}(%esp)
663621 ; X86-NEXT: flds (%esp)
664622 ; X86-NEXT: movl %ebp, %esp
747705 ; X86-NEXT: pushl %ebp
748706 ; X86-NEXT: movl %esp, %ebp
749707 ; X86-NEXT: andl $-8, %esp
750 ; X86-NEXT: subl $16, %esp
751 ; X86-NEXT: flds 12(%ebp)
752 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
753 ; X86-NEXT: flds 8(%ebp)
754 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
755 ; X86-NEXT: pf2iw {{[0-9]+}}(%esp), %mm0
708 ; X86-NEXT: subl $8, %esp
709 ; X86-NEXT: movd 12(%ebp), %mm0
710 ; X86-NEXT: movd 8(%ebp), %mm1
711 ; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
712 ; X86-NEXT: pf2iw %mm1, %mm0
756713 ; X86-NEXT: movq %mm0, (%esp)
757714 ; X86-NEXT: movl (%esp), %eax
758715 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
783740 ; X86-NEXT: pushl %ebp
784741 ; X86-NEXT: movl %esp, %ebp
785742 ; X86-NEXT: andl $-8, %esp
786 ; X86-NEXT: subl $24, %esp
787 ; X86-NEXT: flds 12(%ebp)
788 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
789 ; X86-NEXT: flds 8(%ebp)
790 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
791 ; X86-NEXT: flds 20(%ebp)
792 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
793 ; X86-NEXT: flds 16(%ebp)
794 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
795 ; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
796 ; X86-NEXT: pfnacc {{[0-9]+}}(%esp), %mm0
797 ; X86-NEXT: movq %mm0, (%esp)
743 ; X86-NEXT: subl $8, %esp
744 ; X86-NEXT: movd 20(%ebp), %mm0
745 ; X86-NEXT: movd 16(%ebp), %mm1
746 ; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
747 ; X86-NEXT: movd 12(%ebp), %mm0
748 ; X86-NEXT: movd 8(%ebp), %mm2
749 ; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
750 ; X86-NEXT: pfnacc %mm1, %mm2
751 ; X86-NEXT: movq %mm2, (%esp)
798752 ; X86-NEXT: flds {{[0-9]+}}(%esp)
799753 ; X86-NEXT: flds (%esp)
800754 ; X86-NEXT: movl %ebp, %esp
825779 ; X86-NEXT: pushl %ebp
826780 ; X86-NEXT: movl %esp, %ebp
827781 ; X86-NEXT: andl $-8, %esp
828 ; X86-NEXT: subl $24, %esp
829 ; X86-NEXT: flds 12(%ebp)
830 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
831 ; X86-NEXT: flds 8(%ebp)
832 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
833 ; X86-NEXT: flds 20(%ebp)
834 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
835 ; X86-NEXT: flds 16(%ebp)
836 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
837 ; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
838 ; X86-NEXT: pfpnacc {{[0-9]+}}(%esp), %mm0
839 ; X86-NEXT: movq %mm0, (%esp)
782 ; X86-NEXT: subl $8, %esp
783 ; X86-NEXT: movd 20(%ebp), %mm0
784 ; X86-NEXT: movd 16(%ebp), %mm1
785 ; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
786 ; X86-NEXT: movd 12(%ebp), %mm0
787 ; X86-NEXT: movd 8(%ebp), %mm2
788 ; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
789 ; X86-NEXT: pfpnacc %mm1, %mm2
790 ; X86-NEXT: movq %mm2, (%esp)
840791 ; X86-NEXT: flds {{[0-9]+}}(%esp)
841792 ; X86-NEXT: flds (%esp)
842793 ; X86-NEXT: movl %ebp, %esp
898849 ; X86-NEXT: pushl %ebp
899850 ; X86-NEXT: movl %esp, %ebp
900851 ; X86-NEXT: andl $-8, %esp
901 ; X86-NEXT: subl $16, %esp
902 ; X86-NEXT: flds 12(%ebp)
903 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
904 ; X86-NEXT: flds 8(%ebp)
905 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
906 ; X86-NEXT: pswapd {{[0-9]+}}(%esp), %mm0 # mm0 = mem[1,0]
852 ; X86-NEXT: subl $8, %esp
853 ; X86-NEXT: movd 12(%ebp), %mm0
854 ; X86-NEXT: movd 8(%ebp), %mm1
855 ; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
856 ; X86-NEXT: pswapd %mm1, %mm0 # mm0 = mm1[1,0]
907857 ; X86-NEXT: movq %mm0, (%esp)
908858 ; X86-NEXT: flds {{[0-9]+}}(%esp)
909859 ; X86-NEXT: flds (%esp)
931881 ; X86-NEXT: pushl %ebp
932882 ; X86-NEXT: movl %esp, %ebp
933883 ; X86-NEXT: andl $-8, %esp
934 ; X86-NEXT: subl $16, %esp
935 ; X86-NEXT: movl 12(%ebp), %eax
936 ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
937 ; X86-NEXT: movl 8(%ebp), %eax
938 ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
939 ; X86-NEXT: pswapd {{[0-9]+}}(%esp), %mm0 # mm0 = mem[1,0]
884 ; X86-NEXT: subl $8, %esp
885 ; X86-NEXT: movd 12(%ebp), %mm0
886 ; X86-NEXT: movd 8(%ebp), %mm1
887 ; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
888 ; X86-NEXT: pswapd %mm1, %mm0 # mm0 = mm1[1,0]
940889 ; X86-NEXT: movq %mm0, (%esp)
941890 ; X86-NEXT: movl (%esp), %eax
942891 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
1414 ;
1515
1616 define void @build_v2i32_01(x86_mmx *%p0, i32 %a0, i32 %a1) nounwind {
17 ; X86-MMX-LABEL: build_v2i32_01:
18 ; X86-MMX: # %bb.0:
19 ; X86-MMX-NEXT: pushl %ebp
20 ; X86-MMX-NEXT: movl %esp, %ebp
21 ; X86-MMX-NEXT: andl $-8, %esp
22 ; X86-MMX-NEXT: subl $8, %esp
23 ; X86-MMX-NEXT: movl 8(%ebp), %eax
24 ; X86-MMX-NEXT: movl 12(%ebp), %ecx
25 ; X86-MMX-NEXT: movl 16(%ebp), %edx
26 ; X86-MMX-NEXT: movl %edx, {{[0-9]+}}(%esp)
27 ; X86-MMX-NEXT: movl %ecx, (%esp)
28 ; X86-MMX-NEXT: movq (%esp), %mm0
29 ; X86-MMX-NEXT: paddd %mm0, %mm0
30 ; X86-MMX-NEXT: movq %mm0, (%eax)
31 ; X86-MMX-NEXT: movl %ebp, %esp
32 ; X86-MMX-NEXT: popl %ebp
33 ; X86-MMX-NEXT: retl
34 ;
35 ; X86-SSE-LABEL: build_v2i32_01:
36 ; X86-SSE: # %bb.0:
37 ; X86-SSE-NEXT: pushl %ebp
38 ; X86-SSE-NEXT: movl %esp, %ebp
39 ; X86-SSE-NEXT: andl $-8, %esp
40 ; X86-SSE-NEXT: subl $8, %esp
41 ; X86-SSE-NEXT: movl 8(%ebp), %eax
42 ; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
43 ; X86-SSE-NEXT: movlps %xmm0, (%esp)
44 ; X86-SSE-NEXT: movq (%esp), %mm0
45 ; X86-SSE-NEXT: paddd %mm0, %mm0
46 ; X86-SSE-NEXT: movq %mm0, (%eax)
47 ; X86-SSE-NEXT: movl %ebp, %esp
48 ; X86-SSE-NEXT: popl %ebp
49 ; X86-SSE-NEXT: retl
50 ;
51 ; X64-SSE-LABEL: build_v2i32_01:
52 ; X64-SSE: # %bb.0:
53 ; X64-SSE-NEXT: movd %edx, %xmm0
54 ; X64-SSE-NEXT: movd %esi, %xmm1
55 ; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
56 ; X64-SSE-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
57 ; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
58 ; X64-SSE-NEXT: paddd %mm0, %mm0
59 ; X64-SSE-NEXT: movq %mm0, (%rdi)
60 ; X64-SSE-NEXT: retq
61 ;
62 ; X64-AVX-LABEL: build_v2i32_01:
63 ; X64-AVX: # %bb.0:
64 ; X64-AVX-NEXT: vmovd %esi, %xmm0
65 ; X64-AVX-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0
66 ; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
67 ; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
68 ; X64-AVX-NEXT: paddd %mm0, %mm0
69 ; X64-AVX-NEXT: movq %mm0, (%rdi)
70 ; X64-AVX-NEXT: retq
17 ; X86-LABEL: build_v2i32_01:
18 ; X86: # %bb.0:
19 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
20 ; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
21 ; X86-NEXT: movd {{[0-9]+}}(%esp), %mm1
22 ; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
23 ; X86-NEXT: paddd %mm1, %mm1
24 ; X86-NEXT: movq %mm1, (%eax)
25 ; X86-NEXT: retl
26 ;
27 ; X64-LABEL: build_v2i32_01:
28 ; X64: # %bb.0:
29 ; X64-NEXT: movd %edx, %mm0
30 ; X64-NEXT: movd %esi, %mm1
31 ; X64-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
32 ; X64-NEXT: paddd %mm1, %mm1
33 ; X64-NEXT: movq %mm1, (%rdi)
34 ; X64-NEXT: retq
7135 %1 = insertelement <2 x i32> undef, i32 %a0, i32 0
7236 %2 = insertelement <2 x i32> %1, i32 %a1, i32 1
7337 %3 = bitcast <2 x i32> %2 to x86_mmx
10266 define void @build_v2i32_u1(x86_mmx *%p0, i32 %a0, i32 %a1) nounwind {
10367 ; X86-MMX-LABEL: build_v2i32_u1:
10468 ; X86-MMX: # %bb.0:
105 ; X86-MMX-NEXT: pushl %ebp
106 ; X86-MMX-NEXT: movl %esp, %ebp
107 ; X86-MMX-NEXT: andl $-8, %esp
108 ; X86-MMX-NEXT: subl $8, %esp
109 ; X86-MMX-NEXT: movl 8(%ebp), %eax
110 ; X86-MMX-NEXT: movl 16(%ebp), %ecx
111 ; X86-MMX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
112 ; X86-MMX-NEXT: movq (%esp), %mm0
69 ; X86-MMX-NEXT: movl {{[0-9]+}}(%esp), %eax
70 ; X86-MMX-NEXT: movd {{[0-9]+}}(%esp), %mm0
71 ; X86-MMX-NEXT: punpckldq %mm0, %mm0 # mm0 = mm0[0,0]
11372 ; X86-MMX-NEXT: paddd %mm0, %mm0
11473 ; X86-MMX-NEXT: movq %mm0, (%eax)
115 ; X86-MMX-NEXT: movl %ebp, %esp
116 ; X86-MMX-NEXT: popl %ebp
11774 ; X86-MMX-NEXT: retl
11875 ;
11976 ; X86-SSE-LABEL: build_v2i32_u1:
12077 ; X86-SSE: # %bb.0:
121 ; X86-SSE-NEXT: pushl %ebp
122 ; X86-SSE-NEXT: movl %esp, %ebp
123 ; X86-SSE-NEXT: andl $-8, %esp
124 ; X86-SSE-NEXT: subl $8, %esp
125 ; X86-SSE-NEXT: movl 8(%ebp), %eax
126 ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
127 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
128 ; X86-SSE-NEXT: movq %xmm0, (%esp)
129 ; X86-SSE-NEXT: movq (%esp), %mm0
78 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
79 ; X86-SSE-NEXT: movd {{[0-9]+}}(%esp), %mm0
80 ; X86-SSE-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
13081 ; X86-SSE-NEXT: paddd %mm0, %mm0
13182 ; X86-SSE-NEXT: movq %mm0, (%eax)
132 ; X86-SSE-NEXT: movl %ebp, %esp
133 ; X86-SSE-NEXT: popl %ebp
13483 ; X86-SSE-NEXT: retl
13584 ;
136 ; X64-SSE-LABEL: build_v2i32_u1:
137 ; X64-SSE: # %bb.0:
138 ; X64-SSE-NEXT: movd %edx, %xmm0
139 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
140 ; X64-SSE-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
141 ; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
142 ; X64-SSE-NEXT: paddd %mm0, %mm0
143 ; X64-SSE-NEXT: movq %mm0, (%rdi)
144 ; X64-SSE-NEXT: retq
145 ;
146 ; X64-AVX1-LABEL: build_v2i32_u1:
147 ; X64-AVX1: # %bb.0:
148 ; X64-AVX1-NEXT: vmovd %edx, %xmm0
149 ; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
150 ; X64-AVX1-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
151 ; X64-AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
152 ; X64-AVX1-NEXT: paddd %mm0, %mm0
153 ; X64-AVX1-NEXT: movq %mm0, (%rdi)
154 ; X64-AVX1-NEXT: retq
155 ;
156 ; X64-AVX2-LABEL: build_v2i32_u1:
157 ; X64-AVX2: # %bb.0:
158 ; X64-AVX2-NEXT: vmovd %edx, %xmm0
159 ; X64-AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
160 ; X64-AVX2-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
161 ; X64-AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
162 ; X64-AVX2-NEXT: paddd %mm0, %mm0
163 ; X64-AVX2-NEXT: movq %mm0, (%rdi)
164 ; X64-AVX2-NEXT: retq
165 ;
166 ; X64-AVX512-LABEL: build_v2i32_u1:
167 ; X64-AVX512: # %bb.0:
168 ; X64-AVX512-NEXT: vmovd %edx, %xmm0
169 ; X64-AVX512-NEXT: vpbroadcastd %xmm0, %xmm0
170 ; X64-AVX512-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
171 ; X64-AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
172 ; X64-AVX512-NEXT: paddd %mm0, %mm0
173 ; X64-AVX512-NEXT: movq %mm0, (%rdi)
174 ; X64-AVX512-NEXT: retq
85 ; X64-LABEL: build_v2i32_u1:
86 ; X64: # %bb.0:
87 ; X64-NEXT: movd %edx, %mm0
88 ; X64-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
89 ; X64-NEXT: paddd %mm0, %mm0
90 ; X64-NEXT: movq %mm0, (%rdi)
91 ; X64-NEXT: retq
17592 %1 = insertelement <2 x i32> undef, i32 undef, i32 0
17693 %2 = insertelement <2 x i32> %1, i32 %a1, i32 1
17794 %3 = bitcast <2 x i32> %2 to x86_mmx
18198 }
18299
183100 define void @build_v2i32_z1(x86_mmx *%p0, i32 %a0, i32 %a1) nounwind {
184 ; X86-MMX-LABEL: build_v2i32_z1:
185 ; X86-MMX: # %bb.0:
186 ; X86-MMX-NEXT: pushl %ebp
187 ; X86-MMX-NEXT: movl %esp, %ebp
188 ; X86-MMX-NEXT: andl $-8, %esp
189 ; X86-MMX-NEXT: subl $8, %esp
190 ; X86-MMX-NEXT: movl 8(%ebp), %eax
191 ; X86-MMX-NEXT: movl 16(%ebp), %ecx
192 ; X86-MMX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
193 ; X86-MMX-NEXT: movl $0, (%esp)
194 ; X86-MMX-NEXT: movq (%esp), %mm0
195 ; X86-MMX-NEXT: paddd %mm0, %mm0
196 ; X86-MMX-NEXT: movq %mm0, (%eax)
197 ; X86-MMX-NEXT: movl %ebp, %esp
198 ; X86-MMX-NEXT: popl %ebp
199 ; X86-MMX-NEXT: retl
200 ;
201 ; X86-SSE-LABEL: build_v2i32_z1:
202 ; X86-SSE: # %bb.0:
203 ; X86-SSE-NEXT: pushl %ebp
204 ; X86-SSE-NEXT: movl %esp, %ebp
205 ; X86-SSE-NEXT: andl $-8, %esp
206 ; X86-SSE-NEXT: subl $8, %esp
207 ; X86-SSE-NEXT: movl 8(%ebp), %eax
208 ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
209 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
210 ; X86-SSE-NEXT: movq %xmm0, (%esp)
211 ; X86-SSE-NEXT: movq (%esp), %mm0
212 ; X86-SSE-NEXT: paddd %mm0, %mm0
213 ; X86-SSE-NEXT: movq %mm0, (%eax)
214 ; X86-SSE-NEXT: movl %ebp, %esp
215 ; X86-SSE-NEXT: popl %ebp
216 ; X86-SSE-NEXT: retl
217 ;
218 ; X64-SSE-LABEL: build_v2i32_z1:
219 ; X64-SSE: # %bb.0:
220 ; X64-SSE-NEXT: # kill: def $edx killed $edx def $rdx
221 ; X64-SSE-NEXT: movq %rdx, %xmm0
222 ; X64-SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
223 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
224 ; X64-SSE-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
225 ; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
226 ; X64-SSE-NEXT: paddd %mm0, %mm0
227 ; X64-SSE-NEXT: movq %mm0, (%rdi)
228 ; X64-SSE-NEXT: retq
229 ;
230 ; X64-AVX-LABEL: build_v2i32_z1:
231 ; X64-AVX: # %bb.0:
232 ; X64-AVX-NEXT: # kill: def $edx killed $edx def $rdx
233 ; X64-AVX-NEXT: vmovq %rdx, %xmm0
234 ; X64-AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
235 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
236 ; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
237 ; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
238 ; X64-AVX-NEXT: paddd %mm0, %mm0
239 ; X64-AVX-NEXT: movq %mm0, (%rdi)
240 ; X64-AVX-NEXT: retq
101 ; X86-LABEL: build_v2i32_z1:
102 ; X86: # %bb.0:
103 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
104 ; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
105 ; X86-NEXT: pxor %mm1, %mm1
106 ; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
107 ; X86-NEXT: paddd %mm1, %mm1
108 ; X86-NEXT: movq %mm1, (%eax)
109 ; X86-NEXT: retl
110 ;
111 ; X64-LABEL: build_v2i32_z1:
112 ; X64: # %bb.0:
113 ; X64-NEXT: movd %edx, %mm0
114 ; X64-NEXT: pxor %mm1, %mm1
115 ; X64-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
116 ; X64-NEXT: paddd %mm1, %mm1
117 ; X64-NEXT: movq %mm1, (%rdi)
118 ; X64-NEXT: retq
241119 %1 = insertelement <2 x i32> undef, i32 0, i32 0
242120 %2 = insertelement <2 x i32> %1, i32 %a1, i32 1
243121 %3 = bitcast <2 x i32> %2 to x86_mmx
249127 define void @build_v2i32_00(x86_mmx *%p0, i32 %a0, i32 %a1) nounwind {
250128 ; X86-MMX-LABEL: build_v2i32_00:
251129 ; X86-MMX: # %bb.0:
252 ; X86-MMX-NEXT: pushl %ebp
253 ; X86-MMX-NEXT: movl %esp, %ebp
254 ; X86-MMX-NEXT: andl $-8, %esp
255 ; X86-MMX-NEXT: subl $8, %esp
256 ; X86-MMX-NEXT: movl 8(%ebp), %eax
257 ; X86-MMX-NEXT: movl 12(%ebp), %ecx
258 ; X86-MMX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
259 ; X86-MMX-NEXT: movl %ecx, (%esp)
260 ; X86-MMX-NEXT: movq (%esp), %mm0
130 ; X86-MMX-NEXT: movl {{[0-9]+}}(%esp), %eax
131 ; X86-MMX-NEXT: movd {{[0-9]+}}(%esp), %mm0
132 ; X86-MMX-NEXT: punpckldq %mm0, %mm0 # mm0 = mm0[0,0]
261133 ; X86-MMX-NEXT: paddd %mm0, %mm0
262134 ; X86-MMX-NEXT: movq %mm0, (%eax)
263 ; X86-MMX-NEXT: movl %ebp, %esp
264 ; X86-MMX-NEXT: popl %ebp
265135 ; X86-MMX-NEXT: retl
266136 ;
267137 ; X86-SSE-LABEL: build_v2i32_00:
268138 ; X86-SSE: # %bb.0:
269 ; X86-SSE-NEXT: pushl %ebp
270 ; X86-SSE-NEXT: movl %esp, %ebp
271 ; X86-SSE-NEXT: andl $-8, %esp
272 ; X86-SSE-NEXT: subl $8, %esp
273 ; X86-SSE-NEXT: movl 8(%ebp), %eax
274 ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
275 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
276 ; X86-SSE-NEXT: movq %xmm0, (%esp)
277 ; X86-SSE-NEXT: movq (%esp), %mm0
139 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
140 ; X86-SSE-NEXT: movd {{[0-9]+}}(%esp), %mm0
141 ; X86-SSE-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
278142 ; X86-SSE-NEXT: paddd %mm0, %mm0
279143 ; X86-SSE-NEXT: movq %mm0, (%eax)
280 ; X86-SSE-NEXT: movl %ebp, %esp
281 ; X86-SSE-NEXT: popl %ebp
282144 ; X86-SSE-NEXT: retl
283145 ;
284 ; X64-SSE-LABEL: build_v2i32_00:
285 ; X64-SSE: # %bb.0:
286 ; X64-SSE-NEXT: movd %esi, %xmm0
287 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
288 ; X64-SSE-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
289 ; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
290 ; X64-SSE-NEXT: paddd %mm0, %mm0
291 ; X64-SSE-NEXT: movq %mm0, (%rdi)
292 ; X64-SSE-NEXT: retq
293 ;
294 ; X64-AVX1-LABEL: build_v2i32_00:
295 ; X64-AVX1: # %bb.0:
296 ; X64-AVX1-NEXT: vmovd %esi, %xmm0
297 ; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
298 ; X64-AVX1-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
299 ; X64-AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
300 ; X64-AVX1-NEXT: paddd %mm0, %mm0
301 ; X64-AVX1-NEXT: movq %mm0, (%rdi)
302 ; X64-AVX1-NEXT: retq
303 ;
304 ; X64-AVX2-LABEL: build_v2i32_00:
305 ; X64-AVX2: # %bb.0:
306 ; X64-AVX2-NEXT: vmovd %esi, %xmm0
307 ; X64-AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
308 ; X64-AVX2-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
309 ; X64-AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
310 ; X64-AVX2-NEXT: paddd %mm0, %mm0
311 ; X64-AVX2-NEXT: movq %mm0, (%rdi)
312 ; X64-AVX2-NEXT: retq
313 ;
314 ; X64-AVX512-LABEL: build_v2i32_00:
315 ; X64-AVX512: # %bb.0:
316 ; X64-AVX512-NEXT: vmovd %esi, %xmm0
317 ; X64-AVX512-NEXT: vpbroadcastd %xmm0, %xmm0
318 ; X64-AVX512-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
319 ; X64-AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
320 ; X64-AVX512-NEXT: paddd %mm0, %mm0
321 ; X64-AVX512-NEXT: movq %mm0, (%rdi)
322 ; X64-AVX512-NEXT: retq
146 ; X64-LABEL: build_v2i32_00:
147 ; X64: # %bb.0:
148 ; X64-NEXT: movd %esi, %mm0
149 ; X64-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
150 ; X64-NEXT: paddd %mm0, %mm0
151 ; X64-NEXT: movq %mm0, (%rdi)
152 ; X64-NEXT: retq
323153 %1 = insertelement <2 x i32> undef, i32 %a0, i32 0
324154 %2 = insertelement <2 x i32> %1, i32 %a0, i32 1
325155 %3 = bitcast <2 x i32> %2 to x86_mmx
333163 ;
334164
335165 define void @build_v4i16_0123(x86_mmx *%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwind {
336 ; X86-MMX-LABEL: build_v4i16_0123:
337 ; X86-MMX: # %bb.0:
338 ; X86-MMX-NEXT: pushl %ebp
339 ; X86-MMX-NEXT: movl %esp, %ebp
340 ; X86-MMX-NEXT: andl $-8, %esp
341 ; X86-MMX-NEXT: subl $8, %esp
342 ; X86-MMX-NEXT: movl 8(%ebp), %eax
343 ; X86-MMX-NEXT: movl 24(%ebp), %ecx
344 ; X86-MMX-NEXT: shll $16, %ecx
345 ; X86-MMX-NEXT: movzwl 20(%ebp), %edx
346 ; X86-MMX-NEXT: orl %ecx, %edx
347 ; X86-MMX-NEXT: movl %edx, {{[0-9]+}}(%esp)
348 ; X86-MMX-NEXT: movl 16(%ebp), %ecx
349 ; X86-MMX-NEXT: shll $16, %ecx
350 ; X86-MMX-NEXT: movzwl 12(%ebp), %edx
351 ; X86-MMX-NEXT: orl %ecx, %edx
352 ; X86-MMX-NEXT: movl %edx, (%esp)
353 ; X86-MMX-NEXT: movq (%esp), %mm0
354 ; X86-MMX-NEXT: paddd %mm0, %mm0
355 ; X86-MMX-NEXT: movq %mm0, (%eax)
356 ; X86-MMX-NEXT: movl %ebp, %esp
357 ; X86-MMX-NEXT: popl %ebp
358 ; X86-MMX-NEXT: retl
359 ;
360 ; X86-SSE-LABEL: build_v4i16_0123:
361 ; X86-SSE: # %bb.0:
362 ; X86-SSE-NEXT: pushl %ebp
363 ; X86-SSE-NEXT: movl %esp, %ebp
364 ; X86-SSE-NEXT: andl $-8, %esp
365 ; X86-SSE-NEXT: subl $8, %esp
366 ; X86-SSE-NEXT: movl 8(%ebp), %eax
367 ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
368 ; X86-SSE-NEXT: pinsrw $1, 16(%ebp), %xmm0
369 ; X86-SSE-NEXT: pinsrw $2, 20(%ebp), %xmm0
370 ; X86-SSE-NEXT: pinsrw $3, 24(%ebp), %xmm0
371 ; X86-SSE-NEXT: movq %xmm0, (%esp)
372 ; X86-SSE-NEXT: movq (%esp), %mm0
373 ; X86-SSE-NEXT: paddd %mm0, %mm0
374 ; X86-SSE-NEXT: movq %mm0, (%eax)
375 ; X86-SSE-NEXT: movl %ebp, %esp
376 ; X86-SSE-NEXT: popl %ebp
377 ; X86-SSE-NEXT: retl
378 ;
379 ; X64-SSE2-LABEL: build_v4i16_0123:
380 ; X64-SSE2: # %bb.0:
381 ; X64-SSE2-NEXT: movd %r8d, %xmm0
382 ; X64-SSE2-NEXT: movd %ecx, %xmm1
383 ; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
384 ; X64-SSE2-NEXT: movd %edx, %xmm0
385 ; X64-SSE2-NEXT: movd %esi, %xmm2
386 ; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
387 ; X64-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
388 ; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7]
389 ; X64-SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
390 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
391 ; X64-SSE2-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
392 ; X64-SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
393 ; X64-SSE2-NEXT: paddd %mm0, %mm0
394 ; X64-SSE2-NEXT: movq %mm0, (%rdi)
395 ; X64-SSE2-NEXT: retq
396 ;
397 ; X64-SSSE3-LABEL: build_v4i16_0123:
398 ; X64-SSSE3: # %bb.0:
399 ; X64-SSSE3-NEXT: movd %r8d, %xmm0
400 ; X64-SSSE3-NEXT: movd %ecx, %xmm1
401 ; X64-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
402 ; X64-SSSE3-NEXT: movd %edx, %xmm0
403 ; X64-SSSE3-NEXT: movd %esi, %xmm2
404 ; X64-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
405 ; X64-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
406 ; X64-SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
407 ; X64-SSSE3-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp)
408 ; X64-SSSE3-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
409 ; X64-SSSE3-NEXT: paddd %mm0, %mm0
410 ; X64-SSSE3-NEXT: movq %mm0, (%rdi)
411 ; X64-SSSE3-NEXT: retq
412 ;
413 ; X64-AVX-LABEL: build_v4i16_0123:
414 ; X64-AVX: # %bb.0:
415 ; X64-AVX-NEXT: vmovd %esi, %xmm0
416 ; X64-AVX-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0
417 ; X64-AVX-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
418 ; X64-AVX-NEXT: vpinsrd $3, %r8d, %xmm0, %xmm0
419 ; X64-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
420 ; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
421 ; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
422 ; X64-AVX-NEXT: paddd %mm0, %mm0
423 ; X64-AVX-NEXT: movq %mm0, (%rdi)
424 ; X64-AVX-NEXT: retq
166 ; X86-LABEL: build_v4i16_0123:
167 ; X86: # %bb.0:
168 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
169 ; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
170 ; X86-NEXT: movd {{[0-9]+}}(%esp), %mm1
171 ; X86-NEXT: punpcklwd %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1]
172 ; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
173 ; X86-NEXT: movd {{[0-9]+}}(%esp), %mm2
174 ; X86-NEXT: punpcklwd %mm0, %mm2 # mm2 = mm2[0],mm0[0],mm2[1],mm0[1]
175 ; X86-NEXT: punpckldq %mm1, %mm2 # mm2 = mm2[0],mm1[0]
176 ; X86-NEXT: paddd %mm2, %mm2
177 ; X86-NEXT: movq %mm2, (%eax)
178 ; X86-NEXT: retl
179 ;
180 ; X64-LABEL: build_v4i16_0123:
181 ; X64: # %bb.0:
182 ; X64-NEXT: movd %r8d, %mm0
183 ; X64-NEXT: movd %ecx, %mm1
184 ; X64-NEXT: punpcklwd %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1]
185 ; X64-NEXT: movd %edx, %mm0
186 ; X64-NEXT: movd %esi, %mm2
187 ; X64-NEXT: punpcklwd %mm0, %mm2 # mm2 = mm2[0],mm0[0],mm2[1],mm0[1]
188 ; X64-NEXT: punpckldq %mm1, %mm2 # mm2 = mm2[0],mm1[0]
189 ; X64-NEXT: paddd %mm2, %mm2
190 ; X64-NEXT: movq %mm2, (%rdi)
191 ; X64-NEXT: retq
425192 %1 = insertelement <4 x i16> undef, i16 %a0, i32 0
426193 %2 = insertelement <4 x i16> %1, i16 %a1, i32 1
427194 %3 = insertelement <4 x i16> %2, i16 %a2, i32 2
433200 }
434201
435202 define void @build_v4i16_01zz(x86_mmx *%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwind {
436 ; X86-MMX-LABEL: build_v4i16_01zz:
437 ; X86-MMX: # %bb.0:
438 ; X86-MMX-NEXT: pushl %ebp
439 ; X86-MMX-NEXT: movl %esp, %ebp
440 ; X86-MMX-NEXT: andl $-8, %esp
441 ; X86-MMX-NEXT: subl $8, %esp
442 ; X86-MMX-NEXT: movl 8(%ebp), %eax
443 ; X86-MMX-NEXT: movl 16(%ebp), %ecx
444 ; X86-MMX-NEXT: shll $16, %ecx
445 ; X86-MMX-NEXT: movzwl 12(%ebp), %edx
446 ; X86-MMX-NEXT: orl %ecx, %edx
447 ; X86-MMX-NEXT: movl %edx, (%esp)
448 ; X86-MMX-NEXT: movl $0, {{[0-9]+}}(%esp)
449 ; X86-MMX-NEXT: movq (%esp), %mm0
450 ; X86-MMX-NEXT: paddd %mm0, %mm0
451 ; X86-MMX-NEXT: movq %mm0, (%eax)
452 ; X86-MMX-NEXT: movl %ebp, %esp
453 ; X86-MMX-NEXT: popl %ebp
454 ; X86-MMX-NEXT: retl
455 ;
456 ; X86-SSE2-LABEL: build_v4i16_01zz:
457 ; X86-SSE2: # %bb.0:
458 ; X86-SSE2-NEXT: pushl %ebp
459 ; X86-SSE2-NEXT: movl %esp, %ebp
460 ; X86-SSE2-NEXT: andl $-8, %esp
461 ; X86-SSE2-NEXT: subl $8, %esp
462 ; X86-SSE2-NEXT: movl 8(%ebp), %eax
463 ; X86-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
464 ; X86-SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
465 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
466 ; X86-SSE2-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero
467 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
468 ; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
469 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
470 ; X86-SSE2-NEXT: movq %xmm0, (%esp)
471 ; X86-SSE2-NEXT: movq (%esp), %mm0
472 ; X86-SSE2-NEXT: paddd %mm0, %mm0
473 ; X86-SSE2-NEXT: movq %mm0, (%eax)
474 ; X86-SSE2-NEXT: movl %ebp, %esp
475 ; X86-SSE2-NEXT: popl %ebp
476 ; X86-SSE2-NEXT: retl
477 ;
478 ; X86-SSSE3-LABEL: build_v4i16_01zz:
479 ; X86-SSSE3: # %bb.0:
480 ; X86-SSSE3-NEXT: pushl %ebp
481 ; X86-SSSE3-NEXT: movl %esp, %ebp
482 ; X86-SSSE3-NEXT: andl $-8, %esp
483 ; X86-SSSE3-NEXT: subl $8, %esp
484 ; X86-SSSE3-NEXT: movl 8(%ebp), %eax
485 ; X86-SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
486 ; X86-SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
487 ; X86-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
488 ; X86-SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
489 ; X86-SSSE3-NEXT: movq %xmm1, (%esp)
490 ; X86-SSSE3-NEXT: movq (%esp), %mm0
491 ; X86-SSSE3-NEXT: paddd %mm0, %mm0
492 ; X86-SSSE3-NEXT: movq %mm0, (%eax)
493 ; X86-SSSE3-NEXT: movl %ebp, %esp
494 ; X86-SSSE3-NEXT: popl %ebp
495 ; X86-SSSE3-NEXT: retl
496 ;
497 ; X64-SSE2-LABEL: build_v4i16_01zz:
498 ; X64-SSE2: # %bb.0:
499 ; X64-SSE2-NEXT: movd %edx, %xmm0
500 ; X64-SSE2-NEXT: movd %esi, %xmm1
501 ; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
502 ; X64-SSE2-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero
503 ; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
504 ; X64-SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
505 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
506 ; X64-SSE2-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
507 ; X64-SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
508 ; X64-SSE2-NEXT: paddd %mm0, %mm0
509 ; X64-SSE2-NEXT: movq %mm0, (%rdi)
510 ; X64-SSE2-NEXT: retq
511 ;
512 ; X64-SSSE3-LABEL: build_v4i16_01zz:
513 ; X64-SSSE3: # %bb.0:
514 ; X64-SSSE3-NEXT: movd %edx, %xmm0
515 ; X64-SSSE3-NEXT: movd %esi, %xmm1
516 ; X64-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
517 ; X64-SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
518 ; X64-SSSE3-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
519 ; X64-SSSE3-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
520 ; X64-SSSE3-NEXT: paddd %mm0, %mm0
521 ; X64-SSSE3-NEXT: movq %mm0, (%rdi)
522 ; X64-SSSE3-NEXT: retq
523 ;
524 ; X64-AVX-LABEL: build_v4i16_01zz:
525 ; X64-AVX: # %bb.0:
526 ; X64-AVX-NEXT: vmovd %edx, %xmm0
527 ; X64-AVX-NEXT: vmovd %esi, %xmm1
528 ; X64-AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
529 ; X64-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
530 ; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
531 ; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
532 ; X64-AVX-NEXT: paddd %mm0, %mm0
533 ; X64-AVX-NEXT: movq %mm0, (%rdi)
534 ; X64-AVX-NEXT: retq
203 ; X86-LABEL: build_v4i16_01zz:
204 ; X86: # %bb.0:
205 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
206 ; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
207 ; X86-NEXT: movd {{[0-9]+}}(%esp), %mm1
208 ; X86-NEXT: punpcklwd %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1]
209 ; X86-NEXT: pxor %mm0, %mm0
210 ; X86-NEXT: punpcklwd %mm0, %mm0 # mm0 = mm0[0,0,1,1]
211 ; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
212 ; X86-NEXT: paddd %mm1, %mm1
213 ; X86-NEXT: movq %mm1, (%eax)
214 ; X86-NEXT: retl
215 ;
216 ; X64-LABEL: build_v4i16_01zz:
217 ; X64: # %bb.0:
218 ; X64-NEXT: movd %edx, %mm0
219 ; X64-NEXT: movd %esi, %mm1
220 ; X64-NEXT: punpcklwd %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1]
221 ; X64-NEXT: pxor %mm0, %mm0
222 ; X64-NEXT: punpcklwd %mm0, %mm0 # mm0 = mm0[0,0,1,1]
223 ; X64-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
224 ; X64-NEXT: paddd %mm1, %mm1
225 ; X64-NEXT: movq %mm1, (%rdi)
226 ; X64-NEXT: retq
535227 %1 = insertelement <4 x i16> undef, i16 %a0, i32 0
536228 %2 = insertelement <4 x i16> %1, i16 %a1, i32 1
537229 %3 = insertelement <4 x i16> %2, i16 0, i32 2
595287 }
596288
597289 define void @build_v4i16_012u(x86_mmx *%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwind {
598 ; X86-MMX-LABEL: build_v4i16_012u:
599 ; X86-MMX: # %bb.0:
600 ; X86-MMX-NEXT: pushl %ebp
601 ; X86-MMX-NEXT: movl %esp, %ebp
602 ; X86-MMX-NEXT: andl $-8, %esp
603 ; X86-MMX-NEXT: subl $8, %esp
604 ; X86-MMX-NEXT: movl 8(%ebp), %eax
605 ; X86-MMX-NEXT: movl 16(%ebp), %ecx
606 ; X86-MMX-NEXT: shll $16, %ecx
607 ; X86-MMX-NEXT: movzwl 12(%ebp), %edx
608 ; X86-MMX-NEXT: orl %ecx, %edx
609 ; X86-MMX-NEXT: movl %edx, (%esp)
610 ; X86-MMX-NEXT: shll $16, %ecx
611 ; X86-MMX-NEXT: movzwl 20(%ebp), %edx
612 ; X86-MMX-NEXT: orl %ecx, %edx
613 ; X86-MMX-NEXT: movl %edx, {{[0-9]+}}(%esp)
614 ; X86-MMX-NEXT: movq (%esp), %mm0
615 ; X86-MMX-NEXT: paddd %mm0, %mm0
616 ; X86-MMX-NEXT: movq %mm0, (%eax)
617 ; X86-MMX-NEXT: movl %ebp, %esp
618 ; X86-MMX-NEXT: popl %ebp
619 ; X86-MMX-NEXT: retl
620 ;
621 ; X86-SSE-LABEL: build_v4i16_012u:
622 ; X86-SSE: # %bb.0:
623 ; X86-SSE-NEXT: pushl %ebp
624 ; X86-SSE-NEXT: movl %esp, %ebp
625 ; X86-SSE-NEXT: andl $-8, %esp
626 ; X86-SSE-NEXT: subl $8, %esp
627 ; X86-SSE-NEXT: movl 8(%ebp), %eax
628 ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
629 ; X86-SSE-NEXT: pinsrw $1, 16(%ebp), %xmm0
630 ; X86-SSE-NEXT: pinsrw $2, 20(%ebp), %xmm0
631 ; X86-SSE-NEXT: movq %xmm0, (%esp)
632 ; X86-SSE-NEXT: movq (%esp), %mm0
633 ; X86-SSE-NEXT: paddd %mm0, %mm0
634 ; X86-SSE-NEXT: movq %mm0, (%eax)
635 ; X86-SSE-NEXT: movl %ebp, %esp
636 ; X86-SSE-NEXT: popl %ebp
637 ; X86-SSE-NEXT: retl
638 ;
639 ; X64-SSE-LABEL: build_v4i16_012u:
640 ; X64-SSE: # %bb.0:
641 ; X64-SSE-NEXT: movd %edx, %xmm0
642 ; X64-SSE-NEXT: movd %esi, %xmm1
643 ; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
644 ; X64-SSE-NEXT: movd %ecx, %xmm0
645 ; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
646 ; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
647 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
648 ; X64-SSE-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
649 ; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
650 ; X64-SSE-NEXT: paddd %mm0, %mm0
651 ; X64-SSE-NEXT: movq %mm0, (%rdi)
652 ; X64-SSE-NEXT: retq
653 ;
654 ; X64-AVX-LABEL: build_v4i16_012u:
655 ; X64-AVX: # %bb.0:
656 ; X64-AVX-NEXT: vmovd %esi, %xmm0
657 ; X64-AVX-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0
658 ; X64-AVX-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
659 ; X64-AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
660 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
661 ; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
662 ; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
663 ; X64-AVX-NEXT: paddd %mm0, %mm0
664 ; X64-AVX-NEXT: movq %mm0, (%rdi)
665 ; X64-AVX-NEXT: retq
290 ; X86-LABEL: build_v4i16_012u:
291 ; X86: # %bb.0:
292 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
293 ; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
294 ; X86-NEXT: punpcklwd %mm0, %mm0 # mm0 = mm0[0,0,1,1]
295 ; X86-NEXT: movd {{[0-9]+}}(%esp), %mm1
296 ; X86-NEXT: movd {{[0-9]+}}(%esp), %mm2
297 ; X86-NEXT: punpcklwd %mm1, %mm2 # mm2 = mm2[0],mm1[0],mm2[1],mm1[1]
298 ; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
299 ; X86-NEXT: paddd %mm2, %mm2
300 ; X86-NEXT: movq %mm2, (%eax)
301 ; X86-NEXT: retl
302 ;
303 ; X64-LABEL: build_v4i16_012u:
304 ; X64: # %bb.0:
305 ; X64-NEXT: movd %ecx, %mm0
306 ; X64-NEXT: punpcklwd %mm0, %mm0 # mm0 = mm0[0,0,1,1]
307 ; X64-NEXT: movd %edx, %mm1
308 ; X64-NEXT: movd %esi, %mm2
309 ; X64-NEXT: punpcklwd %mm1, %mm2 # mm2 = mm2[0],mm1[0],mm2[1],mm1[1]
310 ; X64-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
311 ; X64-NEXT: paddd %mm2, %mm2
312 ; X64-NEXT: movq %mm2, (%rdi)
313 ; X64-NEXT: retq
666314 %1 = insertelement <4 x i16> undef, i16 %a0, i32 0
667315 %2 = insertelement <4 x i16> %1, i16 %a1, i32 1
668316 %3 = insertelement <4 x i16> %2, i16 %a2, i32 2
676324 define void @build_v4i16_0u00(x86_mmx *%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwind {
677325 ; X86-MMX-LABEL: build_v4i16_0u00:
678326 ; X86-MMX: # %bb.0:
679 ; X86-MMX-NEXT: pushl %ebp
680 ; X86-MMX-NEXT: movl %esp, %ebp
681 ; X86-MMX-NEXT: andl $-8, %esp
682 ; X86-MMX-NEXT: subl $8, %esp
683 ; X86-MMX-NEXT: movl 8(%ebp), %eax
684 ; X86-MMX-NEXT: movzwl 12(%ebp), %ecx
685 ; X86-MMX-NEXT: movl %ecx, %edx
686 ; X86-MMX-NEXT: shll $16, %edx
687 ; X86-MMX-NEXT: orl %ecx, %edx
688 ; X86-MMX-NEXT: movl %edx, {{[0-9]+}}(%esp)
689 ; X86-MMX-NEXT: shll $16, %edx
690 ; X86-MMX-NEXT: orl %ecx, %edx
691 ; X86-MMX-NEXT: movl %edx, (%esp)
692 ; X86-MMX-NEXT: movq (%esp), %mm0
327 ; X86-MMX-NEXT: movl {{[0-9]+}}(%esp), %eax
328 ; X86-MMX-NEXT: movd {{[0-9]+}}(%esp), %mm0
329 ; X86-MMX-NEXT: punpcklwd %mm0, %mm0 # mm0 = mm0[0,0,1,1]
330 ; X86-MMX-NEXT: punpckldq %mm0, %mm0 # mm0 = mm0[0,0]
693331 ; X86-MMX-NEXT: paddd %mm0, %mm0
694332 ; X86-MMX-NEXT: movq %mm0, (%eax)
695 ; X86-MMX-NEXT: movl %ebp, %esp
696 ; X86-MMX-NEXT: popl %ebp
697333 ; X86-MMX-NEXT: retl
698334 ;
699 ; X86-SSE2-LABEL: build_v4i16_0u00:
700 ; X86-SSE2: # %bb.0:
701 ; X86-SSE2-NEXT: pushl %ebp
702 ; X86-SSE2-NEXT: movl %esp, %ebp
703 ; X86-SSE2-NEXT: andl $-8, %esp
704 ; X86-SSE2-NEXT: subl $8, %esp
705 ; X86-SSE2-NEXT: movl 8(%ebp), %eax
706 ; X86-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
707 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,0]
708 ; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
709 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
710 ; X86-SSE2-NEXT: movq %xmm0, (%esp)
711 ; X86-SSE2-NEXT: movq (%esp), %mm0
712 ; X86-SSE2-NEXT: paddd %mm0, %mm0
713 ; X86-SSE2-NEXT: movq %mm0, (%eax)
714 ; X86-SSE2-NEXT: movl %ebp, %esp
715 ; X86-SSE2-NEXT: popl %ebp
716 ; X86-SSE2-NEXT: retl
717 ;
718 ; X86-SSSE3-LABEL: build_v4i16_0u00:
719 ; X86-SSSE3: # %bb.0:
720 ; X86-SSSE3-NEXT: pushl %ebp
721 ; X86-SSSE3-NEXT: movl %esp, %ebp
722 ; X86-SSSE3-NEXT: andl $-8, %esp
723 ; X86-SSSE3-NEXT: subl $8, %esp
724 ; X86-SSSE3-NEXT: movl 8(%ebp), %eax
725 ; X86-SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
726 ; X86-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,0,1,0,1,0,1,0,1,2,3]
727 ; X86-SSSE3-NEXT: movq %xmm0, (%esp)
728 ; X86-SSSE3-NEXT: movq (%esp), %mm0
729 ; X86-SSSE3-NEXT: paddd %mm0, %mm0
730 ; X86-SSSE3-NEXT: movq %mm0, (%eax)
731 ; X86-SSSE3-NEXT: movl %ebp, %esp
732 ; X86-SSSE3-NEXT: popl %ebp
733 ; X86-SSSE3-NEXT: retl
734 ;
735 ; X64-SSE2-LABEL: build_v4i16_0u00:
736 ; X64-SSE2: # %bb.0:
737 ; X64-SSE2-NEXT: movd %esi, %xmm0
738 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,0]
739 ; X64-SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
740 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
741 ; X64-SSE2-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
742 ; X64-SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
743 ; X64-SSE2-NEXT: paddd %mm0, %mm0
744 ; X64-SSE2-NEXT: movq %mm0, (%rdi)
745 ; X64-SSE2-NEXT: retq
746 ;
747 ; X64-SSSE3-LABEL: build_v4i16_0u00:
748 ; X64-SSSE3: # %bb.0:
749 ; X64-SSSE3-NEXT: movd %esi, %xmm0
750 ; X64-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,0,1,0,1,0,1,0,1,2,3]
751 ; X64-SSSE3-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
752 ; X64-SSSE3-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
753 ; X64-SSSE3-NEXT: paddd %mm0, %mm0
754 ; X64-SSSE3-NEXT: movq %mm0, (%rdi)
755 ; X64-SSSE3-NEXT: retq
756 ;
757 ; X64-AVX1-LABEL: build_v4i16_0u00:
758 ; X64-AVX1: # %bb.0:
759 ; X64-AVX1-NEXT: vmovd %esi, %xmm0
760 ; X64-AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,0,1,0,1,0,1,0,1,2,3]
761 ; X64-AVX1-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
762 ; X64-AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
763 ; X64-AVX1-NEXT: paddd %mm0, %mm0
764 ; X64-AVX1-NEXT: movq %mm0, (%rdi)
765 ; X64-AVX1-NEXT: retq
766 ;
767 ; X64-AVX2-LABEL: build_v4i16_0u00:
768 ; X64-AVX2: # %bb.0:
769 ; X64-AVX2-NEXT: vmovd %esi, %xmm0
770 ; X64-AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
771 ; X64-AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
772 ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
773 ; X64-AVX2-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
774 ; X64-AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
775 ; X64-AVX2-NEXT: paddd %mm0, %mm0
776 ; X64-AVX2-NEXT: movq %mm0, (%rdi)
777 ; X64-AVX2-NEXT: retq
778 ;
779 ; X64-AVX512-LABEL: build_v4i16_0u00:
780 ; X64-AVX512: # %bb.0:
781 ; X64-AVX512-NEXT: vmovd %esi, %xmm0
782 ; X64-AVX512-NEXT: vpbroadcastd %xmm0, %xmm0
783 ; X64-AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
784 ; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
785 ; X64-AVX512-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
786 ; X64-AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
787 ; X64-AVX512-NEXT: paddd %mm0, %mm0
788 ; X64-AVX512-NEXT: movq %mm0, (%rdi)
789 ; X64-AVX512-NEXT: retq
335 ; X86-SSE-LABEL: build_v4i16_0u00:
336 ; X86-SSE: # %bb.0:
337 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
338 ; X86-SSE-NEXT: movd {{[0-9]+}}(%esp), %mm0
339 ; X86-SSE-NEXT: pshufw $0, %mm0, %mm0 # mm0 = mm0[0,0,0,0]
340 ; X86-SSE-NEXT: paddd %mm0, %mm0
341 ; X86-SSE-NEXT: movq %mm0, (%eax)
342 ; X86-SSE-NEXT: retl
343 ;
344 ; X64-LABEL: build_v4i16_0u00:
345 ; X64: # %bb.0:
346 ; X64-NEXT: movd %esi, %mm0
347 ; X64-NEXT: pshufw $0, %mm0, %mm0 # mm0 = mm0[0,0,0,0]
348 ; X64-NEXT: paddd %mm0, %mm0
349 ; X64-NEXT: movq %mm0, (%rdi)
350 ; X64-NEXT: retq
790351 %1 = insertelement <4 x i16> undef, i16 %a0, i32 0
791352 %2 = insertelement <4 x i16> %1, i16 undef, i32 1
792353 %3 = insertelement <4 x i16> %2, i16 %a0, i32 2
802363 ;
803364
804365 define void @build_v8i8_01234567(x86_mmx *%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7) nounwind {
805 ; X86-MMX-LABEL: build_v8i8_01234567:
806 ; X86-MMX: # %bb.0:
807 ; X86-MMX-NEXT: pushl %ebp
808 ; X86-MMX-NEXT: movl %esp, %ebp
809 ; X86-MMX-NEXT: pushl %esi
810 ; X86-MMX-NEXT: andl $-8, %esp
811 ; X86-MMX-NEXT: subl $16, %esp
812 ; X86-MMX-NEXT: movl 8(%ebp), %eax
813 ; X86-MMX-NEXT: movl 40(%ebp), %ecx
814 ; X86-MMX-NEXT: shll $8, %ecx
815 ; X86-MMX-NEXT: movzbl 36(%ebp), %edx
816 ; X86-MMX-NEXT: orl %ecx, %edx
817 ; X86-MMX-NEXT: shll $16, %edx
818 ; X86-MMX-NEXT: movl 32(%ebp), %ecx
819 ; X86-MMX-NEXT: shll $8, %ecx
820 ; X86-MMX-NEXT: movzbl 28(%ebp), %esi
821 ; X86-MMX-NEXT: orl %ecx, %esi
822 ; X86-MMX-NEXT: movzwl %si, %ecx
823 ; X86-MMX-NEXT: orl %edx, %ecx
824 ; X86-MMX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
825 ; X86-MMX-NEXT: movl 24(%ebp), %ecx
826 ; X86-MMX-NEXT: shll $8, %ecx
827 ; X86-MMX-NEXT: movzbl 20(%ebp), %edx
828 ; X86-MMX-NEXT: orl %ecx, %edx
829 ; X86-MMX-NEXT: shll $16, %edx
830 ; X86-MMX-NEXT: movl 16(%ebp), %ecx
831 ; X86-MMX-NEXT: shll $8, %ecx
832 ; X86-MMX-NEXT: movzbl 12(%ebp), %esi
833 ; X86-MMX-NEXT: orl %ecx, %esi
834 ; X86-MMX-NEXT: movzwl %si, %ecx
835 ; X86-MMX-NEXT: orl %edx, %ecx
836 ; X86-MMX-NEXT: movl %ecx, (%esp)
837 ; X86-MMX-NEXT: movq (%esp), %mm0
838 ; X86-MMX-NEXT: paddd %mm0, %mm0
839 ; X86-MMX-NEXT: movq %mm0, (%eax)
840 ; X86-MMX-NEXT: leal -4(%ebp), %esp
841 ; X86-MMX-NEXT: popl %esi
842 ; X86-MMX-NEXT: popl %ebp
843 ; X86-MMX-NEXT: retl
844 ;
845 ; X86-SSE-LABEL: build_v8i8_01234567:
846 ; X86-SSE: # %bb.0:
847 ; X86-SSE-NEXT: pushl %ebp
848 ; X86-SSE-NEXT: movl %esp, %ebp
849 ; X86-SSE-NEXT: pushl %esi
850 ; X86-SSE-NEXT: andl $-8, %esp
851 ; X86-SSE-NEXT: subl $16, %esp
852 ; X86-SSE-NEXT: movl 8(%ebp), %eax
853 ; X86-SSE-NEXT: movl 24(%ebp), %ecx
854 ; X86-SSE-NEXT: shll $8, %ecx
855 ; X86-SSE-NEXT: movzbl 20(%ebp), %edx
856 ; X86-SSE-NEXT: orl %ecx, %edx
857 ; X86-SSE-NEXT: movl 16(%ebp), %ecx
858 ; X86-SSE-NEXT: shll $8, %ecx
859 ; X86-SSE-NEXT: movzbl 12(%ebp), %esi
860 ; X86-SSE-NEXT: orl %ecx, %esi
861 ; X86-SSE-NEXT: movd %esi, %xmm0
862 ; X86-SSE-NEXT: pinsrw $1, %edx, %xmm0
863 ; X86-SSE-NEXT: movl 32(%ebp), %ecx
864 ; X86-SSE-NEXT: shll $8, %ecx
865 ; X86-SSE-NEXT: movzbl 28(%ebp), %edx
866 ; X86-SSE-NEXT: orl %ecx, %edx
867 ; X86-SSE-NEXT: pinsrw $2, %edx, %xmm0
868 ; X86-SSE-NEXT: movl 40(%ebp), %ecx
869 ; X86-SSE-NEXT: shll $8, %ecx
870 ; X86-SSE-NEXT: movzbl 36(%ebp), %edx
871 ; X86-SSE-NEXT: orl %ecx, %edx
872 ; X86-SSE-NEXT: pinsrw $3, %edx, %xmm0
873 ; X86-SSE-NEXT: movq %xmm0, (%esp)
874 ; X86-SSE-NEXT: movq (%esp), %mm0
875 ; X86-SSE-NEXT: paddd %mm0, %mm0
876 ; X86-SSE-NEXT: movq %mm0, (%eax)
877 ; X86-SSE-NEXT: leal -4(%ebp), %esp
878 ; X86-SSE-NEXT: popl %esi
879 ; X86-SSE-NEXT: popl %ebp
880 ; X86-SSE-NEXT: retl
881 ;
882 ; X64-SSE-LABEL: build_v8i8_01234567:
883 ; X64-SSE: # %bb.0:
884 ; X64-SSE-NEXT: shll $8, %r8d
885 ; X64-SSE-NEXT: movzbl %cl, %eax
886 ; X64-SSE-NEXT: orl %r8d, %eax
887 ; X64-SSE-NEXT: shll $8, %edx
888 ; X64-SSE-NEXT: movzbl %sil, %ecx
889 ; X64-SSE-NEXT: orl %edx, %ecx
890 ; X64-SSE-NEXT: movd %ecx, %xmm0
891 ; X64-SSE-NEXT: pinsrw $1, %eax, %xmm0
892 ; X64-SSE-NEXT: movl {{[0-9]+}}(%rsp), %eax
893 ; X64-SSE-NEXT: shll $8, %eax
894 ; X64-SSE-NEXT: movzbl %r9b, %ecx
895 ; X64-SSE-NEXT: orl %eax, %ecx
896 ; X64-SSE-NEXT: pinsrw $2, %ecx, %xmm0
897 ; X64-SSE-NEXT: movl {{[0-9]+}}(%rsp), %eax
898 ; X64-SSE-NEXT: shll $8, %eax
899 ; X64-SSE-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
900 ; X64-SSE-NEXT: orl %eax, %ecx
901 ; X64-SSE-NEXT: pinsrw $3, %ecx, %xmm0
902 ; X64-SSE-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
903 ; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
904 ; X64-SSE-NEXT: paddd %mm0, %mm0
905 ; X64-SSE-NEXT: movq %mm0, (%rdi)
906 ; X64-SSE-NEXT: retq
907 ;
908 ; X64-AVX-LABEL: build_v8i8_01234567:
909 ; X64-AVX: # %bb.0:
910 ; X64-AVX-NEXT: vmovd %esi, %xmm0
911 ; X64-AVX-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0
912 ; X64-AVX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
913 ; X64-AVX-NEXT: vpinsrb $3, %r8d, %xmm0, %xmm0
914 ; X64-AVX-NEXT: vpinsrb $4, %r9d, %xmm0, %xmm0
915 ; X64-AVX-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm0, %xmm0
916 ; X64-AVX-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm0, %xmm0
917 ; X64-AVX-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm0, %xmm0
918 ; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
919 ; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
920 ; X64-AVX-NEXT: paddd %mm0, %mm0
921 ; X64-AVX-NEXT: movq %mm0, (%rdi)
922 ; X64-AVX-NEXT: retq
366 ; X86-LABEL: build_v8i8_01234567:
367 ; X86: # %bb.0:
368 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
369 ; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
370 ; X86-NEXT: movd {{[0-9]+}}(%esp), %mm1
371 ; X86-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
372 ; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
373 ; X86-NEXT: movd {{[0-9]+}}(%esp), %mm2
374 ; X86-NEXT: punpcklbw %mm0, %mm2 # mm2 = mm2[0],mm0[0],mm2[1],mm0[1],mm2[2],mm0[2],mm2[3],mm0[3]
375 ; X86-NEXT: punpcklwd %mm1, %mm2 # mm2 = mm2[0],mm1[0],mm2[1],mm1[1]
376 ; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
377 ; X86-NEXT: movd {{[0-9]+}}(%esp), %mm1
378 ; X86-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
379 ; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
380 ; X86-NEXT: movd {{[0-9]+}}(%esp), %mm3
381 ; X86-NEXT: punpcklbw %mm0, %mm3 # mm3 = mm3[0],mm0[0],mm3[1],mm0[1],mm3[2],mm0[2],mm3[3],mm0[3]
382 ; X86-NEXT: punpcklwd %mm1, %mm3 # mm3 = mm3[0],mm1[0],mm3[1],mm1[1]
383 ; X86-NEXT: punpckldq %mm2, %mm3 # mm3 = mm3[0],mm2[0]
384 ; X86-NEXT: paddd %mm3, %mm3
385 ; X86-NEXT: movq %mm3, (%eax)
386 ; X86-NEXT: retl
387 ;
388 ; X64-LABEL: build_v8i8_01234567:
389 ; X64: # %bb.0:
390 ; X64-NEXT: movd {{[0-9]+}}(%rsp), %mm0
391 ; X64-NEXT: movd {{[0-9]+}}(%rsp), %mm1
392 ; X64-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
393 ; X64-NEXT: movd %r9d, %mm0
394 ; X64-NEXT: movd {{[0-9]+}}(%rsp), %mm2
395 ; X64-NEXT: punpcklbw %mm2, %mm0 # mm0 = mm0[0],mm2[0],mm0[1],mm2[1],mm0[2],mm2[2],mm0[3],mm2[3]
396 ; X64-NEXT: punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1]
397 ; X64-NEXT: movd %r8d, %mm1
398 ; X64-NEXT: movd %ecx, %mm2
399 ; X64-NEXT: punpcklbw %mm1, %mm2 # mm2 = mm2[0],mm1[0],mm2[1],mm1[1],mm2[2],mm1[2],mm2[3],mm1[3]
400 ; X64-NEXT: movd %edx, %mm1
401 ; X64-NEXT: movd %esi, %mm3
402 ; X64-NEXT: punpcklbw %mm1, %mm3 # mm3 = mm3[0],mm1[0],mm3[1],mm1[1],mm3[2],mm1[2],mm3[3],mm1[3]
403 ; X64-NEXT: punpcklwd %mm2, %mm3 # mm3 = mm3[0],mm2[0],mm3[1],mm2[1]
404 ; X64-NEXT: punpckldq %mm0, %mm3 # mm3 = mm3[0],mm0[0]
405 ; X64-NEXT: paddd %mm3, %mm3
406 ; X64-NEXT: movq %mm3, (%rdi)
407 ; X64-NEXT: retq
923408 %1 = insertelement <8 x i8> undef, i8 %a0, i32 0
924409 %2 = insertelement <8 x i8> %1, i8 %a1, i32 1
925410 %3 = insertelement <8 x i8> %2, i8 %a2, i32 2
935420 }
936421
937422 define void @build_v8i8_0u2345z7(x86_mmx *%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7) nounwind {
938 ; X86-MMX-LABEL: build_v8i8_0u2345z7:
939 ; X86-MMX: # %bb.0:
940 ; X86-MMX-NEXT: pushl %ebp
941 ; X86-MMX-NEXT: movl %esp, %ebp
942 ; X86-MMX-NEXT: andl $-8, %esp
943 ; X86-MMX-NEXT: subl $8, %esp
944 ; X86-MMX-NEXT: movl 8(%ebp), %eax
945 ; X86-MMX-NEXT: movl 24(%ebp), %ecx
946 ; X86-MMX-NEXT: shll $8, %ecx
947 ; X86-MMX-NEXT: movzbl 20(%ebp), %edx
948 ; X86-MMX-NEXT: orl %ecx, %edx
949 ; X86-MMX-NEXT: shll $16, %edx
950 ; X86-MMX-NEXT: movzbl 12(%ebp), %ecx
951 ; X86-MMX-NEXT: orl %edx, %ecx
952 ; X86-MMX-NEXT: movl %ecx, (%esp)
953 ; X86-MMX-NEXT: movl 32(%ebp), %ecx
954 ; X86-MMX-NEXT: shll $8, %ecx
955 ; X86-MMX-NEXT: movzbl 28(%ebp), %edx
956 ; X86-MMX-NEXT: orl %ecx, %edx
957 ; X86-MMX-NEXT: movzwl %dx, %ecx
958 ; X86-MMX-NEXT: movl 40(%ebp), %edx
959 ; X86-MMX-NEXT: shll $24, %edx
960 ; X86-MMX-NEXT: orl %ecx, %edx
961 ; X86-MMX-NEXT: movl %edx, {{[0-9]+}}(%esp)
962 ; X86-MMX-NEXT: movq (%esp), %mm0
963 ; X86-MMX-NEXT: paddd %mm0, %mm0
964 ; X86-MMX-NEXT: movq %mm0, (%eax)
965 ; X86-MMX-NEXT: movl %ebp, %esp
966 ; X86-MMX-NEXT: popl %ebp
967 ; X86-MMX-NEXT: retl
968 ;
969 ; X86-SSE2-LABEL: build_v8i8_0u2345z7:
970 ; X86-SSE2: # %bb.0:
971 ; X86-SSE2-NEXT: pushl %ebp
972 ; X86-SSE2-NEXT: movl %esp, %ebp
973 ; X86-SSE2-NEXT: andl $-8, %esp
974 ; X86-SSE2-NEXT: subl $8, %esp
975 ; X86-SSE2-NEXT: movl 8(%ebp), %eax
976 ; X86-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
977 ; X86-SSE2-NEXT: pxor %xmm1, %xmm1
978 ; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
979 ; X86-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
980 ; X86-SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
981 ; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
982 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
983 ; X86-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
984 ; X86-SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
985 ; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
986 ; X86-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
987 ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
988 ; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
989 ; X86-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm0
990 ; X86-SSE2-NEXT: packuswb %xmm0, %xmm0
991 ; X86-SSE2-NEXT: movq %xmm0, (%esp)
992 ; X86-SSE2-NEXT: movq (%esp), %mm0
993 ; X86-SSE2-NEXT: paddd %mm0, %mm0
994 ; X86-SSE2-NEXT: movq %mm0, (%eax)
995 ; X86-SSE2-NEXT: movl %ebp, %esp
996 ; X86-SSE2-NEXT: popl %ebp
997 ; X86-SSE2-NEXT: retl
998 ;
999 ; X86-SSSE3-LABEL: build_v8i8_0u2345z7:
1000 ; X86-SSSE3: # %bb.0:
1001 ; X86-SSSE3-NEXT: pushl %ebp
1002 ; X86-SSSE3-NEXT: movl %esp, %ebp
1003 ; X86-SSSE3-NEXT: andl $-8, %esp
1004 ; X86-SSSE3-NEXT: subl $8, %esp
1005 ; X86-SSSE3-NEXT: movl 8(%ebp), %eax
1006 ; X86-SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1007 ; X86-SSSE3-NEXT: pxor %xmm1, %xmm1
1008 ; X86-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1009 ; X86-SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1010 ; X86-SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
1011 ; X86-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
1012 ; X86-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1013 ; X86-SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1014 ; X86-SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1015 ; X86-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1016 ; X86-SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1017 ; X86-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1018 ; X86-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
1019 ; X86-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,u,4,6,8,10],zero,xmm0[14,u,u,u,u,u,u,u,u]
1020 ; X86-SSSE3-NEXT: movq %xmm0, (%esp)
1021 ; X86-SSSE3-NEXT: movq (%esp), %mm0
1022 ; X86-SSSE3-NEXT: paddd %mm0, %mm0
1023 ; X86-SSSE3-NEXT: movq %mm0, (%eax)
1024 ; X86-SSSE3-NEXT: movl %ebp, %esp
1025 ; X86-SSSE3-NEXT: popl %ebp
1026 ; X86-SSSE3-NEXT: retl
1027 ;
1028 ; X64-SSE2-LABEL: build_v8i8_0u2345z7:
1029 ; X64-SSE2: # %bb.0:
1030 ; X64-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1031 ; X64-SSE2-NEXT: pxor %xmm1, %xmm1
1032 ; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1033 ; X64-SSE2-NEXT: movd %r9d, %xmm0
1034 ; X64-SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
1035 ; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1036 ; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1037 ; X64-SSE2-NEXT: movd %r8d, %xmm1
1038 ; X64-SSE2-NEXT: movd %ecx, %xmm2
1039 ; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1040 ; X64-SSE2-NEXT: movd %esi, %xmm1
1041 ; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1042 ; X64-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1043 ; X64-SSE2-NEXT: pand {{.*}}(%rip), %xmm1
1044 ; X64-SSE2-NEXT: packuswb %xmm1, %xmm1
1045 ; X64-SSE2-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
1046 ; X64-SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
1047 ; X64-SSE2-NEXT: paddd %mm0, %mm0
1048 ; X64-SSE2-NEXT: movq %mm0, (%rdi)
1049 ; X64-SSE2-NEXT: retq
1050 ;
1051 ; X64-SSSE3-LABEL: build_v8i8_0u2345z7:
1052 ; X64-SSSE3: # %bb.0:
1053 ; X64-SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1054 ; X64-SSSE3-NEXT: pxor %xmm1, %xmm1
1055 ; X64-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1056 ; X64-SSSE3-NEXT: movd %r9d, %xmm0
1057 ; X64-SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
1058 ; X64-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1059 ; X64-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1060 ; X64-SSSE3-NEXT: movd %r8d, %xmm1
1061 ; X64-SSSE3-NEXT: movd %ecx, %xmm2
1062 ; X64-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1063 ; X64-SSSE3-NEXT: movd %esi, %xmm1
1064 ; X64-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1065 ; X64-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1066 ; X64-SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,u,4,6,8,10],zero,xmm1[14,u,u,u,u,u,u,u,u]
1067 ; X64-SSSE3-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
1068 ; X64-SSSE3-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
1069 ; X64-SSSE3-NEXT: paddd %mm0, %mm0
1070 ; X64-SSSE3-NEXT: movq %mm0, (%rdi)
1071 ; X64-SSSE3-NEXT: retq
1072 ;
1073 ; X64-AVX-LABEL: build_v8i8_0u2345z7:
1074 ; X64-AVX: # %bb.0:
1075 ; X64-AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
1076 ; X64-AVX-NEXT: vpinsrw $0, %esi, %xmm0, %xmm0
1077 ; X64-AVX-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
1078 ; X64-AVX-NEXT: vpinsrw $3, %r8d, %xmm0, %xmm0
1079 ; X64-AVX-NEXT: vpinsrw $4, %r9d, %xmm0, %xmm0
1080 ; X64-AVX-NEXT: movl {{[0-9]+}}(%rsp), %eax
1081 ; X64-AVX-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
1082 ; X64-AVX-NEXT: movl {{[0-9]+}}(%rsp), %eax
1083 ; X64-AVX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
1084 ; X64-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,u,4,6,8,10],zero,xmm0[14,u,u,u,u,u,u,u,u]
1085 ; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
1086 ; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
1087 ; X64-AVX-NEXT: paddd %mm0, %mm0
1088 ; X64-AVX-NEXT: movq %mm0, (%rdi)
1089 ; X64-AVX-NEXT: retq
423 ; X86-LABEL: build_v8i8_0u2345z7:
424 ; X86: # %bb.0:
425 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
426 ; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
427 ; X86-NEXT: pxor %mm1, %mm1
428 ; X86-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
429 ; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
430 ; X86-NEXT: movd {{[0-9]+}}(%esp), %mm2
431 ; X86-NEXT: punpcklbw %mm0, %mm2 # mm2 = mm2[0],mm0[0],mm2[1],mm0[1],mm2[2],mm0[2],mm2[3],mm0[3]
432 ; X86-NEXT: punpcklwd %mm1, %mm2 # mm2 = mm2[0],mm1[0],mm2[1],mm1[1]
433 ; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
434 ; X86-NEXT: movd {{[0-9]+}}(%esp), %mm1
435 ; X86-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
436 ; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
437 ; X86-NEXT: punpcklbw %mm0, %mm0 # mm0 = mm0[0,0,1,1,2,2,3,3]
438 ; X86-NEXT: punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1]
439 ; X86-NEXT: punpckldq %mm2, %mm0 # mm0 = mm0[0],mm2[0]
440 ; X86-NEXT: paddd %mm0, %mm0
441 ; X86-NEXT: movq %mm0, (%eax)
442 ; X86-NEXT: retl
443 ;
444 ; X64-LABEL: build_v8i8_0u2345z7:
445 ; X64: # %bb.0:
446 ; X64-NEXT: movd {{[0-9]+}}(%rsp), %mm0
447 ; X64-NEXT: pxor %mm1, %mm1
448 ; X64-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
449 ; X64-NEXT: movd %r9d, %mm0
450 ; X64-NEXT: movd {{[0-9]+}}(%rsp), %mm2
451 ; X64-NEXT: punpcklbw %mm2, %mm0 # mm0 = mm0[0],mm2[0],mm0[1],mm2[1],mm0[2],mm2[2],mm0[3],mm2[3]
452 ; X64-NEXT: punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1]
453 ; X64-NEXT: movd %r8d, %mm1
454 ; X64-NEXT: movd %ecx, %mm2
455 ; X64-NEXT: punpcklbw %mm1, %mm2 # mm2 = mm2[0],mm1[0],mm2[1],mm1[1],mm2[2],mm1[2],mm2[3],mm1[3]
456 ; X64-NEXT: movd %esi, %mm1
457 ; X64-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
458 ; X64-NEXT: punpcklwd %mm2, %mm1 # mm1 = mm1[0],mm2[0],mm1[1],mm2[1]
459 ; X64-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
460 ; X64-NEXT: paddd %mm1, %mm1
461 ; X64-NEXT: movq %mm1, (%rdi)
462 ; X64-NEXT: retq
1090463 %1 = insertelement <8 x i8> undef, i8 %a0, i32 0
1091464 %2 = insertelement <8 x i8> %1, i8 undef, i32 1
1092465 %3 = insertelement <8 x i8> %2, i8 %a2, i32 2
1102475 }
1103476
1104477 define void @build_v8i8_0123zzzu(x86_mmx *%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7) nounwind {
1105 ; X86-MMX-LABEL: build_v8i8_0123zzzu:
1106 ; X86-MMX: # %bb.0:
1107 ; X86-MMX-NEXT: pushl %ebp
1108 ; X86-MMX-NEXT: movl %esp, %ebp
1109 ; X86-MMX-NEXT: pushl %esi
1110 ; X86-MMX-NEXT: andl $-8, %esp
1111 ; X86-MMX-NEXT: subl $16, %esp
1112 ; X86-MMX-NEXT: movl 8(%ebp), %eax
1113 ; X86-MMX-NEXT: movl 24(%ebp), %ecx
1114 ; X86-MMX-NEXT: shll $8, %ecx
1115 ; X86-MMX-NEXT: movzbl 20(%ebp), %edx
1116 ; X86-MMX-NEXT: orl %ecx, %edx
1117 ; X86-MMX-NEXT: shll $16, %edx
1118 ; X86-MMX-NEXT: movl 16(%ebp), %ecx
1119 ; X86-MMX-NEXT: shll $8, %ecx
1120 ; X86-MMX-NEXT: movzbl 12(%ebp), %esi
1121 ; X86-MMX-NEXT: orl %ecx, %esi
1122 ; X86-MMX-NEXT: movzwl %si, %ecx
1123 ; X86-MMX-NEXT: orl %edx, %ecx
1124 ; X86-MMX-NEXT: movl %ecx, (%esp)
1125 ; X86-MMX-NEXT: movl $0, {{[0-9]+}}(%esp)
1126 ; X86-MMX-NEXT: movq (%esp), %mm0
1127 ; X86-MMX-NEXT: paddd %mm0, %mm0
1128 ; X86-MMX-NEXT: movq %mm0, (%eax)
1129 ; X86-MMX-NEXT: leal -4(%ebp), %esp
1130 ; X86-MMX-NEXT: popl %esi
1131 ; X86-MMX-NEXT: popl %ebp
1132 ; X86-MMX-NEXT: retl
1133 ;
1134 ; X86-SSE2-LABEL: build_v8i8_0123zzzu:
1135 ; X86-SSE2: # %bb.0:
1136 ; X86-SSE2-NEXT: pushl %ebp
1137 ; X86-SSE2-NEXT: movl %esp, %ebp
1138 ; X86-SSE2-NEXT: andl $-8, %esp
1139 ; X86-SSE2-NEXT: subl $8, %esp
1140 ; X86-SSE2-NEXT: movl 8(%ebp), %eax
1141 ; X86-SSE2-NEXT: movl 12(%ebp), %ecx
1142 ; X86-SSE2-NEXT: pxor %xmm0, %xmm0
1143 ; X86-SSE2-NEXT: pinsrw $0, %ecx, %xmm0
1144 ; X86-SSE2-NEXT: movl 16(%ebp), %ecx
1145 ; X86-SSE2-NEXT: pinsrw $1, %ecx, %xmm0
1146 ; X86-SSE2-NEXT: movl 20(%ebp), %ecx
1147 ; X86-SSE2-NEXT: pinsrw $2, %ecx, %xmm0
1148 ; X86-SSE2-NEXT: movl 24(%ebp), %ecx
1149 ; X86-SSE2-NEXT: pinsrw $3, %ecx, %xmm0
1150 ; X86-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm0
1151 ; X86-SSE2-NEXT: packuswb %xmm0, %xmm0
1152 ; X86-SSE2-NEXT: movq %xmm0, (%esp)
1153 ; X86-SSE2-NEXT: movq (%esp), %mm0
1154 ; X86-SSE2-NEXT: paddd %mm0, %mm0
1155 ; X86-SSE2-NEXT: movq %mm0, (%eax)
1156 ; X86-SSE2-NEXT: movl %ebp, %esp
1157 ; X86-SSE2-NEXT: popl %ebp
1158 ; X86-SSE2-NEXT: retl
1159 ;
1160 ; X86-SSSE3-LABEL: build_v8i8_0123zzzu:
1161 ; X86-SSSE3: # %bb.0:
1162 ; X86-SSSE3-NEXT: pushl %ebp
1163 ; X86-SSSE3-NEXT: movl %esp, %ebp
1164 ; X86-SSSE3-NEXT: andl $-8, %esp
1165 ; X86-SSSE3-NEXT: subl $8, %esp
1166 ; X86-SSSE3-NEXT: movl 8(%ebp), %eax
1167 ; X86-SSSE3-NEXT: movl 12(%ebp), %ecx
1168 ; X86-SSSE3-NEXT: pxor %xmm0, %xmm0
1169 ; X86-SSSE3-NEXT: pinsrw $0, %ecx, %xmm0
1170 ; X86-SSSE3-NEXT: movl 16(%ebp), %ecx
1171 ; X86-SSSE3-NEXT: pinsrw $1, %ecx, %xmm0
1172 ; X86-SSSE3-NEXT: movl 20(%ebp), %ecx
1173 ; X86-SSSE3-NEXT: pinsrw $2, %ecx, %xmm0
1174 ; X86-SSSE3-NEXT: movl 24(%ebp), %ecx
1175 ; X86-SSSE3-NEXT: pinsrw $3, %ecx, %xmm0
1176 ; X86-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
1177 ; X86-SSSE3-NEXT: movq %xmm0, (%esp)
1178 ; X86-SSSE3-NEXT: movq (%esp), %mm0
1179 ; X86-SSSE3-NEXT: paddd %mm0, %mm0
1180 ; X86-SSSE3-NEXT: movq %mm0, (%eax)
1181 ; X86-SSSE3-NEXT: movl %ebp, %esp
1182 ; X86-SSSE3-NEXT: popl %ebp
1183 ; X86-SSSE3-NEXT: retl
1184 ;
1185 ; X64-SSE2-LABEL: build_v8i8_0123zzzu:
1186 ; X64-SSE2: # %bb.0:
1187 ; X64-SSE2-NEXT: pxor %xmm0, %xmm0
1188 ; X64-SSE2-NEXT: pinsrw $0, %esi, %xmm0
1189 ; X64-SSE2-NEXT: pinsrw $1, %edx, %xmm0
1190 ; X64-SSE2-NEXT: pinsrw $2, %ecx, %xmm0
1191 ; X64-SSE2-NEXT: pinsrw $3, %r8d, %xmm0
1192 ; X64-SSE2-NEXT: pand {{.*}}(%rip), %xmm0
1193 ; X64-SSE2-NEXT: packuswb %xmm0, %xmm0
1194 ; X64-SSE2-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
1195 ; X64-SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
1196 ; X64-SSE2-NEXT: paddd %mm0, %mm0
1197 ; X64-SSE2-NEXT: movq %mm0, (%rdi)
1198 ; X64-SSE2-NEXT: retq
1199 ;
1200 ; X64-SSSE3-LABEL: build_v8i8_0123zzzu:
1201 ; X64-SSSE3: # %bb.0:
1202 ; X64-SSSE3-NEXT: pxor %xmm0, %xmm0
1203 ; X64-SSSE3-NEXT: pinsrw $0, %esi, %xmm0
1204 ; X64-SSSE3-NEXT: pinsrw $1, %edx, %xmm0
1205 ; X64-SSSE3-NEXT: pinsrw $2, %ecx, %xmm0
1206 ; X64-SSSE3-NEXT: pinsrw $3, %r8d, %xmm0
1207 ; X64-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
1208 ; X64-SSSE3-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
1209 ; X64-SSSE3-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
1210 ; X64-SSSE3-NEXT: paddd %mm0, %mm0
1211 ; X64-SSSE3-NEXT: movq %mm0, (%rdi)
1212 ; X64-SSSE3-NEXT: retq
1213 ;
1214 ; X64-AVX-LABEL: build_v8i8_0123zzzu:
1215 ; X64-AVX: # %bb.0:
1216 ; X64-AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
1217 ; X64-AVX-NEXT: vpinsrw $0, %esi, %xmm0, %xmm0
1218 ; X64-AVX-NEXT: vpinsrw $1, %edx, %xmm0, %xmm0
1219 ; X64-AVX-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
1220 ; X64-AVX-NEXT: vpinsrw $3, %r8d, %xmm0, %xmm0
1221 ; X64-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
1222 ; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
1223 ; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
1224 ; X64-AVX-NEXT: paddd %mm0, %mm0
1225 ; X64-AVX-NEXT: movq %mm0, (%rdi)
1226 ; X64-AVX-NEXT: retq
478 ; X86-LABEL: build_v8i8_0123zzzu:
479 ; X86: # %bb.0:
480 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
481 ; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
482 ; X86-NEXT: movd {{[0-9]+}}(%esp), %mm1
483 ; X86-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
484 ; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
485 ; X86-NEXT: movd {{[0-9]+}}(%esp), %mm2
486 ; X86-NEXT: punpcklbw %mm0, %mm2 # mm2 = mm2[0],mm0[0],mm2[1],mm0[1],mm2[2],mm0[2],mm2[3],mm0[3]
487 ; X86-NEXT: punpcklwd %mm1, %mm2 # mm2 = mm2[0],mm1[0],mm2[1],mm1[1]
488 ; X86-NEXT: pxor %mm0, %mm0
489 ; X86-NEXT: pxor %mm1, %mm1
490 ; X86-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
491 ; X86-NEXT: punpcklbw %mm0, %mm0 # mm0 = mm0[0,0,1,1,2,2,3,3]
492 ; X86-NEXT: punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1]
493 ; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
494 ; X86-NEXT: paddd %mm2, %mm2
495 ; X86-NEXT: movq %mm2, (%eax)
496 ; X86-NEXT: retl
497 ;
498 ; X64-LABEL: build_v8i8_0123zzzu:
499 ; X64: # %bb.0:
500 ; X64-NEXT: movd %r8d, %mm0
501 ; X64-NEXT: movd %ecx, %mm1
502 ; X64-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
503 ; X64-NEXT: movd %edx, %mm0
504 ; X64-NEXT: movd %esi, %mm2
505 ; X64-NEXT: punpcklbw %mm0, %mm2 # mm2 = mm2[0],mm0[0],mm2[1],mm0[1],mm2[2],mm0[2],mm2[3],mm0[3]
506 ; X64-NEXT: punpcklwd %mm1, %mm2 # mm2 = mm2[0],mm1[0],mm2[1],mm1[1]
507 ; X64-NEXT: pxor %mm0, %mm0
508 ; X64-NEXT: pxor %mm1, %mm1
509 ; X64-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
510 ; X64-NEXT: punpcklbw %mm0, %mm0 # mm0 = mm0[0,0,1,1,2,2,3,3]
511 ; X64-NEXT: punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1]
512 ; X64-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
513 ; X64-NEXT: paddd %mm2, %mm2
514 ; X64-NEXT: movq %mm2, (%rdi)
515 ; X64-NEXT: retq
1227516 %1 = insertelement <8 x i8> undef, i8 %a0, i32 0
1228517 %2 = insertelement <8 x i8> %1, i8 %a1, i32 1
1229518 %3 = insertelement <8 x i8> %2, i8 %a2, i32 2
1301590 define void @build_v8i8_00000000(x86_mmx *%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7) nounwind {
1302591 ; X86-MMX-LABEL: build_v8i8_00000000:
1303592 ; X86-MMX: # %bb.0:
1304 ; X86-MMX-NEXT: pushl %ebp
1305 ; X86-MMX-NEXT: movl %esp, %ebp
1306 ; X86-MMX-NEXT: andl $-8, %esp
1307 ; X86-MMX-NEXT: subl $8, %esp
1308 ; X86-MMX-NEXT: movl 8(%ebp), %eax
1309 ; X86-MMX-NEXT: movzbl 12(%ebp), %ecx
1310 ; X86-MMX-NEXT: movl %ecx, %edx
1311 ; X86-MMX-NEXT: shll $8, %edx
1312 ; X86-MMX-NEXT: orl %ecx, %edx
1313 ; X86-MMX-NEXT: movl %edx, %ecx
1314 ; X86-MMX-NEXT: shll $16, %ecx
1315 ; X86-MMX-NEXT: orl %edx, %ecx
1316 ; X86-MMX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
1317 ; X86-MMX-NEXT: movl %ecx, (%esp)
1318 ; X86-MMX-NEXT: movq (%esp), %mm0
593 ; X86-MMX-NEXT: movl {{[0-9]+}}(%esp), %eax
594 ; X86-MMX-NEXT: movd {{[0-9]+}}(%esp), %mm0
595 ; X86-MMX-NEXT: punpcklbw %mm0, %mm0 # mm0 = mm0[0,0,1,1,2,2,3,3]
596 ; X86-MMX-NEXT: punpcklwd %mm0, %mm0 # mm0 = mm0[0,0,1,1]
597 ; X86-MMX-NEXT: punpckldq %mm0, %mm0 # mm0 = mm0[0,0]
1319598 ; X86-MMX-NEXT: paddd %mm0, %mm0
1320599 ; X86-MMX-NEXT: movq %mm0, (%eax)
1321 ; X86-MMX-NEXT: movl %ebp, %esp
1322 ; X86-MMX-NEXT: popl %ebp
1323600 ; X86-MMX-NEXT: retl
1324601 ;
1325 ; X86-SSE2-LABEL: build_v8i8_00000000:
1326 ; X86-SSE2: # %bb.0:
1327 ; X86-SSE2-NEXT: pushl %ebp
1328 ; X86-SSE2-NEXT: movl %esp, %ebp
1329 ; X86-SSE2-NEXT: andl $-8, %esp
1330 ; X86-SSE2-NEXT: subl $8, %esp
1331 ; X86-SSE2-NEXT: movl 8(%ebp), %eax
1332 ; X86-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1333 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
1334 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1335 ; X86-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm0
1336 ; X86-SSE2-NEXT: packuswb %xmm0, %xmm0
1337 ; X86-SSE2-NEXT: movq %xmm0, (%esp)
1338 ; X86-SSE2-NEXT: movq (%esp), %mm0
1339 ; X86-SSE2-NEXT: paddd %mm0, %mm0
1340 ; X86-SSE2-NEXT: movq %mm0, (%eax)
1341 ; X86-SSE2-NEXT: movl %ebp, %esp
1342 ; X86-SSE2-NEXT: popl %ebp
1343 ; X86-SSE2-NEXT: retl
1344 ;
1345 ; X86-SSSE3-LABEL: build_v8i8_00000000:
1346 ; X86-SSSE3: # %bb.0:
1347 ; X86-SSSE3-NEXT: pushl %ebp
1348 ; X86-SSSE3-NEXT: movl %esp, %ebp
1349 ; X86-SSSE3-NEXT: andl $-8, %esp
1350 ; X86-SSSE3-NEXT: subl $8, %esp
1351 ; X86-SSSE3-NEXT: movl 8(%ebp), %eax
1352 ; X86-SSSE3-NEXT: pxor %xmm0, %xmm0
1353 ; X86-SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1354 ; X86-SSSE3-NEXT: pshufb %xmm0, %xmm1
1355 ; X86-SSSE3-NEXT: movq %xmm1, (%esp)
1356 ; X86-SSSE3-NEXT: movq (%esp), %mm0
1357 ; X86-SSSE3-NEXT: paddd %mm0, %mm0
1358 ; X86-SSSE3-NEXT: movq %mm0, (%eax)
1359 ; X86-SSSE3-NEXT: movl %ebp, %esp
1360 ; X86-SSSE3-NEXT: popl %ebp
1361 ; X86-SSSE3-NEXT: retl
1362 ;
1363 ; X64-SSE2-LABEL: build_v8i8_00000000:
1364 ; X64-SSE2: # %bb.0:
1365 ; X64-SSE2-NEXT: movd %esi, %xmm0
1366 ; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
1367 ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1368 ; X64-SSE2-NEXT: pand {{.*}}(%rip), %xmm0
1369 ; X64-SSE2-NEXT: packuswb %xmm0, %xmm0
1370 ; X64-SSE2-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
1371 ; X64-SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
1372 ; X64-SSE2-NEXT: paddd %mm0, %mm0
1373 ; X64-SSE2-NEXT: movq %mm0, (%rdi)
1374 ; X64-SSE2-NEXT: retq
1375 ;
1376 ; X64-SSSE3-LABEL: build_v8i8_00000000:
1377 ; X64-SSSE3: # %bb.0:
1378 ; X64-SSSE3-NEXT: movd %esi, %xmm0
1379 ; X64-SSSE3-NEXT: pxor %xmm1, %xmm1
1380 ; X64-SSSE3-NEXT: pshufb %xmm1, %xmm0
1381 ; X64-SSSE3-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
1382 ; X64-SSSE3-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
1383 ; X64-SSSE3-NEXT: paddd %mm0, %mm0
1384 ; X64-SSSE3-NEXT: movq %mm0, (%rdi)
1385 ; X64-SSSE3-NEXT: retq
1386 ;
1387 ; X64-AVX1-LABEL: build_v8i8_00000000:
1388 ; X64-AVX1: # %bb.0:
1389 ; X64-AVX1-NEXT: vmovd %esi, %xmm0
1390 ; X64-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
1391 ; X64-AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
1392 ; X64-AVX1-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
1393 ; X64-AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
1394 ; X64-AVX1-NEXT: paddd %mm0, %mm0
1395 ; X64-AVX1-NEXT: movq %mm0, (%rdi)
1396 ; X64-AVX1-NEXT: retq
1397 ;
1398 ; X64-AVX2-LABEL: build_v8i8_00000000:
1399 ; X64-AVX2: # %bb.0:
1400 ; X64-AVX2-NEXT: vmovd %esi, %xmm0
1401 ; X64-AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
1402 ; X64-AVX2-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
1403 ; X64-AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
1404 ; X64-AVX2-NEXT: paddd %mm0, %mm0
1405 ; X64-AVX2-NEXT: movq %mm0, (%rdi)
1406 ; X64-AVX2-NEXT: retq
1407 ;
1408 ; X64-AVX512-LABEL: build_v8i8_00000000:
1409 ; X64-AVX512: # %bb.0:
1410 ; X64-AVX512-NEXT: vmovd %esi, %xmm0
1411 ; X64-AVX512-NEXT: vpbroadcastb %xmm0, %xmm0
1412 ; X64-AVX512-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
1413 ; X64-AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
1414 ; X64-AVX512-NEXT: paddd %mm0, %mm0
1415 ; X64-AVX512-NEXT: movq %mm0, (%rdi)
1416 ; X64-AVX512-NEXT: retq
602 ; X86-SSE-LABEL: build_v8i8_00000000:
603 ; X86-SSE: # %bb.0:
604 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
605 ; X86-SSE-NEXT: movd {{[0-9]+}}(%esp), %mm0
606 ; X86-SSE-NEXT: punpcklbw %mm0, %mm0 # mm0 = mm0[0,0,1,1,2,2,3,3]
607 ; X86-SSE-NEXT: pshufw $0, %mm0, %mm0 # mm0 = mm0[0,0,0,0]
608 ; X86-SSE-NEXT: paddd %mm0, %mm0
609 ; X86-SSE-NEXT: movq %mm0, (%eax)
610 ; X86-SSE-NEXT: retl
611 ;
612 ; X64-LABEL: build_v8i8_00000000:
613 ; X64: # %bb.0:
614 ; X64-NEXT: movd %esi, %mm0
615 ; X64-NEXT: punpcklbw %mm0, %mm0 # mm0 = mm0[0,0,1,1,2,2,3,3]
616 ; X64-NEXT: pshufw $0, %mm0, %mm0 # mm0 = mm0[0,0,0,0]
617 ; X64-NEXT: paddd %mm0, %mm0
618 ; X64-NEXT: movq %mm0, (%rdi)
619 ; X64-NEXT: retq
1417620 %1 = insertelement <8 x i8> undef, i8 %a0, i32 0
1418621 %2 = insertelement <8 x i8> %1, i8 %a0, i32 1
1419622 %3 = insertelement <8 x i8> %2, i8 %a0, i32 2
1435638 define void @build_v2f32_01(x86_mmx *%p0, float %a0, float %a1) nounwind {
1436639 ; X86-MMX-LABEL: build_v2f32_01:
1437640 ; X86-MMX: # %bb.0:
1438 ; X86-MMX-NEXT: pushl %ebp
1439 ; X86-MMX-NEXT: movl %esp, %ebp
1440 ; X86-MMX-NEXT: andl $-8, %esp
1441 ; X86-MMX-NEXT: subl $8, %esp
1442 ; X86-MMX-NEXT: movl 8(%ebp), %eax
1443 ; X86-MMX-NEXT: flds 12(%ebp)
1444 ; X86-MMX-NEXT: flds 16(%ebp)
1445 ; X86-MMX-NEXT: fstps {{[0-9]+}}(%esp)
1446 ; X86-MMX-NEXT: fstps (%esp)
1447 ; X86-MMX-NEXT: movq (%esp), %mm0
1448 ; X86-MMX-NEXT: paddd %mm0, %mm0
1449 ; X86-MMX-NEXT: movq %mm0, (%eax)
1450 ; X86-MMX-NEXT: movl %ebp, %esp
1451 ; X86-MMX-NEXT: popl %ebp
641 ; X86-MMX-NEXT: movl {{[0-9]+}}(%esp), %eax
642 ; X86-MMX-NEXT: movd {{[0-9]+}}(%esp), %mm0
643 ; X86-MMX-NEXT: movd {{[0-9]+}}(%esp), %mm1
644 ; X86-MMX-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
645 ; X86-MMX-NEXT: paddd %mm1, %mm1
646 ; X86-MMX-NEXT: movq %mm1, (%eax)
1452647 ; X86-MMX-NEXT: retl
1453648 ;
1454649 ; X86-SSE-LABEL: build_v2f32_01:
1455650 ; X86-SSE: # %bb.0:
1456 ; X86-SSE-NEXT: pushl %ebp
1457 ; X86-SSE-NEXT: movl %esp, %ebp
1458 ; X86-SSE-NEXT: andl $-16, %esp
1459 ; X86-SSE-NEXT: subl $32, %esp
1460 ; X86-SSE-NEXT: movl 8(%ebp), %eax
1461 ; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
1462 ; X86-SSE-NEXT: movaps %xmm0, (%esp)
1463 ; X86-SSE-NEXT: movq (%esp), %mm0
1464 ; X86-SSE-NEXT: paddd %mm0, %mm0
1465 ; X86-SSE-NEXT: movq %mm0, (%eax)
1466 ; X86-SSE-NEXT: movl %ebp, %esp
1467 ; X86-SSE-NEXT: popl %ebp
651 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
652 ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
653 ; X86-SSE-NEXT: movdq2q %xmm0, %mm0
654 ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
655 ; X86-SSE-NEXT: movdq2q %xmm0, %mm1
656 ; X86-SSE-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
657 ; X86-SSE-NEXT: paddd %mm1, %mm1
658 ; X86-SSE-NEXT: movq %mm1, (%eax)
1468659 ; X86-SSE-NEXT: retl
1469660 ;
1470 ; X64-SSE-LABEL: build_v2f32_01:
1471 ; X64-SSE: # %bb.0:
1472 ; X64-SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1473 ; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
1474 ; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
1475 ; X64-SSE-NEXT: paddd %mm0, %mm0
1476 ; X64-SSE-NEXT: movq %mm0, (%rdi)
1477 ; X64-SSE-NEXT: retq
1478 ;
1479 ; X64-AVX-LABEL: build_v2f32_01:
1480 ; X64-AVX: # %bb.0:
1481 ; X64-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
1482 ; X64-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1483 ; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
1484 ; X64-AVX-NEXT: paddd %mm0, %mm0
1485 ; X64-AVX-NEXT: movq %mm0, (%rdi)
1486 ; X64-AVX-NEXT: retq
661 ; X64-LABEL: build_v2f32_01:
662 ; X64: # %bb.0:
663 ; X64-NEXT: movdq2q %xmm1, %mm0
664 ; X64-NEXT: movdq2q %xmm0, %mm1
665 ; X64-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
666 ; X64-NEXT: paddd %mm1, %mm1
667 ; X64-NEXT: movq %mm1, (%rdi)
668 ; X64-NEXT: retq
1487669 %1 = insertelement <2 x float> undef, float %a0, i32 0
1488670 %2 = insertelement <2 x float> %1, float %a1, i32 1
1489671 %3 = bitcast <2 x float> %2 to x86_mmx
1495677 define void @build_v2f32_0z(x86_mmx *%p0, float %a0, float %a1) nounwind {
1496678 ; X86-MMX-LABEL: build_v2f32_0z:
1497679 ; X86-MMX: # %bb.0:
1498 ; X86-MMX-NEXT: pushl %ebp
1499 ; X86-MMX-NEXT: movl %esp, %ebp
1500 ; X86-MMX-NEXT: andl $-8, %esp
1501 ; X86-MMX-NEXT: subl $8, %esp
1502 ; X86-MMX-NEXT: movl 8(%ebp), %eax
1503 ; X86-MMX-NEXT: flds 12(%ebp)
1504 ; X86-MMX-NEXT: fstps (%esp)
1505 ; X86-MMX-NEXT: movl $0, {{[0-9]+}}(%esp)
1506 ; X86-MMX-NEXT: movq (%esp), %mm0
1507 ; X86-MMX-NEXT: paddd %mm0, %mm0
1508 ; X86-MMX-NEXT: movq %mm0, (%eax)
1509 ; X86-MMX-NEXT: movl %ebp, %esp
1510 ; X86-MMX-NEXT: popl %ebp
680 ; X86-MMX-NEXT: movl {{[0-9]+}}(%esp), %eax
681 ; X86-MMX-NEXT: pxor %mm0, %mm0
682 ; X86-MMX-NEXT: movd {{[0-9]+}}(%esp), %mm1
683 ; X86-MMX-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
684 ; X86-MMX-NEXT: paddd %mm1, %mm1
685 ; X86-MMX-NEXT: movq %mm1, (%eax)
1511686 ; X86-MMX-NEXT: retl
1512687 ;
1513688 ; X86-SSE-LABEL: build_v2f32_0z:
1514689 ; X86-SSE: # %bb.0:
1515 ; X86-SSE-NEXT: pushl %ebp
1516 ; X86-SSE-NEXT: movl %esp, %ebp
1517 ; X86-SSE-NEXT: andl $-16, %esp
1518 ; X86-SSE-NEXT: subl $32, %esp
1519 ; X86-SSE-NEXT: movl 8(%ebp), %eax
690 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1520691 ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1521 ; X86-SSE-NEXT: movaps %xmm0, (%esp)
1522 ; X86-SSE-NEXT: movq (%esp), %mm0
692 ; X86-SSE-NEXT: movdq2q %xmm0, %mm0
693 ; X86-SSE-NEXT: pxor %mm1, %mm1
694 ; X86-SSE-NEXT: punpckldq %mm1, %mm0 # mm0 = mm0[0],mm1[0]
1523695 ; X86-SSE-NEXT: paddd %mm0, %mm0
1524696 ; X86-SSE-NEXT: movq %mm0, (%eax)
1525 ; X86-SSE-NEXT: movl %ebp, %esp
1526 ; X86-SSE-NEXT: popl %ebp
1527697 ; X86-SSE-NEXT: retl
1528698 ;
1529 ; X64-SSE-LABEL: build_v2f32_0z:
1530 ; X64-SSE: # %bb.0:
1531 ; X64-SSE-NEXT: xorps %xmm1, %xmm1
1532 ; X64-SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1533 ; X64-SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
1534 ; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
1535 ; X64-SSE-NEXT: paddd %mm0, %mm0
1536 ; X64-SSE-NEXT: movq %mm0, (%rdi)
1537 ; X64-SSE-NEXT: retq
1538 ;
1539 ; X64-AVX1-LABEL: build_v2f32_0z:
1540 ; X64-AVX1: # %bb.0:
1541 ; X64-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
1542 ; X64-AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1543 ; X64-AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1544 ; X64-AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
1545 ; X64-AVX1-NEXT: paddd %mm0, %mm0
1546 ; X64-AVX1-NEXT: movq %mm0, (%rdi)
1547 ; X64-AVX1-NEXT: retq
1548 ;
1549 ; X64-AVX2-LABEL: build_v2f32_0z:
1550 ; X64-AVX2: # %bb.0:
1551 ; X64-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
1552 ; X64-AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1553 ; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1554 ; X64-AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
1555 ; X64-AVX2-NEXT: paddd %mm0, %mm0
1556 ; X64-AVX2-NEXT: movq %mm0, (%rdi)
1557 ; X64-AVX2-NEXT: retq
1558 ;
1559 ; X64-AVX512-LABEL: build_v2f32_0z:
1560 ; X64-AVX512: # %bb.0:
1561 ; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
1562 ; X64-AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1563 ; X64-AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1564 ; X64-AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
1565 ; X64-AVX512-NEXT: paddd %mm0, %mm0
1566 ; X64-AVX512-NEXT: movq %mm0, (%rdi)
1567 ; X64-AVX512-NEXT: retq
699 ; X64-LABEL: build_v2f32_0z:
700 ; X64: # %bb.0:
701 ; X64-NEXT: movdq2q %xmm0, %mm0
702 ; X64-NEXT: pxor %mm1, %mm1
703 ; X64-NEXT: punpckldq %mm1, %mm0 # mm0 = mm0[0],mm1[0]
704 ; X64-NEXT: paddd %mm0, %mm0
705 ; X64-NEXT: movq %mm0, (%rdi)
706 ; X64-NEXT: retq
1568707 %1 = insertelement <2 x float> undef, float %a0, i32 0
1569708 %2 = insertelement <2 x float> %1, float 0.0, i32 1
1570709 %3 = bitcast <2 x float> %2 to x86_mmx
1576715 define void @build_v2f32_u1(x86_mmx *%p0, float %a0, float %a1) nounwind {
1577716 ; X86-MMX-LABEL: build_v2f32_u1:
1578717 ; X86-MMX: # %bb.0:
1579 ; X86-MMX-NEXT: pushl %ebp
1580 ; X86-MMX-NEXT: movl %esp, %ebp
1581 ; X86-MMX-NEXT: andl $-8, %esp
1582 ; X86-MMX-NEXT: subl $8, %esp
1583 ; X86-MMX-NEXT: movl 8(%ebp), %eax
1584 ; X86-MMX-NEXT: flds 16(%ebp)
1585 ; X86-MMX-NEXT: fstps {{[0-9]+}}(%esp)
1586 ; X86-MMX-NEXT: movq (%esp), %mm0
718 ; X86-MMX-NEXT: movl {{[0-9]+}}(%esp), %eax
719 ; X86-MMX-NEXT: movd {{[0-9]+}}(%esp), %mm0
720 ; X86-MMX-NEXT: punpckldq %mm0, %mm0 # mm0 = mm0[0,0]
1587721 ; X86-MMX-NEXT: paddd %mm0, %mm0
1588722 ; X86-MMX-NEXT: movq %mm0, (%eax)
1589 ; X86-MMX-NEXT: movl %ebp, %esp
1590 ; X86-MMX-NEXT: popl %ebp
1591723 ; X86-MMX-NEXT: retl
1592724 ;
1593 ; X86-SSE2-LABEL: build_v2f32_u1:
1594 ; X86-SSE2: # %bb.0:
1595 ; X86-SSE2-NEXT: pushl %ebp
1596 ; X86-SSE2-NEXT: movl %esp, %ebp
1597 ; X86-SSE2-NEXT: andl $-16, %esp
1598 ; X86-SSE2-NEXT: subl $32, %esp
1599 ; X86-SSE2-NEXT: movl 8(%ebp), %eax
1600 ; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1601 ; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,2,3]
1602 ; X86-SSE2-NEXT: movaps %xmm0, (%esp)
1603 ; X86-SSE2-NEXT: movq (%esp), %mm0
1604 ; X86-SSE2-NEXT: paddd %mm0, %mm0
1605 ; X86-SSE2-NEXT: movq %mm0, (%eax)
1606 ; X86-SSE2-NEXT: movl %ebp, %esp
1607 ; X86-SSE2-NEXT: popl %ebp
1608 ; X86-SSE2-NEXT: retl
1609 ;
1610 ; X86-SSSE3-LABEL: build_v2f32_u1:
1611 ; X86-SSSE3: # %bb.0:
1612 ; X86-SSSE3-NEXT: pushl %ebp
1613 ; X86-SSSE3-NEXT: movl %esp, %ebp
1614 ; X86-SSSE3-NEXT: andl $-16, %esp
1615 ; X86-SSSE3-NEXT: subl $32, %esp
1616 ; X86-SSSE3-NEXT: movl 8(%ebp), %eax
1617 ; X86-SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1618 ; X86-SSSE3-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
1619 ; X86-SSSE3-NEXT: movaps %xmm0, (%esp)
1620 ; X86-SSSE3-NEXT: movq (%esp), %mm0
1621 ; X86-SSSE3-NEXT: paddd %mm0, %mm0
1622 ; X86-SSSE3-NEXT: movq %mm0, (%eax)
1623 ; X86-SSSE3-NEXT: movl %ebp, %esp
1624 ; X86-SSSE3-NEXT: popl %ebp
1625 ; X86-SSSE3-NEXT: retl
1626 ;
1627 ; X64-SSE2-LABEL: build_v2f32_u1:
1628 ; X64-SSE2: # %bb.0:
1629 ; X64-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,2,3]
1630 ; X64-SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
1631 ; X64-SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
1632 ; X64-SSE2-NEXT: paddd %mm0, %mm0
1633 ; X64-SSE2-NEXT: movq %mm0, (%rdi)
1634 ; X64-SSE2-NEXT: retq
1635 ;
1636 ; X64-SSSE3-LABEL: build_v2f32_u1:
1637 ; X64-SSSE3: # %bb.0:
1638 ; X64-SSSE3-NEXT: movsldup {{.*#+}} xmm0 = xmm1[0,0,2,2]
1639 ; X64-SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
1640 ; X64-SSSE3-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
1641 ; X64-SSSE3-NEXT: paddd %mm0, %mm0
1642 ; X64-SSSE3-NEXT: movq %mm0, (%rdi)
1643 ; X64-SSSE3-NEXT: retq
1644 ;
1645 ; X64-AVX1-LABEL: build_v2f32_u1:
1646 ; X64-AVX1: # %bb.0:
1647 ; X64-AVX1-NEXT: vmovsldup {{.*#+}} xmm0 = xmm1[0,0,2,2]
1648 ; X64-AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1649 ; X64-AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
1650 ; X64-AVX1-NEXT: paddd %mm0, %mm0
1651 ; X64-AVX1-NEXT: movq %mm0, (%rdi)
1652 ; X64-AVX1-NEXT: retq
1653 ;
1654 ; X64-AVX2-LABEL: build_v2f32_u1:
1655 ; X64-AVX2: # %bb.0:
1656 ; X64-AVX2-NEXT: vbroadcastss %xmm1, %xmm0
1657 ; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1658 ; X64-AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
1659 ; X64-AVX2-NEXT: paddd %mm0, %mm0
1660 ; X64-AVX2-NEXT: movq %mm0, (%rdi)
1661 ; X64-AVX2-NEXT: retq
1662 ;
1663 ; X64-AVX512-LABEL: build_v2f32_u1:
1664 ; X64-AVX512: # %bb.0:
1665 ; X64-AVX512-NEXT: vbroadcastss %xmm1, %xmm0
1666 ; X64-AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1667 ; X64-AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
1668 ; X64-AVX512-NEXT: paddd %mm0, %mm0
1669 ; X64-AVX512-NEXT: movq %mm0, (%rdi)
1670 ; X64-AVX512-NEXT: retq
725 ; X86-SSE-LABEL: build_v2f32_u1:
726 ; X86-SSE: # %bb.0:
727 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
728 ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
729 ; X86-SSE-NEXT: movdq2q %xmm0, %mm0
730 ; X86-SSE-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
731 ; X86-SSE-NEXT: paddd %mm0, %mm0
732 ; X86-SSE-NEXT: movq %mm0, (%eax)
733 ; X86-SSE-NEXT: retl
734 ;
735 ; X64-LABEL: build_v2f32_u1:
736 ; X64: # %bb.0:
737 ; X64-NEXT: movdq2q %xmm1, %mm0
738 ; X64-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
739 ; X64-NEXT: paddd %mm0, %mm0
740 ; X64-NEXT: movq %mm0, (%rdi)
741 ; X64-NEXT: retq
1671742 %1 = insertelement <2 x float> undef, float undef, i32 0
1672743 %2 = insertelement <2 x float> %1, float %a1, i32 1
1673744 %3 = bitcast <2 x float> %2 to x86_mmx
1679750 define void @build_v2f32_z1(x86_mmx *%p0, float %a0, float %a1) nounwind {
1680751 ; X86-MMX-LABEL: build_v2f32_z1:
1681752 ; X86-MMX: # %bb.0:
1682 ; X86-MMX-NEXT: pushl %ebp
1683 ; X86-MMX-NEXT: movl %esp, %ebp
1684 ; X86-MMX-NEXT: andl $-8, %esp
1685 ; X86-MMX-NEXT: subl $8, %esp
1686 ; X86-MMX-NEXT: movl 8(%ebp), %eax
1687 ; X86-MMX-NEXT: flds 16(%ebp)
1688 ; X86-MMX-NEXT: fstps {{[0-9]+}}(%esp)
1689 ; X86-MMX-NEXT: movl $0, (%esp)
1690 ; X86-MMX-NEXT: movq (%esp), %mm0
1691 ; X86-MMX-NEXT: paddd %mm0, %mm0
1692 ; X86-MMX-NEXT: movq %mm0, (%eax)
1693 ; X86-MMX-NEXT: movl %ebp, %esp
1694 ; X86-MMX-NEXT: popl %ebp
753 ; X86-MMX-NEXT: movl {{[0-9]+}}(%esp), %eax
754 ; X86-MMX-NEXT: movd {{[0-9]+}}(%esp), %mm0
755 ; X86-MMX-NEXT: pxor %mm1, %mm1
756 ; X86-MMX-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
757 ; X86-MMX-NEXT: paddd %mm1, %mm1
758 ; X86-MMX-NEXT: movq %mm1, (%eax)
1695759 ; X86-MMX-NEXT: retl
1696760 ;
1697761 ; X86-SSE-LABEL: build_v2f32_z1:
1698762 ; X86-SSE: # %bb.0:
1699 ; X86-SSE-NEXT: pushl %ebp
1700 ; X86-SSE-NEXT: movl %esp, %ebp
1701 ; X86-SSE-NEXT: andl $-16, %esp
1702 ; X86-SSE-NEXT: subl $32, %esp
1703 ; X86-SSE-NEXT: movl 8(%ebp), %eax
763 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
1704764 ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1705 ; X86-SSE-NEXT: xorps %xmm1, %xmm1
1706 ; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
1707 ; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1708 ; X86-SSE-NEXT: movaps %xmm0, (%esp)
1709 ; X86-SSE-NEXT: movq (%esp), %mm0
1710 ; X86-SSE-NEXT: paddd %mm0, %mm0
1711 ; X86-SSE-NEXT: movq %mm0, (%eax)
1712 ; X86-SSE-NEXT: movl %ebp, %esp
1713 ; X86-SSE-NEXT: popl %ebp
765 ; X86-SSE-NEXT: movdq2q %xmm0, %mm0
766 ; X86-SSE-NEXT: pxor %mm1, %mm1
767 ; X86-SSE-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
768 ; X86-SSE-NEXT: paddd %mm1, %mm1
769 ; X86-SSE-NEXT: movq %mm1, (%eax)
1714770 ; X86-SSE-NEXT: retl
1715771 ;
1716 ; X64-SSE-LABEL: build_v2f32_z1:
1717 ; X64-SSE: # %bb.0:
1718 ; X64-SSE-NEXT: xorps %xmm0, %xmm0
1719 ; X64-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
1720 ; X64-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
1721 ; X64-SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
1722 ; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
1723 ; X64-SSE-NEXT: paddd %mm0, %mm0
1724 ; X64-SSE-NEXT: movq %mm0, (%rdi)
1725 ; X64-SSE-NEXT: retq
1726 ;
1727 ; X64-AVX-LABEL: build_v2f32_z1:
1728 ; X64-AVX: # %bb.0:
1729 ; X64-AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],zero,zero
1730 ; X64-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1731 ; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
1732 ; X64-AVX-NEXT: paddd %mm0, %mm0
1733 ; X64-AVX-NEXT: movq %mm0, (%rdi)
1734 ; X64-AVX-NEXT: retq
772 ; X64-LABEL: build_v2f32_z1:
773 ; X64: # %bb.0:
774 ; X64-NEXT: movdq2q %xmm1, %mm0
775 ; X64-NEXT: pxor %mm1, %mm1
776 ; X64-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
777 ; X64-NEXT: paddd %mm1, %mm1
778 ; X64-NEXT: movq %mm1, (%rdi)
779 ; X64-NEXT: retq
1735780 %1 = insertelement <2 x float> undef, float 0.0, i32 0
1736781 %2 = insertelement <2 x float> %1, float %a1, i32 1
1737782 %3 = bitcast <2 x float> %2 to x86_mmx
1743788 define void @build_v2f32_00(x86_mmx *%p0, float %a0, float %a1) nounwind {
1744789 ; X86-MMX-LABEL: build_v2f32_00:
1745790 ; X86-MMX: # %bb.0:
1746 ; X86-MMX-NEXT: pushl %ebp
1747 ; X86-MMX-NEXT: movl %esp, %ebp
1748 ; X86-MMX-NEXT: andl $-8, %esp
1749 ; X86-MMX-NEXT: subl $8, %esp
1750 ; X86-MMX-NEXT: movl 8(%ebp), %eax
1751 ; X86-MMX-NEXT: flds 12(%ebp)
1752 ; X86-MMX-NEXT: fsts {{[0-9]+}}(%esp)
1753 ; X86-MMX-NEXT: fstps (%esp)
1754 ; X86-MMX-NEXT: movq (%esp), %mm0
791 ; X86-MMX-NEXT: movl {{[0-9]+}}(%esp), %eax
792 ; X86-MMX-NEXT: movd {{[0-9]+}}(%esp), %mm0
793 ; X86-MMX-NEXT: punpckldq %mm0, %mm0 # mm0 = mm0[0,0]
1755794 ; X86-MMX-NEXT: paddd %mm0, %mm0
1756795 ; X86-MMX-NEXT: movq %mm0, (%eax)
1757 ; X86-MMX-NEXT: movl %ebp, %esp
1758 ; X86-MMX-NEXT: popl %ebp
1759796 ; X86-MMX-NEXT: retl
1760797 ;
1761 ; X86-SSE2-LABEL: build_v2f32_00:
1762 ; X86-SSE2: # %bb.0:
1763 ; X86-SSE2-NEXT: pushl %ebp
1764 ; X86-SSE2-NEXT: movl %esp, %ebp
1765 ; X86-SSE2-NEXT: andl $-16, %esp
1766 ; X86-SSE2-NEXT: subl $32, %esp
1767 ; X86-SSE2-NEXT: movl 8(%ebp), %eax
1768 ; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1769 ; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,2,3]
1770 ; X86-SSE2-NEXT: movaps %xmm0, (%esp)
1771 ; X86-SSE2-NEXT: movq (%esp), %mm0
1772 ; X86-SSE2-NEXT: paddd %mm0, %mm0
1773 ; X86-SSE2-NEXT: movq %mm0, (%eax)
1774 ; X86-SSE2-NEXT: movl %ebp, %esp
1775 ; X86-SSE2-NEXT: popl %ebp
1776 ; X86-SSE2-NEXT: retl
1777 ;
1778 ; X86-SSSE3-LABEL: build_v2f32_00:
1779 ; X86-SSSE3: # %bb.0:
1780 ; X86-SSSE3-NEXT: pushl %ebp
1781 ; X86-SSSE3-NEXT: movl %esp, %ebp
1782 ; X86-SSSE3-NEXT: andl $-16, %esp
1783 ; X86-SSSE3-NEXT: subl $32, %esp
1784 ; X86-SSSE3-NEXT: movl 8(%ebp), %eax
1785 ; X86-SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1786 ; X86-SSSE3-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
1787 ; X86-SSSE3-NEXT: movaps %xmm0, (%esp)
1788 ; X86-SSSE3-NEXT: movq (%esp), %mm0
1789 ; X86-SSSE3-NEXT: paddd %mm0, %mm0
1790 ; X86-SSSE3-NEXT: movq %mm0, (%eax)
1791 ; X86-SSSE3-NEXT: movl %ebp, %esp
1792 ; X86-SSSE3-NEXT: popl %ebp
1793 ; X86-SSSE3-NEXT: retl
1794 ;
1795 ; X64-SSE2-LABEL: build_v2f32_00:
1796 ; X64-SSE2: # %bb.0:
1797 ; X64-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,2,3]
1798 ; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
1799 ; X64-SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
1800 ; X64-SSE2-NEXT: paddd %mm0, %mm0
1801 ; X64-SSE2-NEXT: movq %mm0, (%rdi)
1802 ; X64-SSE2-NEXT: retq
1803 ;
1804 ; X64-SSSE3-LABEL: build_v2f32_00:
1805 ; X64-SSSE3: # %bb.0:
1806 ; X64-SSSE3-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
1807 ; X64-SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
1808 ; X64-SSSE3-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
1809 ; X64-SSSE3-NEXT: paddd %mm0, %mm0
1810 ; X64-SSSE3-NEXT: movq %mm0, (%rdi)
1811 ; X64-SSSE3-NEXT: retq
1812 ;
1813 ; X64-AVX1-LABEL: build_v2f32_00:
1814 ; X64-AVX1: # %bb.0:
1815 ; X64-AVX1-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
1816 ; X64-AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1817 ; X64-AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
1818 ; X64-AVX1-NEXT: paddd %mm0, %mm0
1819 ; X64-AVX1-NEXT: movq %mm0, (%rdi)
1820 ; X64-AVX1-NEXT: retq
1821 ;
1822 ; X64-AVX2-LABEL: build_v2f32_00:
1823 ; X64-AVX2: # %bb.0:
1824 ; X64-AVX2-NEXT: vbroadcastss %xmm0, %xmm0
1825 ; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1826 ; X64-AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
1827 ; X64-AVX2-NEXT: paddd %mm0, %mm0
1828 ; X64-AVX2-NEXT: movq %mm0, (%rdi)
1829 ; X64-AVX2-NEXT: retq
1830 ;
1831 ; X64-AVX512-LABEL: build_v2f32_00:
1832 ; X64-AVX512: # %bb.0:
1833 ; X64-AVX512-NEXT: vbroadcastss %xmm0, %xmm0
1834 ; X64-AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1835 ; X64-AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
1836 ; X64-AVX512-NEXT: paddd %mm0, %mm0
1837 ; X64-AVX512-NEXT: movq %mm0, (%rdi)
1838 ; X64-AVX512-NEXT: retq
798 ; X86-SSE-LABEL: build_v2f32_00:
799 ; X86-SSE: # %bb.0:
800 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
801 ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
802 ; X86-SSE-NEXT: movdq2q %xmm0, %mm0
803 ; X86-SSE-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
804 ; X86-SSE-NEXT: paddd %mm0, %mm0
805 ; X86-SSE-NEXT: movq %mm0, (%eax)
806 ; X86-SSE-NEXT: retl
807 ;
808 ; X64-LABEL: build_v2f32_00:
809 ; X64: # %bb.0:
810 ; X64-NEXT: movdq2q %xmm0, %mm0
811 ; X64-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
812 ; X64-NEXT: paddd %mm0, %mm0
813 ; X64-NEXT: movq %mm0, (%rdi)
814 ; X64-NEXT: retq
1839815 %1 = insertelement <2 x float> undef, float %a0, i32 0
1840816 %2 = insertelement <2 x float> %1, float %a0, i32 1
1841817 %3 = bitcast <2 x float> %2 to x86_mmx
99 ; X86-SSE-NEXT: pushl %ebp
1010 ; X86-SSE-NEXT: movl %esp, %ebp
1111 ; X86-SSE-NEXT: andl $-8, %esp
12 ; X86-SSE-NEXT: subl $16, %esp
13 ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
14 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
15 ; X86-SSE-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
16 ; X86-SSE-NEXT: movq {{[0-9]+}}(%esp), %mm0
12 ; X86-SSE-NEXT: subl $8, %esp
13 ; X86-SSE-NEXT: movd 8(%ebp), %mm0
14 ; X86-SSE-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
1715 ; X86-SSE-NEXT: packsswb %mm0, %mm0
1816 ; X86-SSE-NEXT: movq %mm0, (%esp)
1917 ; X86-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
2826 ; X86-AVX-NEXT: pushl %ebp
2927 ; X86-AVX-NEXT: movl %esp, %ebp
3028 ; X86-AVX-NEXT: andl $-8, %esp
31 ; X86-AVX-NEXT: subl $16, %esp
32 ; X86-AVX-NEXT: vbroadcastss 8(%ebp), %xmm0
33 ; X86-AVX-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
34 ; X86-AVX-NEXT: movq {{[0-9]+}}(%esp), %mm0
29 ; X86-AVX-NEXT: subl $8, %esp
30 ; X86-AVX-NEXT: movd 8(%ebp), %mm0
31 ; X86-AVX-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
3532 ; X86-AVX-NEXT: packsswb %mm0, %mm0
3633 ; X86-AVX-NEXT: movq %mm0, (%esp)
3734 ; X86-AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
4340 ;
4441 ; X64-SSE-LABEL: PR29222:
4542 ; X64-SSE: # %bb.0:
46 ; X64-SSE-NEXT: movd %edi, %xmm0
47 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
48 ; X64-SSE-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
49 ; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
43 ; X64-SSE-NEXT: movd %edi, %mm0
44 ; X64-SSE-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
5045 ; X64-SSE-NEXT: packsswb %mm0, %mm0
5146 ; X64-SSE-NEXT: movq2dq %mm0, %xmm0
5247 ; X64-SSE-NEXT: packsswb %xmm0, %xmm0
5550 ;
5651 ; X64-AVX-LABEL: PR29222:
5752 ; X64-AVX: # %bb.0:
58 ; X64-AVX-NEXT: vmovd %edi, %xmm0
59 ; X64-AVX-NEXT: vpbroadcastd %xmm0, %xmm0
60 ; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
61 ; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
53 ; X64-AVX-NEXT: movd %edi, %mm0
54 ; X64-AVX-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
6255 ; X64-AVX-NEXT: packsswb %mm0, %mm0
6356 ; X64-AVX-NEXT: movq2dq %mm0, %xmm0
6457 ; X64-AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
55 define x86_mmx @t0(i32 %A) nounwind {
66 ; X32-LABEL: t0:
77 ; X32: ## %bb.0:
8 ; X32-NEXT: subl $12, %esp
9 ; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
10 ; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
11 ; X32-NEXT: movq %xmm0, (%esp)
12 ; X32-NEXT: movq (%esp), %mm0
13 ; X32-NEXT: addl $12, %esp
8 ; X32-NEXT: movd {{[0-9]+}}(%esp), %mm1
9 ; X32-NEXT: pxor %mm0, %mm0
10 ; X32-NEXT: punpckldq %mm1, %mm0 ## mm0 = mm0[0],mm1[0]
1411 ; X32-NEXT: retl
1512 ;
1613 ; X64-LABEL: t0: