llvm.org GIT mirror llvm / 895148a
[DAGCombiner] narrow vector binops when extraction is cheap

Narrowing vector binops came up in the demanded bits discussion in D52912. I don't think we're going to be able to do this transform in IR as a canonicalization because of the risk of creating unsupported widths for vector ops, but we already have a DAG TLI hook to allow what I was hoping for: isExtractSubvectorCheap(). This is currently enabled for x86, ARM, and AArch64 (although only x86 has existing regression test diffs).

This is artificially limited to not look through bitcasts because there are so many test diffs already, but that's marked with a TODO and is a small follow-up.

Differential Revision: https://reviews.llvm.org/D53784

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@345602 91177308-0d34-0410-b5e6-96231b3b80d8

Sanjay Patel, 1 year, 3 months ago
24 changed files with 392 additions and 473 deletions.
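To make the pattern concrete, here is a hypothetical reduced example of the case the existing combine already handled (the function name and types are mine, not taken from the commit's tests): one operand of the wide fadd is a concatenation of two narrow vectors, so extracting half of the wide result can be rewritten as a narrow fadd that skips the concatenation and extraction entirely.

define <4 x float> @extract_lo_of_concat_fadd(<4 x float> %a0, <4 x float> %a1, <8 x float> %b) {
  ; %cat becomes a concat_vectors node in the DAG.
  %cat = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %wide = fadd <8 x float> %cat, %b
  ; Extracting the low half becomes an extract_subvector of the wide fadd.
  %lo = shufflevector <8 x float> %wide, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %lo
}

Roughly speaking, the half-width shuffles above turn into concat_vectors and extract_subvector nodes during DAG building, which is the shape narrowExtractedVectorBinOp() matches; the new part of this patch is that the concat operand is no longer required when the target says the extract itself is cheap.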
1667216672 return SDValue();
1667316673 }
1667416674
16675 /// If we are extracting a subvector produced by a wide binary operator with
16676 /// at least one operand that was the result of a vector concatenation, then try
16677 /// to use the narrow vector operands directly to avoid the concatenation and
16678 /// extraction.
16675 /// If we are extracting a subvector produced by a wide binary operator try
16676 /// to use a narrow binary operator and/or avoid concatenation and extraction.
1667916677 static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {
1668016678 // TODO: Refactor with the caller (visitEXTRACT_SUBVECTOR), so we can share
1668116679 // some of these bailouts with other transforms.
1669616694 if (!WideBVT.isVector())
1669716695 return SDValue();
1669816696
16699 // Bail out if the target does not support a narrower version of the binop.
16700 unsigned BOpcode = BinOp.getOpcode();
16701 EVT NarrowBVT = EVT::getVectorVT(*DAG.getContext(), WideBVT.getScalarType(),
16702 WideBVT.getVectorNumElements() / 2);
16703 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16704 if (!TLI.isOperationLegalOrCustomOrPromote(BOpcode, NarrowBVT))
16705 return SDValue();
16706
16707 // Only handle the case where we are doubling and then halving. A larger ratio
16708 // may require more than two narrow binops to replace the wide binop.
1670916697 EVT VT = Extract->getValueType(0);
1671016698 unsigned NumElems = VT.getVectorNumElements();
1671116699 unsigned ExtractIndex = ExtractIndexC->getZExtValue();
1671216700 assert(ExtractIndex % NumElems == 0 &&
1671316701 "Extract index is not a multiple of the vector length.");
16714 if (Extract->getOperand(0).getValueSizeInBits() != VT.getSizeInBits() * 2)
16702 EVT SrcVT = Extract->getOperand(0).getValueType();
16703 unsigned NumSrcElems = SrcVT.getVectorNumElements();
16704 unsigned NarrowingRatio = NumSrcElems / NumElems;
16705
16706 // Bail out if the target does not support a narrower version of the binop.
16707 unsigned BOpcode = BinOp.getOpcode();
16708 unsigned WideNumElts = WideBVT.getVectorNumElements();
16709 EVT NarrowBVT = EVT::getVectorVT(*DAG.getContext(), WideBVT.getScalarType(),
16710 WideNumElts / NarrowingRatio);
16711 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16712 if (!TLI.isOperationLegalOrCustomOrPromote(BOpcode, NarrowBVT))
16713 return SDValue();
16714
16715 // If extraction is cheap, we don't need to look at the binop operands
16716 // for concat ops. The narrow binop alone makes this transform profitable.
16717 // TODO: We're not dealing with the bitcasted pattern here. That limitation
16718 // should be lifted.
16719 if (Extract->getOperand(0) == BinOp && BinOp.hasOneUse() &&
16720 TLI.isExtractSubvectorCheap(NarrowBVT, WideBVT, ExtractIndex)) {
16721 // extract (binop B0, B1), N --> binop (extract B0, N), (extract B1, N)
16722 SDLoc DL(Extract);
16723 SDValue X = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
16724 BinOp.getOperand(0), Extract->getOperand(1));
16725 SDValue Y = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
16726 BinOp.getOperand(1), Extract->getOperand(1));
16727 return DAG.getNode(BOpcode, DL, NarrowBVT, X, Y,
16728 BinOp.getNode()->getFlags());
16729 }
16730
16731 // Only handle the case where we are doubling and then halving. A larger ratio
16732 // may require more than two narrow binops to replace the wide binop.
16733 if (NarrowingRatio != 2)
1671516734 return SDValue();
1671616735
1671716736 // TODO: The motivating case for this transform is an x86 AVX1 target. That
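As a sketch of what the new cheap-extraction path adds (again a hypothetical function, not one of the updated regression tests): neither operand of the wide add below is a concatenation, but on a target where isExtractSubvectorCheap() returns true for this extract (x86, ARM, and AArch64 per the commit message), the single-use wide binop plus extract should now be replaced by two subvector extracts feeding a 128-bit add, which is the same ymm/zmm-to-xmm shrinking visible in the test diffs that follow.

define <4 x i32> @extract_hi_of_wide_add(<8 x i32> %a, <8 x i32> %b) {
  %wide = add <8 x i32> %a, %b
  ; extract (add %a, %b), 4 --> add (extract %a, 4), (extract %b, 4)
  %hi = shufflevector <8 x i32> %wide, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  ret <4 x i32> %hi
}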
66 define void @func(<4 x float> %a, <16 x i8> %b, <16 x i8> %c, <8 x float> %d, <8 x float> %e, <8 x float>* %f) nounwind ssp {
77 ; CHECK-LABEL: func:
88 ; CHECK: ## %bb.0:
9 ; CHECK-NEXT: vmovdqu 0, %xmm3
10 ; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
11 ; CHECK-NEXT: vpalignr {{.*#+}} xmm1 = xmm3[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
12 ; CHECK-NEXT: vmovdqu 32, %xmm3
13 ; CHECK-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3]
14 ; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
15 ; CHECK-NEXT: vmulps %ymm0, %ymm0, %ymm0
16 ; CHECK-NEXT: vmulps %ymm1, %ymm1, %ymm1
17 ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
18 ; CHECK-NEXT: vaddps %ymm0, %ymm0, %ymm0
9 ; CHECK-NEXT: vmovdqu 0, %xmm0
10 ; CHECK-NEXT: vpalignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
11 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
12 ; CHECK-NEXT: vmulps %xmm1, %xmm1, %xmm1
13 ; CHECK-NEXT: vmulps %xmm0, %xmm0, %xmm0
14 ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
15 ; CHECK-NEXT: vaddps %xmm0, %xmm0, %xmm0
1916 ; CHECK-NEXT: vmulps %xmm0, %xmm0, %xmm0
2017 ; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
21 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
2218 ; CHECK-NEXT: vaddps %ymm0, %ymm0, %ymm0
2319 ; CHECK-NEXT: vhaddps %ymm4, %ymm0, %ymm0
2420 ; CHECK-NEXT: vsubps %ymm0, %ymm0, %ymm0
25 ; CHECK-NEXT: vhaddps %ymm0, %ymm1, %ymm0
21 ; CHECK-NEXT: vhaddps %ymm0, %ymm2, %ymm0
2622 ; CHECK-NEXT: vmovaps %ymm0, (%rdi)
2723 ; CHECK-NEXT: vzeroupper
2824 ; CHECK-NEXT: retq
337337 define <8 x i32> @andn_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z) {
338338 ; AVX1-LABEL: andn_disguised_i8_elts:
339339 ; AVX1: # %bb.0:
340 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
341 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
342 ; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3
340 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm3
341 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
342 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
343343 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
344344 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255]
345345 ; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0
346 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
347 ; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0
346348 ; AVX1-NEXT: vpandn %xmm1, %xmm3, %xmm1
347 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
348 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
349 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
350 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
349 ; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
350 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
351351 ; AVX1-NEXT: retq
352352 ;
353353 ; INT256-LABEL: andn_disguised_i8_elts:
416416 define <8 x i32> @andn_variable_mask_operand_concat(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z, <8 x i32> %w) {
417417 ; AVX1-LABEL: andn_variable_mask_operand_concat:
418418 ; AVX1: # %bb.0:
419 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
420 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
421 ; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4
422 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
423 ; AVX1-NEXT: vpandn %xmm2, %xmm0, %xmm0
419 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm4
420 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
421 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
422 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
424423 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
425 ; AVX1-NEXT: vpandn %xmm1, %xmm4, %xmm1
426 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
427 ; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
428 ; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0
429 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
424 ; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0
425 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1
426 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
427 ; AVX1-NEXT: vpandn %xmm2, %xmm4, %xmm1
428 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
429 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
430430 ; AVX1-NEXT: retq
431431 ;
432432 ; INT256-LABEL: andn_variable_mask_operand_concat:
9595 define <4 x float> @test02(<8 x float> %a, <8 x float> %b) nounwind {
9696 ; VZ-LABEL: test02:
9797 ; VZ: # %bb.0:
98 ; VZ-NEXT: vaddps %ymm1, %ymm0, %ymm0
99 ; VZ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
98 ; VZ-NEXT: vaddps %xmm1, %xmm0, %xmm0
10099 ; VZ-NEXT: vzeroupper
101100 ; VZ-NEXT: jmp do_sse # TAILCALL
102101 ;
103102 ; FAST-ymm-zmm-LABEL: test02:
104103 ; FAST-ymm-zmm: # %bb.0:
105 ; FAST-ymm-zmm-NEXT: vaddps %ymm1, %ymm0, %ymm0
106 ; FAST-ymm-zmm-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
104 ; FAST-ymm-zmm-NEXT: vaddps %xmm1, %xmm0, %xmm0
107105 ; FAST-ymm-zmm-NEXT: jmp do_sse # TAILCALL
108106 ;
109107 ; BDVER2-LABEL: test02:
110108 ; BDVER2: # %bb.0:
111 ; BDVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0
112 ; BDVER2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
109 ; BDVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0
113110 ; BDVER2-NEXT: vzeroupper
114111 ; BDVER2-NEXT: jmp do_sse # TAILCALL
115112 ;
116113 ; BTVER2-LABEL: test02:
117114 ; BTVER2: # %bb.0:
118 ; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0
119 ; BTVER2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
115 ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0
120116 ; BTVER2-NEXT: jmp do_sse # TAILCALL
121117 %add.i = fadd <8 x float> %a, %b
122118 %add.low = call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %add.i, i8 0)
77 ; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
88 ; KNL-NEXT: vpaddd %zmm1, %zmm0, %zmm0
99 ; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
10 ; KNL-NEXT: vpaddd %zmm1, %zmm0, %zmm0
10 ; KNL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
1111 ; KNL-NEXT: vmovd %xmm0, %eax
1212 ; KNL-NEXT: retq
1313 ;
1616 ; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1717 ; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0
1818 ; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
19 ; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0
19 ; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
2020 ; SKX-NEXT: vmovd %xmm0, %eax
2121 ; SKX-NEXT: vzeroupper
2222 ; SKX-NEXT: retq
3434 ; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
3535 ; KNL-NEXT: vpaddd %zmm1, %zmm0, %zmm0
3636 ; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
37 ; KNL-NEXT: vpsubd %zmm1, %zmm0, %zmm0
37 ; KNL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
3838 ; KNL-NEXT: vmovd %xmm0, %eax
3939 ; KNL-NEXT: retq
4040 ;
4343 ; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
4444 ; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0
4545 ; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
46 ; SKX-NEXT: vpsubd %zmm1, %zmm0, %zmm0
46 ; SKX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
4747 ; SKX-NEXT: vmovd %xmm0, %eax
4848 ; SKX-NEXT: vzeroupper
4949 ; SKX-NEXT: retq
6161 ; KNL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
6262 ; KNL-NEXT: vaddps %zmm1, %zmm0, %zmm0
6363 ; KNL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
64 ; KNL-NEXT: vaddps %zmm1, %zmm0, %zmm0
65 ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
64 ; KNL-NEXT: vaddps %xmm1, %xmm0, %xmm0
6665 ; KNL-NEXT: retq
6766 ;
6867 ; SKX-LABEL: fhadd_16:
7069 ; SKX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
7170 ; SKX-NEXT: vaddps %zmm1, %zmm0, %zmm0
7271 ; SKX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
73 ; SKX-NEXT: vaddps %zmm1, %zmm0, %zmm0
74 ; SKX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
72 ; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0
7573 ; SKX-NEXT: vzeroupper
7674 ; SKX-NEXT: retq
7775 %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32>
8886 ; KNL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
8987 ; KNL-NEXT: vaddps %zmm1, %zmm0, %zmm0
9088 ; KNL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
91 ; KNL-NEXT: vsubps %zmm1, %zmm0, %zmm0
92 ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
89 ; KNL-NEXT: vsubps %xmm1, %xmm0, %xmm0
9390 ; KNL-NEXT: retq
9491 ;
9592 ; SKX-LABEL: fhsub_16:
9794 ; SKX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
9895 ; SKX-NEXT: vaddps %zmm1, %zmm0, %zmm0
9996 ; SKX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
100 ; SKX-NEXT: vsubps %zmm1, %zmm0, %zmm0
101 ; SKX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
97 ; SKX-NEXT: vsubps %xmm1, %xmm0, %xmm0
10298 ; SKX-NEXT: vzeroupper
10399 ; SKX-NEXT: retq
104100 %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32>
179175 ; KNL: # %bb.0:
180176 ; KNL-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
181177 ; KNL-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
182 ; KNL-NEXT: vaddpd %zmm0, %zmm2, %zmm0
183 ; KNL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
178 ; KNL-NEXT: vaddpd %ymm0, %ymm2, %ymm0
184179 ; KNL-NEXT: retq
185180 ;
186181 ; SKX-LABEL: fadd_noundef_low:
187182 ; SKX: # %bb.0:
188183 ; SKX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
189184 ; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
190 ; SKX-NEXT: vaddpd %zmm0, %zmm2, %zmm0
191 ; SKX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
185 ; SKX-NEXT: vaddpd %ymm0, %ymm2, %ymm0
192186 ; SKX-NEXT: retq
193187 %x226 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32>
194188 %x228 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32>
202196 ; KNL: # %bb.0:
203197 ; KNL-NEXT: vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
204198 ; KNL-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
205 ; KNL-NEXT: vaddpd %zmm0, %zmm2, %zmm0
206199 ; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
200 ; KNL-NEXT: vextractf64x4 $1, %zmm2, %ymm1
201 ; KNL-NEXT: vaddpd %ymm0, %ymm1, %ymm0
207202 ; KNL-NEXT: retq
208203 ;
209204 ; SKX-LABEL: fadd_noundef_high:
210205 ; SKX: # %bb.0:
211206 ; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
212207 ; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
213 ; SKX-NEXT: vaddpd %zmm0, %zmm2, %zmm0
214208 ; SKX-NEXT: vextractf64x4 $1, %zmm0, %ymm0
209 ; SKX-NEXT: vextractf64x4 $1, %zmm2, %ymm1
210 ; SKX-NEXT: vaddpd %ymm0, %ymm1, %ymm0
215211 ; SKX-NEXT: retq
216212 %x226 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32>
217213 %x228 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32>
226222 ; KNL: # %bb.0:
227223 ; KNL-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
228224 ; KNL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
229 ; KNL-NEXT: vpaddd %zmm0, %zmm2, %zmm0
230 ; KNL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
225 ; KNL-NEXT: vpaddd %ymm0, %ymm2, %ymm0
231226 ; KNL-NEXT: retq
232227 ;
233228 ; SKX-LABEL: hadd_16_3_sv:
234229 ; SKX: # %bb.0:
235230 ; SKX-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
236231 ; SKX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
237 ; SKX-NEXT: vpaddd %zmm0, %zmm2, %zmm0
238 ; SKX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
232 ; SKX-NEXT: vpaddd %ymm0, %ymm2, %ymm0
239233 ; SKX-NEXT: retq
240234 %x226 = shufflevector <16 x i32> %x225, <16 x i32> %x227, <16 x i32>
241235 , i32 4, i32 6, i32 20, i32 22, i32 8, i32 10, i32 24, i32 26, i32 12, i32 14, i32 28, i32 30>
252246 ; KNL-LABEL: fadd_noundef_eel:
253247 ; KNL: # %bb.0:
254248 ; KNL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
255 ; KNL-NEXT: vaddpd %zmm1, %zmm0, %zmm0
256 ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
249 ; KNL-NEXT: vaddpd %xmm1, %xmm0, %xmm0
257250 ; KNL-NEXT: retq
258251 ;
259252 ; SKX-LABEL: fadd_noundef_eel:
260253 ; SKX: # %bb.0:
261254 ; SKX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
262 ; SKX-NEXT: vaddpd %zmm1, %zmm0, %zmm0
263 ; SKX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
255 ; SKX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
264256 ; SKX-NEXT: vzeroupper
265257 ; SKX-NEXT: retq
266258 %x226 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32>
276268 ; KNL-LABEL: fsub_noundef_ee:
277269 ; KNL: # %bb.0:
278270 ; KNL-NEXT: vextractf32x4 $2, %zmm1, %xmm0
279 ; KNL-NEXT: vbroadcastsd %xmm0, %zmm0
280 ; KNL-NEXT: vsubpd %zmm1, %zmm0, %zmm0
281 ; KNL-NEXT: vextractf32x4 $2, %zmm0, %xmm0
271 ; KNL-NEXT: vbroadcastsd %xmm0, %zmm1
272 ; KNL-NEXT: vextractf32x4 $2, %zmm1, %xmm1
273 ; KNL-NEXT: vsubpd %xmm0, %xmm1, %xmm0
282274 ; KNL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
283275 ; KNL-NEXT: retq
284276 ;
285277 ; SKX-LABEL: fsub_noundef_ee:
286278 ; SKX: # %bb.0:
287279 ; SKX-NEXT: vextractf32x4 $2, %zmm1, %xmm0
288 ; SKX-NEXT: vbroadcastsd %xmm0, %zmm0
289 ; SKX-NEXT: vsubpd %zmm1, %zmm0, %zmm0
290 ; SKX-NEXT: vextractf32x4 $2, %zmm0, %xmm0
280 ; SKX-NEXT: vbroadcastsd %xmm0, %zmm1
281 ; SKX-NEXT: vextractf32x4 $2, %zmm1, %xmm1
282 ; SKX-NEXT: vsubpd %xmm0, %xmm1, %xmm0
291283 ; SKX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
292284 ; SKX-NEXT: vzeroupper
293285 ; SKX-NEXT: retq
850850 ; SKX-NEXT: kxorb %k2, %k1, %k1
851851 ; SKX-NEXT: kshiftlb $7, %k1, %k1
852852 ; SKX-NEXT: kshiftrb $5, %k1, %k1
853 ; SKX-NEXT: kxorb %k1, %k0, %k0
853 ; SKX-NEXT: kxorw %k1, %k0, %k0
854854 ; SKX-NEXT: kmovd %k0, %eax
855855 ; SKX-NEXT: ## kill: def $al killed $al killed $eax
856856 ; SKX-NEXT: retq
889889 ; SKX-NEXT: kshiftrb $7, %k0, %k0
890890 ; SKX-NEXT: kmovd %eax, %k1
891891 ; SKX-NEXT: kshiftlb $1, %k1, %k1
892 ; SKX-NEXT: korb %k1, %k0, %k0
892 ; SKX-NEXT: korw %k1, %k0, %k0
893893 ; SKX-NEXT: kmovd %k0, %eax
894894 ; SKX-NEXT: ## kill: def $al killed $al killed $eax
895895 ; SKX-NEXT: retq
10181018 ; KNL: ## %bb.0:
10191019 ; KNL-NEXT: vpminub %ymm3, %ymm1, %ymm0
10201020 ; KNL-NEXT: vpcmpeqb %ymm0, %ymm1, %ymm0
1021 ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
10211022 ; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
1022 ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
10231023 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
10241024 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
10251025 ; KNL-NEXT: kshiftrw $15, %k0, %k0
10531053 ; KNL: ## %bb.0:
10541054 ; KNL-NEXT: vpminub %ymm3, %ymm1, %ymm0
10551055 ; KNL-NEXT: vpcmpeqb %ymm0, %ymm1, %ymm0
1056 ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
10561057 ; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
1057 ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
10581058 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
10591059 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
10601060 ; KNL-NEXT: kshiftrw $15, %k0, %k0
7272 ; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0
7373 ; CHECK-NEXT: vpmovq2m %xmm0, %k1
7474 ; CHECK-NEXT: kshiftlb $2, %k0, %k0
75 ; CHECK-NEXT: korb %k0, %k1, %k0
75 ; CHECK-NEXT: korw %k0, %k1, %k0
7676 ; CHECK-NEXT: vpmovm2d %k0, %xmm0
7777 ; CHECK-NEXT: retq
7878
8888 ; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0
8989 ; CHECK-NEXT: vpmovq2m %xmm0, %k1
9090 ; CHECK-NEXT: kshiftlb $2, %k0, %k0
91 ; CHECK-NEXT: korb %k0, %k1, %k0
91 ; CHECK-NEXT: korw %k0, %k1, %k0
9292 ; CHECK-NEXT: vpmovm2b %k0, %xmm0
9393 ; CHECK-NEXT: retq
9494
239239 define <2 x double> @signbits_ashr_concat_ashr_extract_sitofp(<2 x i64> %a0, <4 x i64> %a1) nounwind {
240240 ; X32-LABEL: signbits_ashr_concat_ashr_extract_sitofp:
241241 ; X32: # %bb.0:
242 ; X32-NEXT: vpsrad $16, %xmm0, %xmm1
243 ; X32-NEXT: vpsrlq $16, %xmm0, %xmm0
244 ; X32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
245 ; X32-NEXT: vpsrlq $16, %xmm0, %xmm0
246 ; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
242 ; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
247243 ; X32-NEXT: vcvtdq2pd %xmm0, %xmm0
248244 ; X32-NEXT: retl
249245 ;
250246 ; X64-LABEL: signbits_ashr_concat_ashr_extract_sitofp:
251247 ; X64: # %bb.0:
252 ; X64-NEXT: vpsrad $16, %xmm0, %xmm1
253 ; X64-NEXT: vpsrlq $16, %xmm0, %xmm0
254 ; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
255 ; X64-NEXT: vpsrlq $16, %xmm0, %xmm0
256 ; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
248 ; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
257249 ; X64-NEXT: vcvtdq2pd %xmm0, %xmm0
258250 ; X64-NEXT: retq
259251 %1 = ashr <2 x i64> %a0,
155155 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
156156 ; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0
157157 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
158 ; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0
158 ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
159159 ; AVX256-NEXT: vmovd %xmm0, %eax
160160 ; AVX256-NEXT: vzeroupper
161161 ; AVX256-NEXT: retq
282282 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
283283 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
284284 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
285 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
285 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
286286 ; AVX2-NEXT: vmovd %xmm0, %eax
287287 ; AVX2-NEXT: vzeroupper
288288 ; AVX2-NEXT: retq
309309 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
310310 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
311311 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
312 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
312 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
313313 ; AVX512-NEXT: vmovd %xmm0, %eax
314314 ; AVX512-NEXT: vzeroupper
315315 ; AVX512-NEXT: retq
475475 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
476476 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
477477 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
478 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
478 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
479479 ; AVX2-NEXT: vmovd %xmm0, %eax
480480 ; AVX2-NEXT: vzeroupper
481481 ; AVX2-NEXT: retq
507507 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
508508 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
509509 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
510 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
510 ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
511511 ; AVX512F-NEXT: vmovd %xmm0, %eax
512512 ; AVX512F-NEXT: vzeroupper
513513 ; AVX512F-NEXT: retq
536536 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
537537 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
538538 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
539 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
539 ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
540540 ; AVX512BW-NEXT: vmovd %xmm0, %eax
541541 ; AVX512BW-NEXT: vzeroupper
542542 ; AVX512BW-NEXT: retq
738738 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
739739 ; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0
740740 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
741 ; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0
741 ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
742742 ; AVX256-NEXT: vmovd %xmm0, %eax
743743 ; AVX256-NEXT: vzeroupper
744744 ; AVX256-NEXT: retq
874874 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
875875 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
876876 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
877 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
877 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
878878 ; AVX2-NEXT: vmovd %xmm0, %eax
879879 ; AVX2-NEXT: vzeroupper
880880 ; AVX2-NEXT: retq
902902 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
903903 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
904904 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
905 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
905 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
906906 ; AVX512-NEXT: vmovd %xmm0, %eax
907907 ; AVX512-NEXT: vzeroupper
908908 ; AVX512-NEXT: retq
10861086 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
10871087 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
10881088 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1089 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
1089 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
10901090 ; AVX2-NEXT: vmovd %xmm0, %eax
10911091 ; AVX2-NEXT: vzeroupper
10921092 ; AVX2-NEXT: retq
11201120 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
11211121 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
11221122 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1123 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
1123 ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
11241124 ; AVX512F-NEXT: vmovd %xmm0, %eax
11251125 ; AVX512F-NEXT: vzeroupper
11261126 ; AVX512F-NEXT: retq
11501150 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
11511151 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
11521152 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1153 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
1153 ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
11541154 ; AVX512BW-NEXT: vmovd %xmm0, %eax
11551155 ; AVX512BW-NEXT: vzeroupper
11561156 ; AVX512BW-NEXT: retq
13541354 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
13551355 ; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0
13561356 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1357 ; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0
1357 ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
13581358 ; AVX256-NEXT: vmovd %xmm0, %eax
13591359 ; AVX256-NEXT: vzeroupper
13601360 ; AVX256-NEXT: retq
15091509 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
15101510 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
15111511 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1512 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
1512 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
15131513 ; AVX2-NEXT: vmovd %xmm0, %eax
15141514 ; AVX2-NEXT: vzeroupper
15151515 ; AVX2-NEXT: retq
15371537 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
15381538 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
15391539 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1540 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
1540 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
15411541 ; AVX512-NEXT: vmovd %xmm0, %eax
15421542 ; AVX512-NEXT: vzeroupper
15431543 ; AVX512-NEXT: retq
17621762 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
17631763 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
17641764 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1765 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
1765 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
17661766 ; AVX2-NEXT: vmovd %xmm0, %eax
17671767 ; AVX2-NEXT: vzeroupper
17681768 ; AVX2-NEXT: retq
17961796 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
17971797 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
17981798 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1799 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
1799 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
18001800 ; AVX512-NEXT: vmovd %xmm0, %eax
18011801 ; AVX512-NEXT: vzeroupper
18021802 ; AVX512-NEXT: retq
27292729 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
27302730 ; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0
27312731 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
2732 ; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0
2732 ; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
27332733 ; AVX256-NEXT: vmovd %xmm0, %eax
27342734 ; AVX256-NEXT: vzeroupper
27352735 ; AVX256-NEXT: retq
190190 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
191191 ; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
192192 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
193 ; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
193 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
194194 ; CHECK-NEXT: vmovd %xmm0, %eax
195195 ; CHECK-NEXT: vzeroupper
196196 ; CHECK-NEXT: retq
256256 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
257257 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
258258 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
259 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
259 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
260260 ; CHECK-NEXT: vmovd %xmm0, %eax
261261 ; CHECK-NEXT: vzeroupper
262262 ; CHECK-NEXT: retq
320320 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
321321 ; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
322322 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
323 ; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
323 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
324324 ; CHECK-NEXT: vmovd %xmm0, %eax
325325 ; CHECK-NEXT: vzeroupper
326326 ; CHECK-NEXT: retq
382382 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
383383 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
384384 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
385 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
385 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
386386 ; CHECK-NEXT: vmovd %xmm0, %eax
387387 ; CHECK-NEXT: vzeroupper
388388 ; CHECK-NEXT: retq
8181 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
8282 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
8383 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
84 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
84 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
8585 ; AVX2-NEXT: vmovd %xmm0, %eax
8686 ; AVX2-NEXT: vzeroupper
8787 ; AVX2-NEXT: retq
106106 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
107107 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
108108 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
109 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
109 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
110110 ; AVX512-NEXT: vmovd %xmm0, %eax
111111 ; AVX512-NEXT: vzeroupper
112112 ; AVX512-NEXT: retq
346346 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
347347 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
348348 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
349 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
349 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
350350 ; AVX2-NEXT: vmovd %xmm0, %eax
351351 ; AVX2-NEXT: vzeroupper
352352 ; AVX2-NEXT: retq
373373 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
374374 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
375375 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
376 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
376 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
377377 ; AVX512-NEXT: vmovd %xmm0, %eax
378378 ; AVX512-NEXT: vzeroupper
379379 ; AVX512-NEXT: retq
940940 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
941941 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
942942 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
943 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
943 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
944944 ; AVX2-NEXT: vmovd %xmm0, %eax
945945 ; AVX2-NEXT: vzeroupper
946946 ; AVX2-NEXT: retq
988988 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
989989 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
990990 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
991 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
991 ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
992992 ; AVX512F-NEXT: vmovd %xmm0, %eax
993993 ; AVX512F-NEXT: vzeroupper
994994 ; AVX512F-NEXT: retq
10171017 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
10181018 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
10191019 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1020 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
1020 ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
10211021 ; AVX512BW-NEXT: vmovd %xmm0, %eax
10221022 ; AVX512BW-NEXT: vzeroupper
10231023 ; AVX512BW-NEXT: retq
14551455 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
14561456 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
14571457 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1458 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
1458 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
14591459 ; AVX2-NEXT: vmovd %xmm0, %eax
14601460 ; AVX2-NEXT: vzeroupper
14611461 ; AVX2-NEXT: retq
14771477 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
14781478 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
14791479 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1480 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
1480 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
14811481 ; AVX512-NEXT: vmovd %xmm0, %eax
14821482 ; AVX512-NEXT: vzeroupper
14831483 ; AVX512-NEXT: retq
15571557 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
15581558 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
15591559 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1560 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
1560 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
15611561 ; AVX2-NEXT: vmovd %xmm0, %eax
15621562 ; AVX2-NEXT: vzeroupper
15631563 ; AVX2-NEXT: retq
15761576 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
15771577 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
15781578 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1579 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
1579 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
15801580 ; AVX512-NEXT: vmovd %xmm0, %eax
15811581 ; AVX512-NEXT: vzeroupper
15821582 ; AVX512-NEXT: retq
24742474 ; X86-AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1
24752475 ; X86-AVX2-NEXT: movl $8199, %eax # imm = 0x2007
24762476 ; X86-AVX2-NEXT: vmovd %eax, %xmm2
2477 ; X86-AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0
2477 ; X86-AVX2-NEXT: vpmulld %xmm2, %xmm0, %xmm0
24782478 ; X86-AVX2-NEXT: vmovd %xmm0, (%eax)
24792479 ; X86-AVX2-NEXT: vmovdqa %ymm1, (%eax)
24802480 ; X86-AVX2-NEXT: popl %esi
27222722 ; X64-AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1
27232723 ; X64-AVX2-NEXT: movl $8199, %eax # imm = 0x2007
27242724 ; X64-AVX2-NEXT: vmovd %eax, %xmm2
2725 ; X64-AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0
2725 ; X64-AVX2-NEXT: vpmulld %xmm2, %xmm0, %xmm0
27262726 ; X64-AVX2-NEXT: vmovd %xmm0, (%rax)
27272727 ; X64-AVX2-NEXT: vmovdqa %ymm1, (%rax)
27282728 ; X64-AVX2-NEXT: vzeroupper
662662 ; SSE41-NEXT: addpd %xmm1, %xmm0
663663 ; SSE41-NEXT: retq
664664 ;
665 ; AVX1-LABEL: uitofp_4i32_to_2f64:
666 ; AVX1: # %bb.0:
667 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
668 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
669 ; AVX1-NEXT: vcvtdq2pd %xmm1, %ymm1
670 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
671 ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
672 ; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0
673 ; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0
674 ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
675 ; AVX1-NEXT: vzeroupper
676 ; AVX1-NEXT: retq
677 ;
678 ; AVX2-LABEL: uitofp_4i32_to_2f64:
679 ; AVX2: # %bb.0:
680 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
681 ; AVX2-NEXT: vcvtdq2pd %xmm1, %ymm1
682 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4]
683 ; AVX2-NEXT: vmulpd %ymm2, %ymm1, %ymm1
684 ; AVX2-NEXT: vxorpd %xmm2, %xmm2, %xmm2
685 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
686 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
687 ; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0
688 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
689 ; AVX2-NEXT: vzeroupper
690 ; AVX2-NEXT: retq
665 ; VEX-LABEL: uitofp_4i32_to_2f64:
666 ; VEX: # %bb.0:
667 ; VEX-NEXT: vpxor %xmm1, %xmm1, %xmm1
668 ; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
669 ; VEX-NEXT: vpsrld $16, %xmm0, %xmm0
670 ; VEX-NEXT: vcvtdq2pd %xmm1, %xmm1
671 ; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0
672 ; VEX-NEXT: vmulpd {{.*}}(%rip), %xmm0, %xmm0
673 ; VEX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
674 ; VEX-NEXT: retq
691675 ;
692676 ; AVX512F-LABEL: uitofp_4i32_to_2f64:
693677 ; AVX512F: # %bb.0:
6363 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
6464 ; AVX512-NEXT: vandpd %ymm1, %ymm0, %ymm0
6565 ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
66 ; AVX512-NEXT: vandpd %ymm1, %ymm0, %ymm0
66 ; AVX512-NEXT: vandpd %xmm1, %xmm0, %xmm0
6767 ; AVX512-NEXT: vmovq %xmm0, %rax
6868 ; AVX512-NEXT: vzeroupper
6969 ; AVX512-NEXT: retq
199199 ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
200200 ; AVX512-NEXT: vandps %ymm1, %ymm0, %ymm0
201201 ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
202 ; AVX512-NEXT: vandps %ymm1, %ymm0, %ymm0
202 ; AVX512-NEXT: vandps %xmm1, %xmm0, %xmm0
203203 ; AVX512-NEXT: vmovd %xmm0, %eax
204204 ; AVX512-NEXT: vzeroupper
205205 ; AVX512-NEXT: retq
343343 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
344344 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
345345 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
346 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
346 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
347347 ; AVX512-NEXT: vmovq %xmm0, %rax
348348 ; AVX512-NEXT: vzeroupper
349349 ; AVX512-NEXT: retq
509509 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
510510 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
511511 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
512 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
512 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
513513 ; AVX512-NEXT: vmovd %xmm0, %eax
514514 ; AVX512-NEXT: vzeroupper
515515 ; AVX512-NEXT: retq
667667 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
668668 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
669669 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
670 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
670 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
671671 ; AVX1-NEXT: vmovd %xmm0, %eax
672672 ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
673673 ; AVX1-NEXT: vzeroupper
694694 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
695695 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
696696 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
697 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
697 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
698698 ; AVX512-NEXT: vmovd %xmm0, %eax
699699 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
700700 ; AVX512-NEXT: vzeroupper
869869 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
870870 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
871871 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
872 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
872 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
873873 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax
874874 ; AVX1-NEXT: # kill: def $al killed $al killed $eax
875875 ; AVX1-NEXT: vzeroupper
898898 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
899899 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
900900 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
901 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
901 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
902902 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
903903 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
904904 ; AVX512-NEXT: vzeroupper
6161 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
6262 ; AVX512-NEXT: vorpd %ymm1, %ymm0, %ymm0
6363 ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
64 ; AVX512-NEXT: vorpd %ymm1, %ymm0, %ymm0
64 ; AVX512-NEXT: vorpd %xmm1, %xmm0, %xmm0
6565 ; AVX512-NEXT: vmovq %xmm0, %rax
6666 ; AVX512-NEXT: vzeroupper
6767 ; AVX512-NEXT: retq
187187 ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
188188 ; AVX512-NEXT: vorps %ymm1, %ymm0, %ymm0
189189 ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
190 ; AVX512-NEXT: vorps %ymm1, %ymm0, %ymm0
190 ; AVX512-NEXT: vorps %xmm1, %xmm0, %xmm0
191191 ; AVX512-NEXT: vmovd %xmm0, %eax
192192 ; AVX512-NEXT: vzeroupper
193193 ; AVX512-NEXT: retq
323323 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
324324 ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
325325 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
326 ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
326 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
327327 ; AVX512-NEXT: vmovq %xmm0, %rax
328328 ; AVX512-NEXT: vzeroupper
329329 ; AVX512-NEXT: retq
476476 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
477477 ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
478478 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
479 ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
479 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
480480 ; AVX512-NEXT: vmovd %xmm0, %eax
481481 ; AVX512-NEXT: vzeroupper
482482 ; AVX512-NEXT: retq
622622 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
623623 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
624624 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
625 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
625 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
626626 ; AVX1-NEXT: vmovd %xmm0, %eax
627627 ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
628628 ; AVX1-NEXT: vzeroupper
648648 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
649649 ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
650650 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
651 ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
651 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
652652 ; AVX512-NEXT: vmovd %xmm0, %eax
653653 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
654654 ; AVX512-NEXT: vzeroupper
811811 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
812812 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
813813 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
814 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
814 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
815815 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax
816816 ; AVX1-NEXT: # kill: def $al killed $al killed $eax
817817 ; AVX1-NEXT: vzeroupper
839839 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
840840 ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
841841 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
842 ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
842 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
843843 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
844844 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
845845 ; AVX512-NEXT: vzeroupper
5858 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
5959 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
6060 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
61 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
61 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
6262 ; AVX2-NEXT: vmovq %xmm0, %rax
6363 ; AVX2-NEXT: vzeroupper
6464 ; AVX2-NEXT: retq
6868 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
6969 ; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0
7070 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
71 ; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0
71 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
7272 ; AVX512-NEXT: vmovq %xmm0, %rax
7373 ; AVX512-NEXT: vzeroupper
7474 ; AVX512-NEXT: retq
106106 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
107107 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
108108 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
109 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
109 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
110110 ; AVX2-NEXT: vmovq %xmm0, %rax
111111 ; AVX2-NEXT: vzeroupper
112112 ; AVX2-NEXT: retq
118118 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
119119 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
120120 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
121 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
121 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
122122 ; AVX512-NEXT: vmovq %xmm0, %rax
123123 ; AVX512-NEXT: vzeroupper
124124 ; AVX512-NEXT: retq
168168 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
169169 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
170170 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
171 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
171 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
172172 ; AVX2-NEXT: vmovq %xmm0, %rax
173173 ; AVX2-NEXT: vzeroupper
174174 ; AVX2-NEXT: retq
181181 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
182182 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
183183 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
184 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
184 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
185185 ; AVX512-NEXT: vmovq %xmm0, %rax
186186 ; AVX512-NEXT: vzeroupper
187187 ; AVX512-NEXT: retq
254254 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
255255 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
256256 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
257 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
257 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
258258 ; AVX2-NEXT: vmovd %xmm0, %eax
259259 ; AVX2-NEXT: vzeroupper
260260 ; AVX2-NEXT: retq
266266 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
267267 ; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
268268 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
269 ; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
269 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
270270 ; AVX512-NEXT: vmovd %xmm0, %eax
271271 ; AVX512-NEXT: vzeroupper
272272 ; AVX512-NEXT: retq
310310 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
311311 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
312312 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
313 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
313 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
314314 ; AVX2-NEXT: vmovd %xmm0, %eax
315315 ; AVX2-NEXT: vzeroupper
316316 ; AVX2-NEXT: retq
324324 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
325325 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
326326 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
327 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
327 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
328328 ; AVX512-NEXT: vmovd %xmm0, %eax
329329 ; AVX512-NEXT: vzeroupper
330330 ; AVX512-NEXT: retq
380380 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
381381 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
382382 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
383 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
383 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
384384 ; AVX2-NEXT: vmovd %xmm0, %eax
385385 ; AVX2-NEXT: vzeroupper
386386 ; AVX2-NEXT: retq
395395 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
396396 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
397397 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
398 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
398 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
399399 ; AVX512-NEXT: vmovd %xmm0, %eax
400400 ; AVX512-NEXT: vzeroupper
401401 ; AVX512-NEXT: retq
487487 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
488488 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
489489 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
490 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
490 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
491491 ; AVX2-NEXT: vmovd %xmm0, %eax
492492 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
493493 ; AVX2-NEXT: vzeroupper
502502 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
503503 ; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0
504504 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
505 ; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0
505 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
506506 ; AVX512-NEXT: vmovd %xmm0, %eax
507507 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
508508 ; AVX512-NEXT: vzeroupper
556556 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
557557 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
558558 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
559 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
559 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
560560 ; AVX2-NEXT: vmovd %xmm0, %eax
561561 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
562562 ; AVX2-NEXT: vzeroupper
573573 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
574574 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
575575 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
576 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
576 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
577577 ; AVX512-NEXT: vmovd %xmm0, %eax
578578 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
579579 ; AVX512-NEXT: vzeroupper
639639 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
640640 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
641641 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
642 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
642 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
643643 ; AVX2-NEXT: vmovd %xmm0, %eax
644644 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
645645 ; AVX2-NEXT: vzeroupper
657657 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
658658 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
659659 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
660 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
660 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
661661 ; AVX512-NEXT: vmovd %xmm0, %eax
662662 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
663663 ; AVX512-NEXT: vzeroupper
797797 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
798798 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
799799 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
800 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
800 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
801801 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax
802802 ; AVX2-NEXT: # kill: def $al killed $al killed $eax
803803 ; AVX2-NEXT: vzeroupper
814814 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
815815 ; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0
816816 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
817 ; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0
817 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
818818 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
819819 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
820820 ; AVX512-NEXT: vzeroupper
894894 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
895895 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
896896 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
897 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
897 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
898898 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax
899899 ; AVX2-NEXT: # kill: def $al killed $al killed $eax
900900 ; AVX2-NEXT: vzeroupper
913913 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
914914 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
915915 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
916 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
916 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
917917 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
918918 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
919919 ; AVX512-NEXT: vzeroupper
10091009 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
10101010 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
10111011 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
1012 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
1012 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
10131013 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax
10141014 ; AVX2-NEXT: # kill: def $al killed $al killed $eax
10151015 ; AVX2-NEXT: vzeroupper
10291029 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
10301030 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
10311031 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
1032 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
1032 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
10331033 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
10341034 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
10351035 ; AVX512-NEXT: vzeroupper
4848 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4949 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
5050 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
51 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
51 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
5252 ; AVX1-NEXT: vmovq %xmm0, %rax
5353 ; AVX1-NEXT: vzeroupper
5454 ; AVX1-NEXT: retq
5858 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
5959 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
6060 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
61 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
61 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
6262 ; AVX2-NEXT: vmovq %xmm0, %rax
6363 ; AVX2-NEXT: vzeroupper
6464 ; AVX2-NEXT: retq
6868 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
6969 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
7070 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
71 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
71 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
7272 ; AVX512-NEXT: vmovq %xmm0, %rax
7373 ; AVX512-NEXT: vzeroupper
7474 ; AVX512-NEXT: retq
9393 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
9494 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
9595 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
96 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
96 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
9797 ; AVX1-NEXT: vmovq %xmm0, %rax
9898 ; AVX1-NEXT: vzeroupper
9999 ; AVX1-NEXT: retq
104104 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
105105 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
106106 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
107 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
107 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
108108 ; AVX2-NEXT: vmovq %xmm0, %rax
109109 ; AVX2-NEXT: vzeroupper
110110 ; AVX2-NEXT: retq
116116 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
117117 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
118118 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
119 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
119 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
120120 ; AVX512-NEXT: vmovq %xmm0, %rax
121121 ; AVX512-NEXT: vzeroupper
122122 ; AVX512-NEXT: retq
147147 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
148148 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
149149 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
150 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
150 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
151151 ; AVX1-NEXT: vmovq %xmm0, %rax
152152 ; AVX1-NEXT: vzeroupper
153153 ; AVX1-NEXT: retq
160160 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
161161 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
162162 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
163 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
163 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
164164 ; AVX2-NEXT: vmovq %xmm0, %rax
165165 ; AVX2-NEXT: vzeroupper
166166 ; AVX2-NEXT: retq
173173 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
174174 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
175175 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
176 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
176 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
177177 ; AVX512-NEXT: vmovq %xmm0, %rax
178178 ; AVX512-NEXT: vzeroupper
179179 ; AVX512-NEXT: retq
234234 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
235235 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
236236 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
237 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
237 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
238238 ; AVX1-NEXT: vmovd %xmm0, %eax
239239 ; AVX1-NEXT: vzeroupper
240240 ; AVX1-NEXT: retq
246246 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
247247 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
248248 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
249 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
249 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
250250 ; AVX2-NEXT: vmovd %xmm0, %eax
251251 ; AVX2-NEXT: vzeroupper
252252 ; AVX2-NEXT: retq
258258 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
259259 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
260260 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
261 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
261 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
262262 ; AVX512-NEXT: vmovd %xmm0, %eax
263263 ; AVX512-NEXT: vzeroupper
264264 ; AVX512-NEXT: retq
287287 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
288288 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
289289 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
290 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
290 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
291291 ; AVX1-NEXT: vmovd %xmm0, %eax
292292 ; AVX1-NEXT: vzeroupper
293293 ; AVX1-NEXT: retq
300300 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
301301 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
302302 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
303 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
303 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
304304 ; AVX2-NEXT: vmovd %xmm0, %eax
305305 ; AVX2-NEXT: vzeroupper
306306 ; AVX2-NEXT: retq
314314 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
315315 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
316316 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
317 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
317 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
318318 ; AVX512-NEXT: vmovd %xmm0, %eax
319319 ; AVX512-NEXT: vzeroupper
320320 ; AVX512-NEXT: retq
349349 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
350350 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
351351 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
352 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
352 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
353353 ; AVX1-NEXT: vmovd %xmm0, %eax
354354 ; AVX1-NEXT: vzeroupper
355355 ; AVX1-NEXT: retq
364364 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
365365 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
366366 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
367 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
367 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
368368 ; AVX2-NEXT: vmovd %xmm0, %eax
369369 ; AVX2-NEXT: vzeroupper
370370 ; AVX2-NEXT: retq
379379 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
380380 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
381381 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
382 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
382 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
383383 ; AVX512-NEXT: vmovd %xmm0, %eax
384384 ; AVX512-NEXT: vzeroupper
385385 ; AVX512-NEXT: retq
456456 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
457457 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
458458 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
459 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
459 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
460460 ; AVX1-NEXT: vmovd %xmm0, %eax
461461 ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
462462 ; AVX1-NEXT: vzeroupper
471471 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
472472 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
473473 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
474 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
474 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
475475 ; AVX2-NEXT: vmovd %xmm0, %eax
476476 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
477477 ; AVX2-NEXT: vzeroupper
486486 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
487487 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
488488 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
489 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
489 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
490490 ; AVX512-NEXT: vmovd %xmm0, %eax
491491 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
492492 ; AVX512-NEXT: vzeroupper
522522 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
523523 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
524524 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
525 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
525 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
526526 ; AVX1-NEXT: vmovd %xmm0, %eax
527527 ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
528528 ; AVX1-NEXT: vzeroupper
538538 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
539539 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
540540 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
541 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
541 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
542542 ; AVX2-NEXT: vmovd %xmm0, %eax
543543 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
544544 ; AVX2-NEXT: vzeroupper
555555 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
556556 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
557557 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
558 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
558 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
559559 ; AVX512-NEXT: vmovd %xmm0, %eax
560560 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
561561 ; AVX512-NEXT: vzeroupper
597597 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
598598 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
599599 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
600 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
600 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
601601 ; AVX1-NEXT: vmovd %xmm0, %eax
602602 ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
603603 ; AVX1-NEXT: vzeroupper
615615 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
616616 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
617617 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
618 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
618 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
619619 ; AVX2-NEXT: vmovd %xmm0, %eax
620620 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
621621 ; AVX2-NEXT: vzeroupper
633633 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
634634 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
635635 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
636 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
636 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
637637 ; AVX512-NEXT: vmovd %xmm0, %eax
638638 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
639639 ; AVX512-NEXT: vzeroupper
756756 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
757757 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
758758 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
759 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
759 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
760760 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax
761761 ; AVX1-NEXT: # kill: def $al killed $al killed $eax
762762 ; AVX1-NEXT: vzeroupper
773773 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
774774 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
775775 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
776 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
776 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
777777 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax
778778 ; AVX2-NEXT: # kill: def $al killed $al killed $eax
779779 ; AVX2-NEXT: vzeroupper
790790 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
791791 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
792792 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
793 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
793 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
794794 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
795795 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
796796 ; AVX512-NEXT: vzeroupper
850850 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
851851 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
852852 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
853 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
853 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
854854 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax
855855 ; AVX1-NEXT: # kill: def $al killed $al killed $eax
856856 ; AVX1-NEXT: vzeroupper
868868 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
869869 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
870870 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
871 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
871 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
872872 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax
873873 ; AVX2-NEXT: # kill: def $al killed $al killed $eax
874874 ; AVX2-NEXT: vzeroupper
887887 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
888888 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
889889 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
890 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
890 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
891891 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
892892 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
893893 ; AVX512-NEXT: vzeroupper
957957 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
958958 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
959959 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
960 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
960 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
961961 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax
962962 ; AVX1-NEXT: # kill: def $al killed $al killed $eax
963963 ; AVX1-NEXT: vzeroupper
977977 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
978978 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
979979 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
980 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
980 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
981981 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax
982982 ; AVX2-NEXT: # kill: def $al killed $al killed $eax
983983 ; AVX2-NEXT: vzeroupper
997997 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
998998 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
999999 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
1000 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
1000 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
10011001 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
10021002 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
10031003 ; AVX512-NEXT: vzeroupper
106106 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
107107 ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
108108 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
109 ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
110 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
109 ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
111110 ; AVX-NEXT: vzeroupper
112111 ; AVX-NEXT: retq
113112 ;
118117 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
119118 ; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0
120119 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
121 ; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0
122 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
120 ; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
123121 ; AVX512-NEXT: vzeroupper
124122 ; AVX512-NEXT: retq
125123 %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1)
160158 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
161159 ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
162160 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
163 ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
164 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
161 ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
165162 ; AVX-NEXT: vzeroupper
166163 ; AVX-NEXT: retq
167164 ;
174171 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
175172 ; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
176173 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
177 ; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
178 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
174 ; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
179175 ; AVX512-NEXT: vzeroupper
180176 ; AVX512-NEXT: retq
181177 %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float %a0, <16 x float> %a1)
286282 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
287283 ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
288284 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
289 ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
290 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
285 ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
291286 ; AVX-NEXT: vzeroupper
292287 ; AVX-NEXT: retq
293288 ;
298293 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
299294 ; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0
300295 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
301 ; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0
302 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
296 ; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
303297 ; AVX512-NEXT: vzeroupper
304298 ; AVX512-NEXT: retq
305299 %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float 0.0, <8 x float> %a0)
341335 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
342336 ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
343337 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
344 ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
345 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
338 ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
346339 ; AVX-NEXT: vzeroupper
347340 ; AVX-NEXT: retq
348341 ;
355348 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
356349 ; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
357350 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
358 ; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
359 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
351 ; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
360352 ; AVX512-NEXT: vzeroupper
361353 ; AVX512-NEXT: retq
362354 %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float 0.0, <16 x float> %a0)
467459 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
468460 ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
469461 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
470 ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
471 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
462 ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
472463 ; AVX-NEXT: vzeroupper
473464 ; AVX-NEXT: retq
474465 ;
479470 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
480471 ; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0
481472 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
482 ; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0
483 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
473 ; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
484474 ; AVX512-NEXT: vzeroupper
485475 ; AVX512-NEXT: retq
486476 %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float undef, <8 x float> %a0)
522512 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
523513 ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
524514 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
525 ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
526 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
515 ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
527516 ; AVX-NEXT: vzeroupper
528517 ; AVX-NEXT: retq
529518 ;
536525 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
537526 ; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
538527 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
539 ; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
540 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
528 ; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
541529 ; AVX512-NEXT: vzeroupper
542530 ; AVX512-NEXT: retq
543531 %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float undef, <16 x float> %a0)
585573 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0
586574 ; AVX-NEXT: vaddpd %ymm0, %ymm1, %ymm0
587575 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
588 ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
589 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
576 ; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
590577 ; AVX-NEXT: vzeroupper
591578 ; AVX-NEXT: retq
592579 ;
595582 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm0
596583 ; AVX512-NEXT: vaddpd %ymm0, %ymm1, %ymm0
597584 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
598 ; AVX512-NEXT: vaddpd %ymm1, %ymm0, %ymm0
599 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
585 ; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
600586 ; AVX512-NEXT: vzeroupper
601587 ; AVX512-NEXT: retq
602588 %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1)
620606 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
621607 ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
622608 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
623 ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
624 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
609 ; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
625610 ; AVX-NEXT: vzeroupper
626611 ; AVX-NEXT: retq
627612 ;
632617 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
633618 ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
634619 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
635 ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
636 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
620 ; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
637621 ; AVX512-NEXT: vzeroupper
638622 ; AVX512-NEXT: retq
639623 %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double %a0, <8 x double> %a1)
663647 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
664648 ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
665649 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
666 ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
667 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
650 ; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
668651 ; AVX-NEXT: vzeroupper
669652 ; AVX-NEXT: retq
670653 ;
676659 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
677660 ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
678661 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
679 ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
680 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
662 ; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
681663 ; AVX512-NEXT: vzeroupper
682664 ; AVX512-NEXT: retq
683665 %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double %a0, <16 x double> %a1)
727709 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
728710 ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
729711 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
730 ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
731 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
712 ; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
732713 ; AVX-NEXT: vzeroupper
733714 ; AVX-NEXT: retq
734715 ;
737718 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
738719 ; AVX512-NEXT: vaddpd %ymm1, %ymm0, %ymm0
739720 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
740 ; AVX512-NEXT: vaddpd %ymm1, %ymm0, %ymm0
741 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
721 ; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
742722 ; AVX512-NEXT: vzeroupper
743723 ; AVX512-NEXT: retq
744724 %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double 0.0, <4 x double> %a0)
763743 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
764744 ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
765745 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
766 ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
767 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
746 ; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
768747 ; AVX-NEXT: vzeroupper
769748 ; AVX-NEXT: retq
770749 ;
775754 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
776755 ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
777756 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
778 ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
779 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
757 ; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
780758 ; AVX512-NEXT: vzeroupper
781759 ; AVX512-NEXT: retq
782760 %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double 0.0, <8 x double> %a0)
806784 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
807785 ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
808786 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
809 ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
810 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
787 ; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
811788 ; AVX-NEXT: vzeroupper
812789 ; AVX-NEXT: retq
813790 ;
819796 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
820797 ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
821798 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
822 ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
823 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
799 ; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
824800 ; AVX512-NEXT: vzeroupper
825801 ; AVX512-NEXT: retq
826802 %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double 0.0, <16 x double> %a0)
870846 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
871847 ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
872848 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
873 ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
874 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
849 ; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
875850 ; AVX-NEXT: vzeroupper
876851 ; AVX-NEXT: retq
877852 ;
880855 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
881856 ; AVX512-NEXT: vaddpd %ymm1, %ymm0, %ymm0
882857 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
883 ; AVX512-NEXT: vaddpd %ymm1, %ymm0, %ymm0
884 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
858 ; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
885859 ; AVX512-NEXT: vzeroupper
886860 ; AVX512-NEXT: retq
887861 %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double undef, <4 x double> %a0)
906880 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
907881 ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
908882 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
909 ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
910 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
883 ; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
911884 ; AVX-NEXT: vzeroupper
912885 ; AVX-NEXT: retq
913886 ;
918891 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
919892 ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
920893 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
921 ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
922 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
894 ; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
923895 ; AVX512-NEXT: vzeroupper
924896 ; AVX512-NEXT: retq
925897 %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double undef, <8 x double> %a0)
949921 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
950922 ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
951923 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
952 ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
953 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
924 ; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
954925 ; AVX-NEXT: vzeroupper
955926 ; AVX-NEXT: retq
956927 ;
962933 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
963934 ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
964935 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
965 ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
966 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
936 ; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
967937 ; AVX512-NEXT: vzeroupper
968938 ; AVX512-NEXT: retq
969939 %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double undef, <16 x double> %a0)
106106 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
107107 ; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
108108 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
109 ; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
110 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
109 ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
111110 ; AVX-NEXT: vzeroupper
112111 ; AVX-NEXT: retq
113112 ;
118117 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
119118 ; AVX512-NEXT: vmulps %ymm1, %ymm0, %ymm0
120119 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
121 ; AVX512-NEXT: vmulps %ymm1, %ymm0, %ymm0
122 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
120 ; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
123121 ; AVX512-NEXT: vzeroupper
124122 ; AVX512-NEXT: retq
125123 %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float %a0, <8 x float> %a1)
160158 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
161159 ; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
162160 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
163 ; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
164 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
161 ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
165162 ; AVX-NEXT: vzeroupper
166163 ; AVX-NEXT: retq
167164 ;
174171 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
175172 ; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
176173 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
177 ; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
178 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
174 ; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
179175 ; AVX512-NEXT: vzeroupper
180176 ; AVX512-NEXT: retq
181177 %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float %a0, <16 x float> %a1)
286282 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
287283 ; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
288284 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
289 ; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
290 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
285 ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
291286 ; AVX-NEXT: vzeroupper
292287 ; AVX-NEXT: retq
293288 ;
298293 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
299294 ; AVX512-NEXT: vmulps %ymm1, %ymm0, %ymm0
300295 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
301 ; AVX512-NEXT: vmulps %ymm1, %ymm0, %ymm0
302 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
296 ; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
303297 ; AVX512-NEXT: vzeroupper
304298 ; AVX512-NEXT: retq
305299 %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float 1.0, <8 x float> %a0)
341335 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
342336 ; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
343337 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
344 ; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
345 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
338 ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
346339 ; AVX-NEXT: vzeroupper
347340 ; AVX-NEXT: retq
348341 ;
355348 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
356349 ; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
357350 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
358 ; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
359 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
351 ; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
360352 ; AVX512-NEXT: vzeroupper
361353 ; AVX512-NEXT: retq
362354 %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float 1.0, <16 x float> %a0)
467459 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
468460 ; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
469461 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
470 ; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
471 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
462 ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
472463 ; AVX-NEXT: vzeroupper
473464 ; AVX-NEXT: retq
474465 ;
479470 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
480471 ; AVX512-NEXT: vmulps %ymm1, %ymm0, %ymm0
481472 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
482 ; AVX512-NEXT: vmulps %ymm1, %ymm0, %ymm0
483 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
473 ; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
484474 ; AVX512-NEXT: vzeroupper
485475 ; AVX512-NEXT: retq
486476 %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float undef, <8 x float> %a0)
522512 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
523513 ; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
524514 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
525 ; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
526 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
515 ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
527516 ; AVX-NEXT: vzeroupper
528517 ; AVX-NEXT: retq
529518 ;
536525 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
537526 ; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
538527 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
539 ; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
540 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
528 ; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
541529 ; AVX512-NEXT: vzeroupper
542530 ; AVX512-NEXT: retq
543531 %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float undef, <16 x float> %a0)
585573 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0
586574 ; AVX-NEXT: vmulpd %ymm0, %ymm1, %ymm0
587575 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
588 ; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
589 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
576 ; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
590577 ; AVX-NEXT: vzeroupper
591578 ; AVX-NEXT: retq
592579 ;
595582 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm0
596583 ; AVX512-NEXT: vmulpd %ymm0, %ymm1, %ymm0
597584 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
598 ; AVX512-NEXT: vmulpd %ymm1, %ymm0, %ymm0
599 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
585 ; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
600586 ; AVX512-NEXT: vzeroupper
601587 ; AVX512-NEXT: retq
602588 %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double %a0, <4 x double> %a1)
620606 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
621607 ; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
622608 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
623 ; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
624 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
609 ; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
625610 ; AVX-NEXT: vzeroupper
626611 ; AVX-NEXT: retq
627612 ;
632617 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
633618 ; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
634619 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
635 ; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
636 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
620 ; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
637621 ; AVX512-NEXT: vzeroupper
638622 ; AVX512-NEXT: retq
639623 %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double %a0, <8 x double> %a1)
663647 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
664648 ; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
665649 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
666 ; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
667 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
650 ; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
668651 ; AVX-NEXT: vzeroupper
669652 ; AVX-NEXT: retq
670653 ;
676659 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
677660 ; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
678661 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
679 ; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
680 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
662 ; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
681663 ; AVX512-NEXT: vzeroupper
682664 ; AVX512-NEXT: retq
683665 %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double %a0, <16 x double> %a1)
727709 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
728710 ; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
729711 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
730 ; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
731 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
712 ; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
732713 ; AVX-NEXT: vzeroupper
733714 ; AVX-NEXT: retq
734715 ;
737718 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
738719 ; AVX512-NEXT: vmulpd %ymm1, %ymm0, %ymm0
739720 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
740 ; AVX512-NEXT: vmulpd %ymm1, %ymm0, %ymm0
741 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
721 ; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
742722 ; AVX512-NEXT: vzeroupper
743723 ; AVX512-NEXT: retq
744724 %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double 1.0, <4 x double> %a0)
763743 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
764744 ; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
765745 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
766 ; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
767 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
746 ; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
768747 ; AVX-NEXT: vzeroupper
769748 ; AVX-NEXT: retq
770749 ;
775754 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
776755 ; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
777756 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
778 ; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
779 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
757 ; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
780758 ; AVX512-NEXT: vzeroupper
781759 ; AVX512-NEXT: retq
782760 %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double 1.0, <8 x double> %a0)
806784 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
807785 ; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
808786 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
809 ; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
810 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
787 ; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
811788 ; AVX-NEXT: vzeroupper
812789 ; AVX-NEXT: retq
813790 ;
819796 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
820797 ; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
821798 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
822 ; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
823 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
799 ; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
824800 ; AVX512-NEXT: vzeroupper
825801 ; AVX512-NEXT: retq
826802 %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double 1.0, <16 x double> %a0)
870846 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
871847 ; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
872848 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
873 ; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
874 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
849 ; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
875850 ; AVX-NEXT: vzeroupper
876851 ; AVX-NEXT: retq
877852 ;
880855 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
881856 ; AVX512-NEXT: vmulpd %ymm1, %ymm0, %ymm0
882857 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
883 ; AVX512-NEXT: vmulpd %ymm1, %ymm0, %ymm0
884 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
858 ; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
885859 ; AVX512-NEXT: vzeroupper
886860 ; AVX512-NEXT: retq
887861 %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double undef, <4 x double> %a0)
906880 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
907881 ; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
908882 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
909 ; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
910 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
883 ; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
911884 ; AVX-NEXT: vzeroupper
912885 ; AVX-NEXT: retq
913886 ;
918891 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
919892 ; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
920893 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
921 ; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
922 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
894 ; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
923895 ; AVX512-NEXT: vzeroupper
924896 ; AVX512-NEXT: retq
925897 %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double undef, <8 x double> %a0)
949921 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
950922 ; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
951923 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
952 ; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
953 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
924 ; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
954925 ; AVX-NEXT: vzeroupper
955926 ; AVX-NEXT: retq
956927 ;
962933 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
963934 ; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
964935 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
965 ; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
966 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
936 ; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
967937 ; AVX512-NEXT: vzeroupper
968938 ; AVX512-NEXT: retq
969939 %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double undef, <16 x double> %a0)
159159 ; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
160160 ; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
161161 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
162 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
162 ; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0
163163 ; AVX2-NEXT: vmovq %xmm0, %rax
164164 ; AVX2-NEXT: vzeroupper
165165 ; AVX2-NEXT: retq
183183 ; AVX512BW-NEXT: vpaddq %ymm3, %ymm2, %ymm2
184184 ; AVX512BW-NEXT: vpsllq $32, %ymm2, %ymm2
185185 ; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
186 ; AVX512BW-NEXT: vpaddq %ymm2, %ymm0, %ymm0
186 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
187187 ; AVX512BW-NEXT: vmovq %xmm0, %rax
188188 ; AVX512BW-NEXT: vzeroupper
189189 ; AVX512BW-NEXT: retq
207207 ; AVX512BWVL-NEXT: vpaddq %ymm2, %ymm3, %ymm2
208208 ; AVX512BWVL-NEXT: vpsllq $32, %ymm2, %ymm2
209209 ; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
210 ; AVX512BWVL-NEXT: vpaddq %ymm2, %ymm0, %ymm0
210 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
211211 ; AVX512BWVL-NEXT: vmovq %xmm0, %rax
212212 ; AVX512BWVL-NEXT: vzeroupper
213213 ; AVX512BWVL-NEXT: retq
228228 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
229229 ; AVX512DQVL-NEXT: vpmullq %ymm1, %ymm0, %ymm0
230230 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
231 ; AVX512DQVL-NEXT: vpmullq %ymm1, %ymm0, %ymm0
231 ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
232232 ; AVX512DQVL-NEXT: vmovq %xmm0, %rax
233233 ; AVX512DQVL-NEXT: vzeroupper
234234 ; AVX512DQVL-NEXT: retq
351351 ; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
352352 ; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
353353 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
354 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
354 ; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0
355355 ; AVX2-NEXT: vmovq %xmm0, %rax
356356 ; AVX2-NEXT: vzeroupper
357357 ; AVX2-NEXT: retq
384384 ; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
385385 ; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
386386 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
387 ; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
387 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
388388 ; AVX512BW-NEXT: vmovq %xmm0, %rax
389389 ; AVX512BW-NEXT: vzeroupper
390390 ; AVX512BW-NEXT: retq
417417 ; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2
418418 ; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2
419419 ; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
420 ; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0
420 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
421421 ; AVX512BWVL-NEXT: vmovq %xmm0, %rax
422422 ; AVX512BWVL-NEXT: vzeroupper
423423 ; AVX512BWVL-NEXT: retq
441441 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
442442 ; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0
443443 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
444 ; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0
444 ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
445445 ; AVX512DQVL-NEXT: vmovq %xmm0, %rax
446446 ; AVX512DQVL-NEXT: vzeroupper
447447 ; AVX512DQVL-NEXT: retq
654654 ; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
655655 ; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
656656 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
657 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
657 ; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0
658658 ; AVX2-NEXT: vmovq %xmm0, %rax
659659 ; AVX2-NEXT: vzeroupper
660660 ; AVX2-NEXT: retq
695695 ; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
696696 ; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
697697 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
698 ; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
698 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
699699 ; AVX512BW-NEXT: vmovq %xmm0, %rax
700700 ; AVX512BW-NEXT: vzeroupper
701701 ; AVX512BW-NEXT: retq
736736 ; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2
737737 ; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2
738738 ; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
739 ; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0
739 ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
740740 ; AVX512BWVL-NEXT: vmovq %xmm0, %rax
741741 ; AVX512BWVL-NEXT: vzeroupper
742742 ; AVX512BWVL-NEXT: retq
762762 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
763763 ; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0
764764 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
765 ; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0
765 ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
766766 ; AVX512DQVL-NEXT: vmovq %xmm0, %rax
767767 ; AVX512DQVL-NEXT: vzeroupper
768768 ; AVX512DQVL-NEXT: retq
871871 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
872872 ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
873873 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
874 ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
874 ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
875875 ; AVX2-NEXT: vmovd %xmm0, %eax
876876 ; AVX2-NEXT: vzeroupper
877877 ; AVX2-NEXT: retq
883883 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
884884 ; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0
885885 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
886 ; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0
886 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
887887 ; AVX512-NEXT: vmovd %xmm0, %eax
888888 ; AVX512-NEXT: vzeroupper
889889 ; AVX512-NEXT: retq
954954 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
955955 ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
956956 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
957 ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
957 ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
958958 ; AVX2-NEXT: vmovd %xmm0, %eax
959959 ; AVX2-NEXT: vzeroupper
960960 ; AVX2-NEXT: retq
968968 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
969969 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
970970 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
971 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
971 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
972972 ; AVX512-NEXT: vmovd %xmm0, %eax
973973 ; AVX512-NEXT: vzeroupper
974974 ; AVX512-NEXT: retq
10631063 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
10641064 ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
10651065 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1066 ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
1066 ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
10671067 ; AVX2-NEXT: vmovd %xmm0, %eax
10681068 ; AVX2-NEXT: vzeroupper
10691069 ; AVX2-NEXT: retq
10781078 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
10791079 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
10801080 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1081 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
1081 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
10821082 ; AVX512-NEXT: vmovd %xmm0, %eax
10831083 ; AVX512-NEXT: vzeroupper
10841084 ; AVX512-NEXT: retq
11701170 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
11711171 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
11721172 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
1173 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1173 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
11741174 ; AVX2-NEXT: vmovd %xmm0, %eax
11751175 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
11761176 ; AVX2-NEXT: vzeroupper
11851185 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
11861186 ; AVX512-NEXT: vpmullw %ymm1, %ymm0, %ymm0
11871187 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
1188 ; AVX512-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1188 ; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
11891189 ; AVX512-NEXT: vmovd %xmm0, %eax
11901190 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
11911191 ; AVX512-NEXT: vzeroupper
12391239 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
12401240 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
12411241 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
1242 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1242 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
12431243 ; AVX2-NEXT: vmovd %xmm0, %eax
12441244 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
12451245 ; AVX2-NEXT: vzeroupper
12561256 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
12571257 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
12581258 ; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1
1259 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
1259 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
12601260 ; AVX512BW-NEXT: vmovd %xmm0, %eax
12611261 ; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax
12621262 ; AVX512BW-NEXT: vzeroupper
12731273 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
12741274 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
12751275 ; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm1
1276 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
1276 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
12771277 ; AVX512BWVL-NEXT: vmovd %xmm0, %eax
12781278 ; AVX512BWVL-NEXT: # kill: def $ax killed $ax killed $eax
12791279 ; AVX512BWVL-NEXT: vzeroupper
12891289 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
12901290 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
12911291 ; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm1
1292 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1292 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
12931293 ; AVX512DQ-NEXT: vmovd %xmm0, %eax
12941294 ; AVX512DQ-NEXT: # kill: def $ax killed $ax killed $eax
12951295 ; AVX512DQ-NEXT: vzeroupper
13051305 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
13061306 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
13071307 ; AVX512DQVL-NEXT: vpsrld $16, %xmm0, %xmm1
1308 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1308 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
13091309 ; AVX512DQVL-NEXT: vmovd %xmm0, %eax
13101310 ; AVX512DQVL-NEXT: # kill: def $ax killed $ax killed $eax
13111311 ; AVX512DQVL-NEXT: vzeroupper
13711371 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
13721372 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
13731373 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
1374 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1374 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
13751375 ; AVX2-NEXT: vmovd %xmm0, %eax
13761376 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
13771377 ; AVX2-NEXT: vzeroupper
13891389 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
13901390 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
13911391 ; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1
1392 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
1392 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
13931393 ; AVX512BW-NEXT: vmovd %xmm0, %eax
13941394 ; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax
13951395 ; AVX512BW-NEXT: vzeroupper
14071407 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
14081408 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
14091409 ; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm1
1410 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
1410 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
14111411 ; AVX512BWVL-NEXT: vmovd %xmm0, %eax
14121412 ; AVX512BWVL-NEXT: # kill: def $ax killed $ax killed $eax
14131413 ; AVX512BWVL-NEXT: vzeroupper
14251425 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
14261426 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
14271427 ; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm1
1428 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1428 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
14291429 ; AVX512DQ-NEXT: vmovd %xmm0, %eax
14301430 ; AVX512DQ-NEXT: # kill: def $ax killed $ax killed $eax
14311431 ; AVX512DQ-NEXT: vzeroupper
14431443 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
14441444 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
14451445 ; AVX512DQVL-NEXT: vpsrld $16, %xmm0, %xmm1
1446 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
1446 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
14471447 ; AVX512DQVL-NEXT: vmovd %xmm0, %eax
14481448 ; AVX512DQVL-NEXT: # kill: def $ax killed $ax killed $eax
14491449 ; AVX512DQVL-NEXT: vzeroupper
4848 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4949 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
5050 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
51 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
51 ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
5252 ; AVX1-NEXT: vmovq %xmm0, %rax
5353 ; AVX1-NEXT: vzeroupper
5454 ; AVX1-NEXT: retq
5858 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
5959 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
6060 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
61 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
61 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
6262 ; AVX2-NEXT: vmovq %xmm0, %rax
6363 ; AVX2-NEXT: vzeroupper
6464 ; AVX2-NEXT: retq
6868 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
6969 ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
7070 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
71 ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
71 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
7272 ; AVX512-NEXT: vmovq %xmm0, %rax
7373 ; AVX512-NEXT: vzeroupper
7474 ; AVX512-NEXT: retq
9393 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
9494 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
9595 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
96 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
96 ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
9797 ; AVX1-NEXT: vmovq %xmm0, %rax
9898 ; AVX1-NEXT: vzeroupper
9999 ; AVX1-NEXT: retq
104104 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
105105 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
106106 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
107 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
107 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
108108 ; AVX2-NEXT: vmovq %xmm0, %rax
109109 ; AVX2-NEXT: vzeroupper
110110 ; AVX2-NEXT: retq
116116 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
117117 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
118118 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
119 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
119 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
120120 ; AVX512-NEXT: vmovq %xmm0, %rax
121121 ; AVX512-NEXT: vzeroupper
122122 ; AVX512-NEXT: retq
147147 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
148148 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
149149 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
150 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
150 ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
151151 ; AVX1-NEXT: vmovq %xmm0, %rax
152152 ; AVX1-NEXT: vzeroupper
153153 ; AVX1-NEXT: retq
160160 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
161161 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
162162 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
163 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
163 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
164164 ; AVX2-NEXT: vmovq %xmm0, %rax
165165 ; AVX2-NEXT: vzeroupper
166166 ; AVX2-NEXT: retq
173173 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
174174 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
175175 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
176 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
176 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
177177 ; AVX512-NEXT: vmovq %xmm0, %rax
178178 ; AVX512-NEXT: vzeroupper
179179 ; AVX512-NEXT: retq
234234 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
235235 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
236236 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
237 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
237 ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
238238 ; AVX1-NEXT: vmovd %xmm0, %eax
239239 ; AVX1-NEXT: vzeroupper
240240 ; AVX1-NEXT: retq
246246 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
247247 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
248248 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
249 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
249 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
250250 ; AVX2-NEXT: vmovd %xmm0, %eax
251251 ; AVX2-NEXT: vzeroupper
252252 ; AVX2-NEXT: retq
258258 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
259259 ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
260260 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
261 ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
261 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
262262 ; AVX512-NEXT: vmovd %xmm0, %eax
263263 ; AVX512-NEXT: vzeroupper
264264 ; AVX512-NEXT: retq
287287 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
288288 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
289289 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
290 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
290 ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
291291 ; AVX1-NEXT: vmovd %xmm0, %eax
292292 ; AVX1-NEXT: vzeroupper
293293 ; AVX1-NEXT: retq
300300 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
301301 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
302302 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
303 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
303 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
304304 ; AVX2-NEXT: vmovd %xmm0, %eax
305305 ; AVX2-NEXT: vzeroupper
306306 ; AVX2-NEXT: retq
314314 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
315315 ; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
316316 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
317 ; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
317 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
318318 ; AVX512-NEXT: vmovd %xmm0, %eax
319319 ; AVX512-NEXT: vzeroupper
320320 ; AVX512-NEXT: retq
349349 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
350350 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
351351 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
352 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
352 ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
353353 ; AVX1-NEXT: vmovd %xmm0, %eax
354354 ; AVX1-NEXT: vzeroupper
355355 ; AVX1-NEXT: retq
364364 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
365365 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
366366 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
367 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
367 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
368368 ; AVX2-NEXT: vmovd %xmm0, %eax
369369 ; AVX2-NEXT: vzeroupper
370370 ; AVX2-NEXT: retq
379379 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
380380 ; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
381381 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
382 ; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
382 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
383383 ; AVX512-NEXT: vmovd %xmm0, %eax
384384 ; AVX512-NEXT: vzeroupper
385385 ; AVX512-NEXT: retq
456456 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
457457 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
458458 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
459 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
459 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
460460 ; AVX1-NEXT: vmovd %xmm0, %eax
461461 ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
462462 ; AVX1-NEXT: vzeroupper
471471 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
472472 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
473473 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
474 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
474 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
475475 ; AVX2-NEXT: vmovd %xmm0, %eax
476476 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
477477 ; AVX2-NEXT: vzeroupper
486486 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
487487 ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
488488 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
489 ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
489 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
490490 ; AVX512-NEXT: vmovd %xmm0, %eax
491491 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
492492 ; AVX512-NEXT: vzeroupper
522522 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
523523 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
524524 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
525 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
525 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
526526 ; AVX1-NEXT: vmovd %xmm0, %eax
527527 ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
528528 ; AVX1-NEXT: vzeroupper
538538 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
539539 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
540540 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
541 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
541 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
542542 ; AVX2-NEXT: vmovd %xmm0, %eax
543543 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
544544 ; AVX2-NEXT: vzeroupper
555555 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
556556 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
557557 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
558 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
558 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
559559 ; AVX512-NEXT: vmovd %xmm0, %eax
560560 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
561561 ; AVX512-NEXT: vzeroupper
597597 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
598598 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
599599 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
600 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
600 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
601601 ; AVX1-NEXT: vmovd %xmm0, %eax
602602 ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
603603 ; AVX1-NEXT: vzeroupper
615615 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
616616 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
617617 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
618 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
618 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
619619 ; AVX2-NEXT: vmovd %xmm0, %eax
620620 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
621621 ; AVX2-NEXT: vzeroupper
633633 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
634634 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
635635 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
636 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
636 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
637637 ; AVX512-NEXT: vmovd %xmm0, %eax
638638 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
639639 ; AVX512-NEXT: vzeroupper
756756 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
757757 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
758758 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
759 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
759 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
760760 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax
761761 ; AVX1-NEXT: # kill: def $al killed $al killed $eax
762762 ; AVX1-NEXT: vzeroupper
773773 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
774774 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
775775 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
776 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
776 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
777777 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax
778778 ; AVX2-NEXT: # kill: def $al killed $al killed $eax
779779 ; AVX2-NEXT: vzeroupper
790790 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
791791 ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
792792 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
793 ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
793 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
794794 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
795795 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
796796 ; AVX512-NEXT: vzeroupper
850850 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
851851 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
852852 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
853 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
853 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
854854 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax
855855 ; AVX1-NEXT: # kill: def $al killed $al killed $eax
856856 ; AVX1-NEXT: vzeroupper
868868 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
869869 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
870870 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
871 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
871 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
872872 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax
873873 ; AVX2-NEXT: # kill: def $al killed $al killed $eax
874874 ; AVX2-NEXT: vzeroupper
887887 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
888888 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
889889 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
890 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
890 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
891891 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
892892 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
893893 ; AVX512-NEXT: vzeroupper
957957 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
958958 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
959959 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
960 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
960 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
961961 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax
962962 ; AVX1-NEXT: # kill: def $al killed $al killed $eax
963963 ; AVX1-NEXT: vzeroupper
977977 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
978978 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
979979 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
980 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
980 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
981981 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax
982982 ; AVX2-NEXT: # kill: def $al killed $al killed $eax
983983 ; AVX2-NEXT: vzeroupper
997997 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
998998 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
999999 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
1000 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
1000 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
10011001 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
10021002 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
10031003 ; AVX512-NEXT: vzeroupper
4848 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4949 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
5050 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
51 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
51 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
5252 ; AVX1-NEXT: vmovq %xmm0, %rax
5353 ; AVX1-NEXT: vzeroupper
5454 ; AVX1-NEXT: retq
5858 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
5959 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
6060 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
61 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
61 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
6262 ; AVX2-NEXT: vmovq %xmm0, %rax
6363 ; AVX2-NEXT: vzeroupper
6464 ; AVX2-NEXT: retq
6868 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
6969 ; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
7070 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
71 ; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
71 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
7272 ; AVX512-NEXT: vmovq %xmm0, %rax
7373 ; AVX512-NEXT: vzeroupper
7474 ; AVX512-NEXT: retq
9393 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
9494 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
9595 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
96 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
96 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
9797 ; AVX1-NEXT: vmovq %xmm0, %rax
9898 ; AVX1-NEXT: vzeroupper
9999 ; AVX1-NEXT: retq
104104 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
105105 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
106106 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
107 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
107 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
108108 ; AVX2-NEXT: vmovq %xmm0, %rax
109109 ; AVX2-NEXT: vzeroupper
110110 ; AVX2-NEXT: retq
116116 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
117117 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
118118 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
119 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
119 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
120120 ; AVX512-NEXT: vmovq %xmm0, %rax
121121 ; AVX512-NEXT: vzeroupper
122122 ; AVX512-NEXT: retq
147147 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
148148 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
149149 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
150 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
150 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
151151 ; AVX1-NEXT: vmovq %xmm0, %rax
152152 ; AVX1-NEXT: vzeroupper
153153 ; AVX1-NEXT: retq
160160 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
161161 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
162162 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
163 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
163 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
164164 ; AVX2-NEXT: vmovq %xmm0, %rax
165165 ; AVX2-NEXT: vzeroupper
166166 ; AVX2-NEXT: retq
173173 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
174174 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
175175 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
176 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
176 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
177177 ; AVX512-NEXT: vmovq %xmm0, %rax
178178 ; AVX512-NEXT: vzeroupper
179179 ; AVX512-NEXT: retq
234234 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
235235 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
236236 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
237 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
237 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
238238 ; AVX1-NEXT: vmovd %xmm0, %eax
239239 ; AVX1-NEXT: vzeroupper
240240 ; AVX1-NEXT: retq
246246 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
247247 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
248248 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
249 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
249 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
250250 ; AVX2-NEXT: vmovd %xmm0, %eax
251251 ; AVX2-NEXT: vzeroupper
252252 ; AVX2-NEXT: retq
258258 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
259259 ; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
260260 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
261 ; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
261 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
262262 ; AVX512-NEXT: vmovd %xmm0, %eax
263263 ; AVX512-NEXT: vzeroupper
264264 ; AVX512-NEXT: retq
287287 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
288288 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
289289 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
290 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
290 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
291291 ; AVX1-NEXT: vmovd %xmm0, %eax
292292 ; AVX1-NEXT: vzeroupper
293293 ; AVX1-NEXT: retq
300300 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
301301 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
302302 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
303 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
303 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
304304 ; AVX2-NEXT: vmovd %xmm0, %eax
305305 ; AVX2-NEXT: vzeroupper
306306 ; AVX2-NEXT: retq
314314 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
315315 ; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0
316316 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
317 ; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0
317 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
318318 ; AVX512-NEXT: vmovd %xmm0, %eax
319319 ; AVX512-NEXT: vzeroupper
320320 ; AVX512-NEXT: retq
349349 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
350350 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
351351 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
352 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
352 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
353353 ; AVX1-NEXT: vmovd %xmm0, %eax
354354 ; AVX1-NEXT: vzeroupper
355355 ; AVX1-NEXT: retq
364364 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
365365 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
366366 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
367 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
367 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
368368 ; AVX2-NEXT: vmovd %xmm0, %eax
369369 ; AVX2-NEXT: vzeroupper
370370 ; AVX2-NEXT: retq
379379 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
380380 ; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0
381381 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
382 ; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0
382 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
383383 ; AVX512-NEXT: vmovd %xmm0, %eax
384384 ; AVX512-NEXT: vzeroupper
385385 ; AVX512-NEXT: retq
456456 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
457457 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
458458 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
459 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
459 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
460460 ; AVX1-NEXT: vmovd %xmm0, %eax
461461 ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
462462 ; AVX1-NEXT: vzeroupper
471471 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
472472 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
473473 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
474 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
474 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
475475 ; AVX2-NEXT: vmovd %xmm0, %eax
476476 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
477477 ; AVX2-NEXT: vzeroupper
486486 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
487487 ; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
488488 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
489 ; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
489 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
490490 ; AVX512-NEXT: vmovd %xmm0, %eax
491491 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
492492 ; AVX512-NEXT: vzeroupper
522522 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
523523 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
524524 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
525 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
525 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
526526 ; AVX1-NEXT: vmovd %xmm0, %eax
527527 ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
528528 ; AVX1-NEXT: vzeroupper
538538 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
539539 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
540540 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
541 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
541 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
542542 ; AVX2-NEXT: vmovd %xmm0, %eax
543543 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
544544 ; AVX2-NEXT: vzeroupper
555555 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
556556 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
557557 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
558 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
558 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
559559 ; AVX512-NEXT: vmovd %xmm0, %eax
560560 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
561561 ; AVX512-NEXT: vzeroupper
597597 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
598598 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
599599 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
600 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
600 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
601601 ; AVX1-NEXT: vmovd %xmm0, %eax
602602 ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
603603 ; AVX1-NEXT: vzeroupper
615615 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
616616 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
617617 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
618 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
618 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
619619 ; AVX2-NEXT: vmovd %xmm0, %eax
620620 ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
621621 ; AVX2-NEXT: vzeroupper
633633 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
634634 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
635635 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
636 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
636 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
637637 ; AVX512-NEXT: vmovd %xmm0, %eax
638638 ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
639639 ; AVX512-NEXT: vzeroupper
756756 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
757757 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
758758 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
759 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
759 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
760760 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax
761761 ; AVX1-NEXT: # kill: def $al killed $al killed $eax
762762 ; AVX1-NEXT: vzeroupper
773773 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
774774 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
775775 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
776 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
776 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
777777 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax
778778 ; AVX2-NEXT: # kill: def $al killed $al killed $eax
779779 ; AVX2-NEXT: vzeroupper
790790 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
791791 ; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
792792 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
793 ; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
793 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
794794 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
795795 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
796796 ; AVX512-NEXT: vzeroupper
850850 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
851851 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
852852 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
853 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
853 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
854854 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax
855855 ; AVX1-NEXT: # kill: def $al killed $al killed $eax
856856 ; AVX1-NEXT: vzeroupper
868868 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
869869 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
870870 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
871 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
871 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
872872 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax
873873 ; AVX2-NEXT: # kill: def $al killed $al killed $eax
874874 ; AVX2-NEXT: vzeroupper
887887 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
888888 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
889889 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
890 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
890 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
891891 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
892892 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
893893 ; AVX512-NEXT: vzeroupper
957957 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
958958 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
959959 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
960 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
960 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
961961 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax
962962 ; AVX1-NEXT: # kill: def $al killed $al killed $eax
963963 ; AVX1-NEXT: vzeroupper
977977 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
978978 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
979979 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
980 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
980 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
981981 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax
982982 ; AVX2-NEXT: # kill: def $al killed $al killed $eax
983983 ; AVX2-NEXT: vzeroupper
997997 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
998998 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
999999 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
1000 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
1000 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
10011001 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
10021002 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
10031003 ; AVX512-NEXT: vzeroupper
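For reference, the reduction hunks above all share one shape; the following is a minimal LLVM IR sketch of it (function and value names are illustrative, not copied from the test files):

; A v4i64 xor reduction: after the two halves are combined once, every
; later step only feeds an extract of the low 128 bits, so the remaining
; wide xor can run on xmm instead of ymm/zmm, which is what the updated
; vpxor/vxorps checks above show.
define i64 @reduce_xor_v4i64(<4 x i64> %v) {
  %hi  = shufflevector <4 x i64> %v, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %r0  = xor <4 x i64> %v, %hi
  %odd = shufflevector <4 x i64> %r0, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %r1  = xor <4 x i64> %r0, %odd
  %res = extractelement <4 x i64> %r1, i32 0
  ret i64 %res
}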
689689 ; AVX2-NEXT: vpbroadcastw %xmm1, %ymm2
690690 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
691691 ; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm1
692 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
693 ; AVX2-NEXT: vpsubw %ymm2, %ymm3, %ymm2
692 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
693 ; AVX2-NEXT: vpsubw %xmm2, %xmm3, %xmm2
694694 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
695695 ; AVX2-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
696696 ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
701701 ; AVX512F-NEXT: vpbroadcastw %xmm1, %ymm2
702702 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
703703 ; AVX512F-NEXT: vpsllw %xmm1, %ymm0, %ymm1
704 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
705 ; AVX512F-NEXT: vpsubw %ymm2, %ymm3, %ymm2
704 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
705 ; AVX512F-NEXT: vpsubw %xmm2, %xmm3, %xmm2
706706 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
707707 ; AVX512F-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
708708 ; AVX512F-NEXT: vpor %ymm0, %ymm1, %ymm0
713713 ; AVX512VL-NEXT: vpbroadcastw %xmm1, %ymm2
714714 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
715715 ; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm1
716 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
717 ; AVX512VL-NEXT: vpsubw %ymm2, %ymm3, %ymm2
716 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
717 ; AVX512VL-NEXT: vpsubw %xmm2, %xmm3, %xmm2
718718 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
719719 ; AVX512VL-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
720720 ; AVX512VL-NEXT: vpor %ymm0, %ymm1, %ymm0
315315 ; AVX512F-NEXT: vpbroadcastw %xmm2, %ymm3
316316 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
317317 ; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm4
318 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
319 ; AVX512F-NEXT: vpsubw %ymm3, %ymm5, %ymm3
318 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
319 ; AVX512F-NEXT: vpsubw %xmm3, %xmm5, %xmm3
320320 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
321321 ; AVX512F-NEXT: vpsrlw %xmm3, %ymm0, %ymm0
322322 ; AVX512F-NEXT: vpor %ymm0, %ymm4, %ymm0
330330 ; AVX512VL-NEXT: vpbroadcastw %xmm2, %ymm3
331331 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
332332 ; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm4
333 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
334 ; AVX512VL-NEXT: vpsubw %ymm3, %ymm5, %ymm3
333 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
334 ; AVX512VL-NEXT: vpsubw %xmm3, %xmm5, %xmm3
335335 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
336336 ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm0, %ymm0
337337 ; AVX512VL-NEXT: vpor %ymm0, %ymm4, %ymm0
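The splatvar rotate hunks work the same way: only the low elements of the "subtract from bit width" node feed the scalar shift amount, so the all-16s (or all-8s) constant and the vpsubw/vpsubb shrink from ymm/zmm to xmm. A minimal IR sketch of that pattern follows (names are illustrative; it assumes a non-zero rotate amount so both shifts stay in range):

; Rotate every i16 lane left by the same variable amount:
;   rot = (x << a) | (x >> (16 - a))
; Lowering feeds both shifts a scalar amount taken from lane 0
; (vpsllw/vpsrlw with an xmm amount), so the wide sub only has its
; low 128 bits used and can be narrowed.
define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %x, i16 %a) {
  %ins   = insertelement <16 x i16> undef, i16 %a, i32 0
  %splat = shufflevector <16 x i16> %ins, <16 x i16> undef, <16 x i32> zeroinitializer
  %shl   = shl <16 x i16> %x, %splat
  %inv   = sub <16 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %splat
  %shr   = lshr <16 x i16> %x, %inv
  %rot   = or <16 x i16> %shl, %shr
  ret <16 x i16> %rot
}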
467467 ; AVX512BW-LABEL: splatvar_rotate_v64i8:
468468 ; AVX512BW: # %bb.0:
469469 ; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm2
470 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
471 ; AVX512BW-NEXT: vpsubb %zmm2, %zmm3, %zmm2
472470 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
473471 ; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm3
474472 ; AVX512BW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4
475473 ; AVX512BW-NEXT: vpsllw %xmm1, %zmm4, %zmm1
476474 ; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1
477475 ; AVX512BW-NEXT: vpandq %zmm1, %zmm3, %zmm1
476 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
477 ; AVX512BW-NEXT: vpsubb %xmm2, %xmm3, %xmm2
478478 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
479479 ; AVX512BW-NEXT: vpsrlw %xmm2, %zmm0, %zmm0
480480 ; AVX512BW-NEXT: vpsrlw %xmm2, %zmm4, %zmm2
487487 ; AVX512VLBW-LABEL: splatvar_rotate_v64i8:
488488 ; AVX512VLBW: # %bb.0:
489489 ; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %zmm2
490 ; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
491 ; AVX512VLBW-NEXT: vpsubb %zmm2, %zmm3, %zmm2
492490 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
493491 ; AVX512VLBW-NEXT: vpsllw %xmm1, %zmm0, %zmm3
494492 ; AVX512VLBW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4
495493 ; AVX512VLBW-NEXT: vpsllw %xmm1, %zmm4, %zmm1
496494 ; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %zmm1
497495 ; AVX512VLBW-NEXT: vpandq %zmm1, %zmm3, %zmm1
496 ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
497 ; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm3, %xmm2
498498 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
499499 ; AVX512VLBW-NEXT: vpsrlw %xmm2, %zmm0, %zmm0
500500 ; AVX512VLBW-NEXT: vpsrlw %xmm2, %zmm4, %zmm2