llvm.org GIT mirror llvm / 935be04
[DAGCombiner] fold extract_subvector of extract_subvector

This is the sibling fold for insert-of-insert that was added with D56604. Now that we have x86 shuffle narrowing (D57156), this change shows improvements for lots of AVX512 reduction code (not sure that we would ever expect extract-of-extract otherwise).

There's a small regression in some of the partial-permute tests (extracting followed by splat). That is tracked by PR40500: https://bugs.llvm.org/show_bug.cgi?id=40500

Differential Revision: https://reviews.llvm.org/D57336

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@352528 91177308-0d34-0410-b5e6-96231b3b80d8

Sanjay Patel 1 year, 30 days ago
19 changed file(s) with 503 addition(s) and 486 deletion(s). Raw diff Collapse all Expand all
1707817078 if (SDValue NarrowLoad = narrowExtractedVectorLoad(N, DAG))
1707917079 return NarrowLoad;
1708017080
17081 // Combine an extract of an extract into a single extract_subvector.
17082 // ext (ext X, C), 0 --> ext X, C
17083 if (isNullConstant(N->getOperand(1)) &&
17084 V.getOpcode() == ISD::EXTRACT_SUBVECTOR && V.hasOneUse() &&
17085 isa<ConstantSDNode>(V.getOperand(1))) {
17086 if (TLI.isExtractSubvectorCheap(NVT, V.getOperand(0).getValueType(),
17087 V.getConstantOperandVal(1)) &&
17088 TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NVT)) {
17089 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), NVT, V.getOperand(0),
17090 V.getOperand(1));
17091 }
17092 }
17093
1708117094 // Combine:
1708217095 // (extract_subvec (concat V1, V2, ...), i)
1708317096 // Into:
55 ; KNL-LABEL: hadd_16:
66 ; KNL: # %bb.0:
77 ; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
8 ; KNL-NEXT: vpaddd %zmm1, %zmm0, %zmm0
8 ; KNL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
99 ; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1010 ; KNL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
1111 ; KNL-NEXT: vmovd %xmm0, %eax
1414 ; SKX-LABEL: hadd_16:
1515 ; SKX: # %bb.0:
1616 ; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
17 ; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0
17 ; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
1818 ; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1919 ; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
2020 ; SKX-NEXT: vmovd %xmm0, %eax
3232 ; KNL-LABEL: hsub_16:
3333 ; KNL: # %bb.0:
3434 ; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
35 ; KNL-NEXT: vpaddd %zmm1, %zmm0, %zmm0
35 ; KNL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
3636 ; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
3737 ; KNL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
3838 ; KNL-NEXT: vmovd %xmm0, %eax
4141 ; SKX-LABEL: hsub_16:
4242 ; SKX: # %bb.0:
4343 ; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
44 ; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0
44 ; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
4545 ; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
4646 ; SKX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
4747 ; SKX-NEXT: vmovd %xmm0, %eax
5959 ; KNL-LABEL: fhadd_16:
6060 ; KNL: # %bb.0:
6161 ; KNL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
62 ; KNL-NEXT: vaddps %zmm1, %zmm0, %zmm0
62 ; KNL-NEXT: vaddps %xmm1, %xmm0, %xmm0
6363 ; KNL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
6464 ; KNL-NEXT: vaddps %xmm1, %xmm0, %xmm0
6565 ; KNL-NEXT: retq
6767 ; SKX-LABEL: fhadd_16:
6868 ; SKX: # %bb.0:
6969 ; SKX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
70 ; SKX-NEXT: vaddps %zmm1, %zmm0, %zmm0
70 ; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0
7171 ; SKX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
7272 ; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0
7373 ; SKX-NEXT: vzeroupper
8484 ; KNL-LABEL: fhsub_16:
8585 ; KNL: # %bb.0:
8686 ; KNL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
87 ; KNL-NEXT: vaddps %zmm1, %zmm0, %zmm0
87 ; KNL-NEXT: vaddps %xmm1, %xmm0, %xmm0
8888 ; KNL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
8989 ; KNL-NEXT: vsubps %xmm1, %xmm0, %xmm0
9090 ; KNL-NEXT: retq
9292 ; SKX-LABEL: fhsub_16:
9393 ; SKX: # %bb.0:
9494 ; SKX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
95 ; SKX-NEXT: vaddps %zmm1, %zmm0, %zmm0
95 ; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0
9696 ; SKX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
9797 ; SKX-NEXT: vsubps %xmm1, %xmm0, %xmm0
9898 ; SKX-NEXT: vzeroupper
21722172 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
21732173 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask7:
21742174 ; CHECK: # %bb.0:
2175 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
2176 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,0,3,4]
2175 ; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm3
2176 ; CHECK-NEXT: vpbroadcastq %xmm3, %ymm3
2177 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,0,3,7]
21772178 ; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm4
21782179 ; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
21792180 ; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
21872188 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %mask) {
21882189 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask7:
21892190 ; CHECK: # %bb.0:
2190 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
2191 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,0,3,4]
2191 ; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm2
2192 ; CHECK-NEXT: vpbroadcastq %xmm2, %ymm3
2193 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,0,3,7]
21922194 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
21932195 ; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm2 {%k1} {z}
21942196 ; CHECK-NEXT: vmovdqa %ymm2, %ymm0
31203122 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
31213123 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask1:
31223124 ; CHECK: # %bb.0:
3123 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
3125 ; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm3
31243126 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
31253127 ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,2]
31263128 ; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2],xmm0[3]
31383140 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %mask) {
31393141 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1:
31403142 ; CHECK: # %bb.0:
3141 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
3143 ; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm2
31423144 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
31433145 ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,2]
31443146 ; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
31893191 ; CHECK: # %bb.0:
31903192 ; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
31913193 ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm1
3192 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0
3194 ; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm0
31933195 ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
31943196 ; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
31953197 ; CHECK-NEXT: vzeroupper
32023204 ; CHECK: # %bb.0:
32033205 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,2,4,6,4,6,6,7]
32043206 ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm3
3205 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0
3207 ; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm0
32063208 ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
32073209 ; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3]
32083210 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
32213223 ; CHECK: # %bb.0:
32223224 ; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
32233225 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm2
3224 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0
3226 ; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm0
32253227 ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
32263228 ; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
32273229 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
38033805 define <4 x double> @test_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec) {
38043806 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask3:
38053807 ; CHECK: # %bb.0:
3806 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
3807 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,1,4]
3808 ; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm1
3809 ; CHECK-NEXT: vbroadcastsd %xmm1, %ymm2
3810 ; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,1,7]
38083811 ; CHECK-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1
38093812 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
38103813 ; CHECK-NEXT: retq
38143817 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
38153818 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask3:
38163819 ; CHECK: # %bb.0:
3817 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
3818 ; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [0,2,1,4]
3820 ; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm3
3821 ; CHECK-NEXT: vbroadcastsd %xmm3, %ymm3
3822 ; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [0,2,1,7]
38193823 ; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm4
38203824 ; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0
38213825 ; CHECK-NEXT: vcmpeqpd %ymm0, %ymm2, %k1
38303834 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec, <4 x double> %mask) {
38313835 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask3:
38323836 ; CHECK: # %bb.0:
3833 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
3834 ; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [0,2,1,4]
3837 ; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm2
3838 ; CHECK-NEXT: vbroadcastsd %xmm2, %ymm3
3839 ; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [0,2,1,7]
38353840 ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
38363841 ; CHECK-NEXT: vcmpeqpd %ymm4, %ymm1, %k1
38373842 ; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm2 {%k1} {z}
38453850 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
38463851 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask4:
38473852 ; CHECK: # %bb.0:
3848 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
3849 ; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [1,1,5,5]
3850 ; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm4
3851 ; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0
3852 ; CHECK-NEXT: vcmpeqpd %ymm0, %ymm2, %k1
3853 ; CHECK-NEXT: vblendmpd %ymm4, %ymm1, %ymm0 {%k1}
3853 ; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm3
3854 ; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
3855 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
3856 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
3857 ; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,0,1,1]
3858 ; CHECK-NEXT: vmovapd %ymm1, %ymm0
38543859 ; CHECK-NEXT: retq
38553860 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
38563861 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
38613866 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %mask) {
38623867 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4:
38633868 ; CHECK: # %bb.0:
3864 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
3865 ; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [1,1,5,5]
3866 ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
3867 ; CHECK-NEXT: vcmpeqpd %ymm4, %ymm1, %k1
3868 ; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm2 {%k1} {z}
3869 ; CHECK-NEXT: vmovapd %ymm2, %ymm0
3869 ; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm2
3870 ; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
3871 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
3872 ; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
3873 ; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,1,1]
38703874 ; CHECK-NEXT: retq
38713875 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
38723876 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
303303 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
304304 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
305305 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
306 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
306 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
307307 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
308 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
308 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
309309 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
310310 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
311311 ; AVX512-NEXT: vmovd %xmm0, %eax
496496 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
497497 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
498498 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
499 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
499 ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
500500 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
501 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
501 ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
502502 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
503503 ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
504504 ; AVX512F-NEXT: vmovd %xmm0, %eax
525525 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
526526 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
527527 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
528 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
528 ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
529529 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
530 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
530 ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
531531 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
532532 ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
533533 ; AVX512BW-NEXT: vmovd %xmm0, %eax
887887 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
888888 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
889889 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
890 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
890 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
891891 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
892 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
892 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
893893 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
894894 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
895895 ; AVX512-NEXT: vmovd %xmm0, %eax
11001100 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
11011101 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
11021102 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
1103 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
1103 ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
11041104 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1105 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
1105 ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
11061106 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
11071107 ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
11081108 ; AVX512F-NEXT: vmovd %xmm0, %eax
11301130 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
11311131 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
11321132 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
1133 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
1133 ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
11341134 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1135 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
1135 ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
11361136 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
11371137 ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
11381138 ; AVX512BW-NEXT: vmovd %xmm0, %eax
15171517 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
15181518 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
15191519 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
1520 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
1520 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
15211521 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1522 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
1522 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
15231523 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
15241524 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
15251525 ; AVX512-NEXT: vmovd %xmm0, %eax
17761776 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
17771777 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
17781778 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
1779 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
1779 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
17801780 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1781 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
1781 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
17821782 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
17831783 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
17841784 ; AVX512-NEXT: vmovd %xmm0, %eax
252252 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
253253 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
254254 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
255 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
255 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
256256 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
257 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
257 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
258258 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
259259 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
260260 ; CHECK-NEXT: vmovd %xmm0, %eax
378378 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
379379 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
380380 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
381 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
381 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
382382 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
383 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
383 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
384384 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
385385 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
386386 ; CHECK-NEXT: vmovd %xmm0, %eax
102102 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
103103 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
104104 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
105 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
105 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
106106 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
107 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
107 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
108108 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
109109 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
110110 ; AVX512-NEXT: vmovd %xmm0, %eax
365365 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
366366 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
367367 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
368 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
368 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
369369 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
370 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
370 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
371371 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
372372 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
373373 ; AVX512-NEXT: vmovd %xmm0, %eax
980980 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
981981 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
982982 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
983 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
983 ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
984984 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
985 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
985 ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
986986 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
987987 ; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
988988 ; AVX512F-NEXT: vmovd %xmm0, %eax
10091009 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
10101010 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
10111011 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
1012 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
1012 ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
10131013 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1014 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
1014 ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
10151015 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
10161016 ; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
10171017 ; AVX512BW-NEXT: vmovd %xmm0, %eax
14661466 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
14671467 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
14681468 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
1469 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
1469 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
14701470 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1471 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
1471 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
14721472 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
14731473 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
14741474 ; AVX512-NEXT: vmovd %xmm0, %eax
15651565 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
15661566 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
15671567 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
1568 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
1568 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
15691569 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1570 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
1570 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
15711571 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
15721572 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
15731573 ; AVX512-NEXT: vmovd %xmm0, %eax
116116 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
117117 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
118118 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
119 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
119 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
120120 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
121121 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
122122 ; AVX512-NEXT: vmovq %xmm0, %rax
179179 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
180180 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
181181 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
182 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
182 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
183183 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
184184 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
185185 ; AVX512-NEXT: vmovq %xmm0, %rax
345345 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
346346 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
347347 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
348 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
349 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
350 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
348 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
349 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
350 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
351351 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
352352 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
353353 ; AVX512-NEXT: vmovd %xmm0, %eax
416416 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
417417 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
418418 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
419 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
420 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
421 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
419 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
420 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
421 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
422422 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
423423 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
424424 ; AVX512-NEXT: vmovd %xmm0, %eax
656656 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
657657 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
658658 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
659 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
660 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
661 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
662 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
663 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
659 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
660 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
661 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
662 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
663 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
664664 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
665665 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
666666 ; AVX512-NEXT: vmovd %xmm0, %eax
740740 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
741741 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
742742 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
743 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
744 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
745 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
746 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
747 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
743 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
744 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
745 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
746 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
747 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
748748 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
749749 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
750750 ; AVX512-NEXT: vmovd %xmm0, %eax
11361136 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
11371137 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
11381138 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
1139 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
1140 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1141 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
1142 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1143 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
1139 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1140 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1141 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1142 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1143 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
11441144 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
1145 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
1145 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
11461146 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
11471147 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
11481148 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
12521252 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
12531253 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
12541254 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
1255 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
1256 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1257 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
1258 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1259 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
1255 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1256 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1257 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1258 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1259 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
12601260 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
1261 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
1261 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
12621262 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
12631263 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
12641264 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
116116 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
117117 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
118118 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
119 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
119 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
120120 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
121121 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
122122 ; AVX512-NEXT: vmovq %xmm0, %rax
179179 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
180180 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
181181 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
182 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
182 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
183183 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
184184 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
185185 ; AVX512-NEXT: vmovq %xmm0, %rax
345345 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
346346 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
347347 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
348 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
349 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
350 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
348 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
349 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
350 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
351351 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
352352 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
353353 ; AVX512-NEXT: vmovd %xmm0, %eax
416416 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
417417 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
418418 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
419 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
420 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
421 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
419 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
420 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
421 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
422422 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
423423 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
424424 ; AVX512-NEXT: vmovd %xmm0, %eax
654654 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
655655 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
656656 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
657 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
658 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
659 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
660 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
661 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
657 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
658 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
659 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
660 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
661 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
662662 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
663663 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
664664 ; AVX512-NEXT: vmovd %xmm0, %eax
738738 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
739739 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
740740 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
741 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
742 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
743 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
744 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
745 ; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
741 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
742 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
743 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
744 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
745 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
746746 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
747747 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
748748 ; AVX512-NEXT: vmovd %xmm0, %eax
11261126 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
11271127 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
11281128 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
1129 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
1130 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1131 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
1132 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1133 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
1129 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1130 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1131 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1132 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1133 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
11341134 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
1135 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
1135 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
11361136 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
11371137 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
11381138 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
12421242 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
12431243 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
12441244 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
1245 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
1246 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1247 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
1248 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1249 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
1245 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1246 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1247 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1248 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1249 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
12501250 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
1251 ; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
1251 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
12521252 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
12531253 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
12541254 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
114114 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
115115 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
116116 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
117 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
117 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
118118 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
119119 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
120120 ; AVX512-NEXT: vmovq %xmm0, %rax
171171 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
172172 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
173173 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
174 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
174 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
175175 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
176176 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
177177 ; AVX512-NEXT: vmovq %xmm0, %rax
335335 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
336336 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
337337 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
338 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
339 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
340 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
338 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
339 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
340 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
341341 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
342342 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
343343 ; AVX512-NEXT: vmovd %xmm0, %eax
400400 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
401401 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
402402 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
403 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
404 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
405 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
403 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
404 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
405 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
406406 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
407407 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
408408 ; AVX512-NEXT: vmovd %xmm0, %eax
638638 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
639639 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
640640 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
641 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
642 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
643 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
644 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
645 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
641 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
642 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
643 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
644 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
645 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
646646 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
647647 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
648648 ; AVX512-NEXT: vmovd %xmm0, %eax
716716 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
717717 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
718718 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
719 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
720 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
721 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
722 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
723 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
719 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
720 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
721 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
722 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
723 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
724724 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
725725 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
726726 ; AVX512-NEXT: vmovd %xmm0, %eax
11101110 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
11111111 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
11121112 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
1113 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
1114 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1115 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
1116 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1117 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
1113 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
1114 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1115 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
1116 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1117 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
11181118 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
1119 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
1119 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
11201120 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
11211121 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
11221122 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
12201220 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
12211221 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
12221222 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
1223 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
1224 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1225 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
1226 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1227 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
1223 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
1224 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1225 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
1226 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1227 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
12281228 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
1229 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
1229 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
12301230 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
12311231 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
12321232 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
114114 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
115115 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
116116 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
117 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
117 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
118118 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
119119 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
120120 ; AVX512-NEXT: vmovq %xmm0, %rax
171171 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
172172 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
173173 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
174 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
174 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
175175 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
176176 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
177177 ; AVX512-NEXT: vmovq %xmm0, %rax
335335 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
336336 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
337337 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
338 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
339 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
340 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
338 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
339 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
340 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
341341 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
342342 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
343343 ; AVX512-NEXT: vmovd %xmm0, %eax
400400 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
401401 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
402402 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
403 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
404 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
405 ; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
403 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
404 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
405 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
406406 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
407407 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
408408 ; AVX512-NEXT: vmovd %xmm0, %eax
636636 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
637637 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
638638 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
639 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
640 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
641 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
642 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
643 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
639 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
640 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
641 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
642 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
643 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
644644 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
645645 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
646646 ; AVX512-NEXT: vmovd %xmm0, %eax
714714 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
715715 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
716716 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
717 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
718 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
719 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
720 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
721 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
717 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
718 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
719 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
720 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
721 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
722722 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
723723 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
724724 ; AVX512-NEXT: vmovd %xmm0, %eax
11001100 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
11011101 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
11021102 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
1103 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
1104 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1105 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
1106 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1107 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
1103 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
1104 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1105 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
1106 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1107 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
11081108 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
1109 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
1109 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
11101110 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
11111111 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
11121112 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
12101210 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
12111211 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
12121212 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
1213 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
1214 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1215 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
1216 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1217 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
1213 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
1214 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1215 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
1216 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1217 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
12181218 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
1219 ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
1219 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
12201220 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
12211221 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
12221222 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
167167 ; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm0
168168 ; AVX512-NEXT: vaddps %zmm0, %zmm1, %zmm0
169169 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
170 ; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
171 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
172 ; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
170 ; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
171 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
172 ; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
173173 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
174174 ; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
175175 ; AVX512-NEXT: vzeroupper
344344 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
345345 ; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
346346 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
347 ; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
348 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
349 ; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
347 ; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
348 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
349 ; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
350350 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
351351 ; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
352352 ; AVX512-NEXT: vzeroupper
521521 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
522522 ; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
523523 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
524 ; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
525 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
526 ; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
524 ; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
525 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
526 ; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
527527 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
528528 ; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
529529 ; AVX512-NEXT: vzeroupper
615615 ; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm0
616616 ; AVX512-NEXT: vaddpd %zmm0, %zmm1, %zmm0
617617 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
618 ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
618 ; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
619619 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
620620 ; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
621621 ; AVX512-NEXT: vzeroupper
657657 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
658658 ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
659659 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
660 ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
660 ; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
661661 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
662662 ; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
663663 ; AVX512-NEXT: vzeroupper
752752 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
753753 ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
754754 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
755 ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
755 ; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
756756 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
757757 ; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
758758 ; AVX512-NEXT: vzeroupper
794794 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
795795 ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
796796 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
797 ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
797 ; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
798798 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
799799 ; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
800800 ; AVX512-NEXT: vzeroupper
889889 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
890890 ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
891891 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
892 ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
892 ; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
893893 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
894894 ; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
895895 ; AVX512-NEXT: vzeroupper
931931 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
932932 ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
933933 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
934 ; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
934 ; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
935935 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
936936 ; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
937937 ; AVX512-NEXT: vzeroupper
167167 ; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm0
168168 ; AVX512-NEXT: vmulps %zmm0, %zmm1, %zmm0
169169 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
170 ; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
171 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
172 ; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
170 ; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
171 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
172 ; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
173173 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
174174 ; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
175175 ; AVX512-NEXT: vzeroupper
344344 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
345345 ; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
346346 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
347 ; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
348 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
349 ; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
347 ; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
348 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
349 ; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
350350 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
351351 ; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
352352 ; AVX512-NEXT: vzeroupper
521521 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
522522 ; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
523523 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
524 ; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
525 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
526 ; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
524 ; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
525 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
526 ; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
527527 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
528528 ; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
529529 ; AVX512-NEXT: vzeroupper
615615 ; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm0
616616 ; AVX512-NEXT: vmulpd %zmm0, %zmm1, %zmm0
617617 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
618 ; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
618 ; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
619619 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
620620 ; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
621621 ; AVX512-NEXT: vzeroupper
657657 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
658658 ; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
659659 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
660 ; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
660 ; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
661661 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
662662 ; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
663663 ; AVX512-NEXT: vzeroupper
752752 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
753753 ; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
754754 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
755 ; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
755 ; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
756756 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
757757 ; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
758758 ; AVX512-NEXT: vzeroupper
794794 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
795795 ; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
796796 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
797 ; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
797 ; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
798798 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
799799 ; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
800800 ; AVX512-NEXT: vzeroupper
889889 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
890890 ; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
891891 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
892 ; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
892 ; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
893893 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
894894 ; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
895895 ; AVX512-NEXT: vzeroupper
931931 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
932932 ; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
933933 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
934 ; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
934 ; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
935935 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
936936 ; AVX512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
937937 ; AVX512-NEXT: vzeroupper
439439 ; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
440440 ; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0
441441 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
442 ; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0
442 ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
443443 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
444444 ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
445445 ; AVX512DQVL-NEXT: vmovq %xmm0, %rax
760760 ; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
761761 ; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0
762762 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
763 ; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0
763 ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
764764 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
765765 ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
766766 ; AVX512DQVL-NEXT: vmovq %xmm0, %rax
996996 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
997997 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
998998 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
999 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
999 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
10001000 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1001 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
1001 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
10021002 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
10031003 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
10041004 ; AVX512-NEXT: vmovd %xmm0, %eax
11061106 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
11071107 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
11081108 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
1109 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
1109 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
11101110 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1111 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
1111 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
11121112 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
11131113 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
11141114 ; AVX512-NEXT: vmovd %xmm0, %eax
13461346 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
13471347 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
13481348 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
1349 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
1349 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
13501350 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1351 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
1351 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
13521352 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1353 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
1353 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
13541354 ; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1
13551355 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
13561356 ; AVX512BW-NEXT: vmovd %xmm0, %eax
13631363 ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
13641364 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
13651365 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
1366 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
1366 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
13671367 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1368 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
1368 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
13691369 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1370 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
1370 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
13711371 ; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm1
13721372 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
13731373 ; AVX512BWVL-NEXT: vmovd %xmm0, %eax
14791479 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
14801480 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
14811481 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
1482 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
1482 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
14831483 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1484 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
1484 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
14851485 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1486 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
1486 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
14871487 ; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1
14881488 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
14891489 ; AVX512BW-NEXT: vmovd %xmm0, %eax
14971497 ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
14981498 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
14991499 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
1500 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
1500 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
15011501 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1502 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
1502 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
15031503 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1504 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
1504 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
15051505 ; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm1
15061506 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
15071507 ; AVX512BWVL-NEXT: vmovd %xmm0, %eax
31063106 ; AVX512BW: # %bb.0:
31073107 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
31083108 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
3109 ; AVX512BW-NEXT: vpmullw %zmm2, %zmm3, %zmm3
3110 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3111 ; AVX512BW-NEXT: vpandq %zmm2, %zmm3, %zmm3
3109 ; AVX512BW-NEXT: vpmullw %zmm2, %zmm3, %zmm2
3110 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3111 ; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2
31123112 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
31133113 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
31143114 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
3115 ; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
3116 ; AVX512BW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
3115 ; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
3116 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
31173117 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
3118 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
3118 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
31193119 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
3120 ; AVX512BW-NEXT: vpmullw %zmm4, %zmm3, %zmm3
3121 ; AVX512BW-NEXT: vpandq %zmm2, %zmm3, %zmm3
3120 ; AVX512BW-NEXT: vpmullw %zmm4, %zmm2, %zmm2
3121 ; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2
31223122 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
31233123 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
31243124 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
3125 ; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
3126 ; AVX512BW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
3125 ; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
3126 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
31273127 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
3128 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
3128 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
31293129 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
3130 ; AVX512BW-NEXT: vpmullw %zmm4, %zmm3, %zmm3
3131 ; AVX512BW-NEXT: vpandq %zmm2, %zmm3, %zmm3
3130 ; AVX512BW-NEXT: vpmullw %zmm4, %zmm2, %zmm2
3131 ; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2
31323132 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
31333133 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
31343134 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
3135 ; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
3136 ; AVX512BW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
3135 ; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
3136 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
31373137 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
31383138 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
31393139 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
31403140 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
3141 ; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
3141 ; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
31423142 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
31433143 ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
3144 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
3144 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
31453145 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
3146 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm3[0],zmm0[0],zmm3[1],zmm0[1],zmm3[2],zmm0[2],zmm3[3],zmm0[3],zmm3[4],zmm0[4],zmm3[5],zmm0[5],zmm3[6],zmm0[6],zmm3[7],zmm0[7],zmm3[16],zmm0[16],zmm3[17],zmm0[17],zmm3[18],zmm0[18],zmm3[19],zmm0[19],zmm3[20],zmm0[20],zmm3[21],zmm0[21],zmm3[22],zmm0[22],zmm3[23],zmm0[23],zmm3[32],zmm0[32],zmm3[33],zmm0[33],zmm3[34],zmm0[34],zmm3[35],zmm0[35],zmm3[36],zmm0[36],zmm3[37],zmm0[37],zmm3[38],zmm0[38],zmm3[39],zmm0[39],zmm3[48],zmm0[48],zmm3[49],zmm0[49],zmm3[50],zmm0[50],zmm3[51],zmm0[51],zmm3[52],zmm0[52],zmm3[53],zmm0[53],zmm3[54],zmm0[54],zmm3[55],zmm0[55]
3147 ; AVX512BW-NEXT: vpmullw %zmm3, %zmm0, %zmm0
3148 ; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
3146 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],zmm0[0],zmm2[1],zmm0[1],zmm2[2],zmm0[2],zmm2[3],zmm0[3],zmm2[4],zmm0[4],zmm2[5],zmm0[5],zmm2[6],zmm0[6],zmm2[7],zmm0[7],zmm2[16],zmm0[16],zmm2[17],zmm0[17],zmm2[18],zmm0[18],zmm2[19],zmm0[19],zmm2[20],zmm0[20],zmm2[21],zmm0[21],zmm2[22],zmm0[22],zmm2[23],zmm0[23],zmm2[32],zmm0[32],zmm2[33],zmm0[33],zmm2[34],zmm0[34],zmm2[35],zmm0[35],zmm2[36],zmm0[36],zmm2[37],zmm0[37],zmm2[38],zmm0[38],zmm2[39],zmm0[39],zmm2[48],zmm0[48],zmm2[49],zmm0[49],zmm2[50],zmm0[50],zmm2[51],zmm0[51],zmm2[52],zmm0[52],zmm2[53],zmm0[53],zmm2[54],zmm0[54],zmm2[55],zmm0[55]
3147 ; AVX512BW-NEXT: vpmullw %zmm2, %zmm0, %zmm0
3148 ; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
31493149 ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
3150 ; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm3
3150 ; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm2
31513151 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
3152 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm3[0],zmm0[0],zmm3[1],zmm0[1],zmm3[2],zmm0[2],zmm3[3],zmm0[3],zmm3[4],zmm0[4],zmm3[5],zmm0[5],zmm3[6],zmm0[6],zmm3[7],zmm0[7],zmm3[16],zmm0[16],zmm3[17],zmm0[17],zmm3[18],zmm0[18],zmm3[19],zmm0[19],zmm3[20],zmm0[20],zmm3[21],zmm0[21],zmm3[22],zmm0[22],zmm3[23],zmm0[23],zmm3[32],zmm0[32],zmm3[33],zmm0[33],zmm3[34],zmm0[34],zmm3[35],zmm0[35],zmm3[36],zmm0[36],zmm3[37],zmm0[37],zmm3[38],zmm0[38],zmm3[39],zmm0[39],zmm3[48],zmm0[48],zmm3[49],zmm0[49],zmm3[50],zmm0[50],zmm3[51],zmm0[51],zmm3[52],zmm0[52],zmm3[53],zmm0[53],zmm3[54],zmm0[54],zmm3[55],zmm0[55]
3153 ; AVX512BW-NEXT: vpmullw %zmm3, %zmm0, %zmm0
3154 ; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
3152 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],zmm0[0],zmm2[1],zmm0[1],zmm2[2],zmm0[2],zmm2[3],zmm0[3],zmm2[4],zmm0[4],zmm2[5],zmm0[5],zmm2[6],zmm0[6],zmm2[7],zmm0[7],zmm2[16],zmm0[16],zmm2[17],zmm0[17],zmm2[18],zmm0[18],zmm2[19],zmm0[19],zmm2[20],zmm0[20],zmm2[21],zmm0[21],zmm2[22],zmm0[22],zmm2[23],zmm0[23],zmm2[32],zmm0[32],zmm2[33],zmm0[33],zmm2[34],zmm0[34],zmm2[35],zmm0[35],zmm2[36],zmm0[36],zmm2[37],zmm0[37],zmm2[38],zmm0[38],zmm2[39],zmm0[39],zmm2[48],zmm0[48],zmm2[49],zmm0[49],zmm2[50],zmm0[50],zmm2[51],zmm0[51],zmm2[52],zmm0[52],zmm2[53],zmm0[53],zmm2[54],zmm0[54],zmm2[55],zmm0[55]
3153 ; AVX512BW-NEXT: vpmullw %zmm2, %zmm0, %zmm0
3154 ; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
31553155 ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
3156 ; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm3
3156 ; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm2
31573157 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
3158 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm3[0],zmm0[0],zmm3[1],zmm0[1],zmm3[2],zmm0[2],zmm3[3],zmm0[3],zmm3[4],zmm0[4],zmm3[5],zmm0[5],zmm3[6],zmm0[6],zmm3[7],zmm0[7],zmm3[16],zmm0[16],zmm3[17],zmm0[17],zmm3[18],zmm0[18],zmm3[19],zmm0[19],zmm3[20],zmm0[20],zmm3[21],zmm0[21],zmm3[22],zmm0[22],zmm3[23],zmm0[23],zmm3[32],zmm0[32],zmm3[33],zmm0[33],zmm3[34],zmm0[34],zmm3[35],zmm0[35],zmm3[36],zmm0[36],zmm3[37],zmm0[37],zmm3[38],zmm0[38],zmm3[39],zmm0[39],zmm3[48],zmm0[48],zmm3[49],zmm0[49],zmm3[50],zmm0[50],zmm3[51],zmm0[51],zmm3[52],zmm0[52],zmm3[53],zmm0[53],zmm3[54],zmm0[54],zmm3[55],zmm0[55]
3159 ; AVX512BW-NEXT: vpmullw %zmm3, %zmm0, %zmm0
3160 ; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
3158 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],zmm0[0],zmm2[1],zmm0[1],zmm2[2],zmm0[2],zmm2[3],zmm0[3],zmm2[4],zmm0[4],zmm2[5],zmm0[5],zmm2[6],zmm0[6],zmm2[7],zmm0[7],zmm2[16],zmm0[16],zmm2[17],zmm0[17],zmm2[18],zmm0[18],zmm2[19],zmm0[19],zmm2[20],zmm0[20],zmm2[21],zmm0[21],zmm2[22],zmm0[22],zmm2[23],zmm0[23],zmm2[32],zmm0[32],zmm2[33],zmm0[33],zmm2[34],zmm0[34],zmm2[35],zmm0[35],zmm2[36],zmm0[36],zmm2[37],zmm0[37],zmm2[38],zmm0[38],zmm2[39],zmm0[39],zmm2[48],zmm0[48],zmm2[49],zmm0[49],zmm2[50],zmm0[50],zmm2[51],zmm0[51],zmm2[52],zmm0[52],zmm2[53],zmm0[53],zmm2[54],zmm0[54],zmm2[55],zmm0[55]
3159 ; AVX512BW-NEXT: vpmullw %zmm2, %zmm0, %zmm0
3160 ; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
31613161 ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
31623162 ; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
31633163 ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
31683168 ; AVX512BWVL: # %bb.0:
31693169 ; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
31703170 ; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
3171 ; AVX512BWVL-NEXT: vpmullw %zmm2, %zmm3, %zmm3
3172 ; AVX512BWVL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3173 ; AVX512BWVL-NEXT: vpandq %zmm2, %zmm3, %zmm3
3171 ; AVX512BWVL-NEXT: vpmullw %zmm2, %zmm3, %zmm2
3172 ; AVX512BWVL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3173 ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm2, %zmm2
31743174 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
31753175 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
31763176 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
3177 ; AVX512BWVL-NEXT: vpandq %zmm2, %zmm0, %zmm0
3178 ; AVX512BWVL-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
3177 ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0
3178 ; AVX512BWVL-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
31793179 ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
3180 ; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
3180 ; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
31813181 ; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
3182 ; AVX512BWVL-NEXT: vpmullw %zmm4, %zmm3, %zmm3
3183 ; AVX512BWVL-NEXT: vpandq %zmm2, %zmm3, %zmm3
3182 ; AVX512BWVL-NEXT: vpmullw %zmm4, %zmm2, %zmm2
3183 ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm2, %zmm2
31843184 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
31853185 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
31863186 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
3187 ; AVX512BWVL-NEXT: vpandq %zmm2, %zmm0, %zmm0
3188 ; AVX512BWVL-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
3187 ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0
3188 ; AVX512BWVL-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
31893189 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
3190 ; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
3190 ; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
31913191 ; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
3192 ; AVX512BWVL-NEXT: vpmullw %zmm4, %zmm3, %zmm3
3193 ; AVX512BWVL-NEXT: vpandq %zmm2, %zmm3, %zmm3
3192 ; AVX512BWVL-NEXT: vpmullw %zmm4, %zmm2, %zmm2
3193 ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm2, %zmm2
31943194 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
31953195 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
31963196 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
3197 ; AVX512BWVL-NEXT: vpandq %zmm2, %zmm0, %zmm0
3198 ; AVX512BWVL-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
3197 ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0
3198 ; AVX512BWVL-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
31993199 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
32003200 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
32013201 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
32023202 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
3203 ; AVX512BWVL-NEXT: vpandq %zmm2, %zmm0, %zmm0
3203 ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0
32043204 ; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
32053205 ; AVX512BWVL-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
3206 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
3206 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
32073207 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
3208 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm3[0],zmm0[0],zmm3[1],zmm0[1],zmm3[2],zmm0[2],zmm3[3],zmm0[3],zmm3[4],zmm0[4],zmm3[5],zmm0[5],zmm3[6],zmm0[6],zmm3[7],zmm0[7],zmm3[16],zmm0[16],zmm3[17],zmm0[17],zmm3[18],zmm0[18],zmm3[19],zmm0[19],zmm3[20],zmm0[20],zmm3[21],zmm0[21],zmm3[22],zmm0[22],zmm3[23],zmm0[23],zmm3[32],zmm0[32],zmm3[33],zmm0[33],zmm3[34],zmm0[34],zmm3[35],zmm0[35],zmm3[36],zmm0[36],zmm3[37],zmm0[37],zmm3[38],zmm0[38],zmm3[39],zmm0[39],zmm3[48],zmm0[48],zmm3[49],zmm0[49],zmm3[50],zmm0[50],zmm3[51],zmm0[51],zmm3[52],zmm0[52],zmm3[53],zmm0[53],zmm3[54],zmm0[54],zmm3[55],zmm0[55]
3209 ; AVX512BWVL-NEXT: vpmullw %zmm3, %zmm0, %zmm0
3210 ; AVX512BWVL-NEXT: vpandq %zmm2, %zmm0, %zmm0
3208 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],zmm0[0],zmm2[1],zmm0[1],zmm2[2],zmm0[2],zmm2[3],zmm0[3],zmm2[4],zmm0[4],zmm2[5],zmm0[5],zmm2[6],zmm0[6],zmm2[7],zmm0[7],zmm2[16],zmm0[16],zmm2[17],zmm0[17],zmm2[18],zmm0[18],zmm2[19],zmm0[19],zmm2[20],zmm0[20],zmm2[21],zmm0[21],zmm2[22],zmm0[22],zmm2[23],zmm0[23],zmm2[32],zmm0[32],zmm2[33],zmm0[33],zmm2[34],zmm0[34],zmm2[35],zmm0[35],zmm2[36],zmm0[36],zmm2[37],zmm0[37],zmm2[38],zmm0[38],zmm2[39],zmm0[39],zmm2[48],zmm0[48],zmm2[49],zmm0[49],zmm2[50],zmm0[50],zmm2[51],zmm0[51],zmm2[52],zmm0[52],zmm2[53],zmm0[53],zmm2[54],zmm0[54],zmm2[55],zmm0[55]
3209 ; AVX512BWVL-NEXT: vpmullw %zmm2, %zmm0, %zmm0
3210 ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0
32113211 ; AVX512BWVL-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
3212 ; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm3
3212 ; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm2
32133213 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
3214 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm3[0],zmm0[0],zmm3[1],zmm0[1],zmm3[2],zmm0[2],zmm3[3],zmm0[3],zmm3[4],zmm0[4],zmm3[5],zmm0[5],zmm3[6],zmm0[6],zmm3[7],zmm0[7],zmm3[16],zmm0[16],zmm3[17],zmm0[17],zmm3[18],zmm0[18],zmm3[19],zmm0[19],zmm3[20],zmm0[20],zmm3[21],zmm0[21],zmm3[22],zmm0[22],zmm3[23],zmm0[23],zmm3[32],zmm0[32],zmm3[33],zmm0[33],zmm3[34],zmm0[34],zmm3[35],zmm0[35],zmm3[36],zmm0[36],zmm3[37],zmm0[37],zmm3[38],zmm0[38],zmm3[39],zmm0[39],zmm3[48],zmm0[48],zmm3[49],zmm0[49],zmm3[50],zmm0[50],zmm3[51],zmm0[51],zmm3[52],zmm0[52],zmm3[53],zmm0[53],zmm3[54],zmm0[54],zmm3[55],zmm0[55]
3215 ; AVX512BWVL-NEXT: vpmullw %zmm3, %zmm0, %zmm0
3216 ; AVX512BWVL-NEXT: vpandq %zmm2, %zmm0, %zmm0
3214 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],zmm0[0],zmm2[1],zmm0[1],zmm2[2],zmm0[2],zmm2[3],zmm0[3],zmm2[4],zmm0[4],zmm2[5],zmm0[5],zmm2[6],zmm0[6],zmm2[7],zmm0[7],zmm2[16],zmm0[16],zmm2[17],zmm0[17],zmm2[18],zmm0[18],zmm2[19],zmm0[19],zmm2[20],zmm0[20],zmm2[21],zmm0[21],zmm2[22],zmm0[22],zmm2[23],zmm0[23],zmm2[32],zmm0[32],zmm2[33],zmm0[33],zmm2[34],zmm0[34],zmm2[35],zmm0[35],zmm2[36],zmm0[36],zmm2[37],zmm0[37],zmm2[38],zmm0[38],zmm2[39],zmm0[39],zmm2[48],zmm0[48],zmm2[49],zmm0[49],zmm2[50],zmm0[50],zmm2[51],zmm0[51],zmm2[52],zmm0[52],zmm2[53],zmm0[53],zmm2[54],zmm0[54],zmm2[55],zmm0[55]
3215 ; AVX512BWVL-NEXT: vpmullw %zmm2, %zmm0, %zmm0
3216 ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0
32173217 ; AVX512BWVL-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
3218 ; AVX512BWVL-NEXT: vpsrlw $8, %xmm0, %xmm3
3218 ; AVX512BWVL-NEXT: vpsrlw $8, %xmm0, %xmm2
32193219 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
3220 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm3[0],zmm0[0],zmm3[1],zmm0[1],zmm3[2],zmm0[2],zmm3[3],zmm0[3],zmm3[4],zmm0[4],zmm3[5],zmm0[5],zmm3[6],zmm0[6],zmm3[7],zmm0[7],zmm3[16],zmm0[16],zmm3[17],zmm0[17],zmm3[18],zmm0[18],zmm3[19],zmm0[19],zmm3[20],zmm0[20],zmm3[21],zmm0[21],zmm3[22],zmm0[22],zmm3[23],zmm0[23],zmm3[32],zmm0[32],zmm3[33],zmm0[33],zmm3[34],zmm0[34],zmm3[35],zmm0[35],zmm3[36],zmm0[36],zmm3[37],zmm0[37],zmm3[38],zmm0[38],zmm3[39],zmm0[39],zmm3[48],zmm0[48],zmm3[49],zmm0[49],zmm3[50],zmm0[50],zmm3[51],zmm0[51],zmm3[52],zmm0[52],zmm3[53],zmm0[53],zmm3[54],zmm0[54],zmm3[55],zmm0[55]
3221 ; AVX512BWVL-NEXT: vpmullw %zmm3, %zmm0, %zmm0
3222 ; AVX512BWVL-NEXT: vpandq %zmm2, %zmm0, %zmm0
3220 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],zmm0[0],zmm2[1],zmm0[1],zmm2[2],zmm0[2],zmm2[3],zmm0[3],zmm2[4],zmm0[4],zmm2[5],zmm0[5],zmm2[6],zmm0[6],zmm2[7],zmm0[7],zmm2[16],zmm0[16],zmm2[17],zmm0[17],zmm2[18],zmm0[18],zmm2[19],zmm0[19],zmm2[20],zmm0[20],zmm2[21],zmm0[21],zmm2[22],zmm0[22],zmm2[23],zmm0[23],zmm2[32],zmm0[32],zmm2[33],zmm0[33],zmm2[34],zmm0[34],zmm2[35],zmm0[35],zmm2[36],zmm0[36],zmm2[37],zmm0[37],zmm2[38],zmm0[38],zmm2[39],zmm0[39],zmm2[48],zmm0[48],zmm2[49],zmm0[49],zmm2[50],zmm0[50],zmm2[51],zmm0[51],zmm2[52],zmm0[52],zmm2[53],zmm0[53],zmm2[54],zmm0[54],zmm2[55],zmm0[55]
3221 ; AVX512BWVL-NEXT: vpmullw %zmm2, %zmm0, %zmm0
3222 ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0
32233223 ; AVX512BWVL-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
32243224 ; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax
32253225 ; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax
439439 ; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
440440 ; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0
441441 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
442 ; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0
442 ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
443443 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
444444 ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
445445 ; AVX512DQVL-NEXT: vmovq %xmm0, %rax
760760 ; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
761761 ; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0
762762 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
763 ; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0
763 ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
764764 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
765765 ; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
766766 ; AVX512DQVL-NEXT: vmovq %xmm0, %rax
989989 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
990990 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
991991 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
992 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
992 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
993993 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
994 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
994 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
995995 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
996996 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
997997 ; AVX512-NEXT: vmovd %xmm0, %eax
10991099 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
11001100 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
11011101 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
1102 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
1102 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
11031103 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1104 ; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
1104 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
11051105 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
11061106 ; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
11071107 ; AVX512-NEXT: vmovd %xmm0, %eax
13531353 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
13541354 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
13551355 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
1356 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
1356 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
13571357 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1358 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
1358 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
13591359 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1360 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
1360 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
13611361 ; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1
13621362 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
13631363 ; AVX512BW-NEXT: vmovd %xmm0, %eax
13701370 ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
13711371 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
13721372 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
1373 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
1373 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
13741374 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1375 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
1375 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
13761376 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1377 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
1377 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
13781378 ; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm1
13791379 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
13801380 ; AVX512BWVL-NEXT: vmovd %xmm0, %eax
14861486 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
14871487 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
14881488 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
1489 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
1489 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
14901490 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1491 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
1491 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
14921492 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1493 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
1493 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
14941494 ; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1
14951495 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
14961496 ; AVX512BW-NEXT: vmovd %xmm0, %eax
15041504 ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
15051505 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
15061506 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
1507 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
1507 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
15081508 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1509 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
1509 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
15101510 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1511 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
1511 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
15121512 ; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm1
15131513 ; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
15141514 ; AVX512BWVL-NEXT: vmovd %xmm0, %eax
30613061 ; AVX512BW: # %bb.0:
30623062 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
30633063 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
3064 ; AVX512BW-NEXT: vpmullw %zmm2, %zmm3, %zmm3
3065 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3066 ; AVX512BW-NEXT: vpandq %zmm2, %zmm3, %zmm3
3064 ; AVX512BW-NEXT: vpmullw %zmm2, %zmm3, %zmm2
3065 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3066 ; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2
30673067 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
30683068 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
30693069 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
3070 ; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
3071 ; AVX512BW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
3070 ; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
3071 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
30723072 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
3073 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
3073 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
30743074 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
3075 ; AVX512BW-NEXT: vpmullw %zmm4, %zmm3, %zmm3
3076 ; AVX512BW-NEXT: vpandq %zmm2, %zmm3, %zmm3
3075 ; AVX512BW-NEXT: vpmullw %zmm4, %zmm2, %zmm2
3076 ; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2
30773077 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
30783078 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
30793079 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
3080 ; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
3081 ; AVX512BW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
3080 ; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
3081 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
30823082 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
3083 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
3083 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
30843084 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
3085 ; AVX512BW-NEXT: vpmullw %zmm4, %zmm3, %zmm3
3086 ; AVX512BW-NEXT: vpandq %zmm2, %zmm3, %zmm3
3085 ; AVX512BW-NEXT: vpmullw %zmm4, %zmm2, %zmm2
3086 ; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2
30873087 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
30883088 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
30893089 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
3090 ; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
3091 ; AVX512BW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
3090 ; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
3091 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
30923092 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
30933093 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
30943094 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
30953095 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
3096 ; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
3096 ; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
30973097 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
30983098 ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
3099 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
3099 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
31003100 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
3101 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm3[0],zmm0[0],zmm3[1],zmm0[1],zmm3[2],zmm0[2],zmm3[3],zmm0[3],zmm3[4],zmm0[4],zmm3[5],zmm0[5],zmm3[6],zmm0[6],zmm3[7],zmm0[7],zmm3[16],zmm0[16],zmm3[17],zmm0[17],zmm3[18],zmm0[18],zmm3[19],zmm0[19],zmm3[20],zmm0[20],zmm3[21],zmm0[21],zmm3[22],zmm0[22],zmm3[23],zmm0[23],zmm3[32],zmm0[32],zmm3[33],zmm0[33],zmm3[34],zmm0[34],zmm3[35],zmm0[35],zmm3[36],zmm0[36],zmm3[37],zmm0[37],zmm3[38],zmm0[38],zmm3[39],zmm0[39],zmm3[48],zmm0[48],zmm3[49],zmm0[49],zmm3[50],zmm0[50],zmm3[51],zmm0[51],zmm3[52],zmm0[52],zmm3[53],zmm0[53],zmm3[54],zmm0[54],zmm3[55],zmm0[55]
3102 ; AVX512BW-NEXT: vpmullw %zmm3, %zmm0, %zmm0
3103 ; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
3101 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],zmm0[0],zmm2[1],zmm0[1],zmm2[2],zmm0[2],zmm2[3],zmm0[3],zmm2[4],zmm0[4],zmm2[5],zmm0[5],zmm2[6],zmm0[6],zmm2[7],zmm0[7],zmm2[16],zmm0[16],zmm2[17],zmm0[17],zmm2[18],zmm0[18],zmm2[19],zmm0[19],zmm2[20],zmm0[20],zmm2[21],zmm0[21],zmm2[22],zmm0[22],zmm2[23],zmm0[23],zmm2[32],zmm0[32],zmm2[33],zmm0[33],zmm2[34],zmm0[34],zmm2[35],zmm0[35],zmm2[36],zmm0[36],zmm2[37],zmm0[37],zmm2[38],zmm0[38],zmm2[39],zmm0[39],zmm2[48],zmm0[48],zmm2[49],zmm0[49],zmm2[50],zmm0[50],zmm2[51],zmm0[51],zmm2[52],zmm0[52],zmm2[53],zmm0[53],zmm2[54],zmm0[54],zmm2[55],zmm0[55]
3102 ; AVX512BW-NEXT: vpmullw %zmm2, %zmm0, %zmm0
3103 ; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
31043104 ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
3105 ; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm3
3105 ; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm2
31063106 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
3107 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm3[0],zmm0[0],zmm3[1],zmm0[1],zmm3[2],zmm0[2],zmm3[3],zmm0[3],zmm3[4],zmm0[4],zmm3[5],zmm0[5],zmm3[6],zmm0[6],zmm3[7],zmm0[7],zmm3[16],zmm0[16],zmm3[17],zmm0[17],zmm3[18],zmm0[18],zmm3[19],zmm0[19],zmm3[20],zmm0[20],zmm3[21],zmm0[21],zmm3[22],zmm0[22],zmm3[23],zmm0[23],zmm3[32],zmm0[32],zmm3[33],zmm0[33],zmm3[34],zmm0[34],zmm3[35],zmm0[35],zmm3[36],zmm0[36],zmm3[37],zmm0[37],zmm3[38],zmm0[38],zmm3[39],zmm0[39],zmm3[48],zmm0[48],zmm3[49],zmm0[49],zmm3[50],zmm0[50],zmm3[51],zmm0[51],zmm3[52],zmm0[52],zmm3[53],zmm0[53],zmm3[54],zmm0[54],zmm3[55],zmm0[55]
3108 ; AVX512BW-NEXT: vpmullw %zmm3, %zmm0, %zmm0
3109 ; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
3107 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],zmm0[0],zmm2[1],zmm0[1],zmm2[2],zmm0[2],zmm2[3],zmm0[3],zmm2[4],zmm0[4],zmm2[5],zmm0[5],zmm2[6],zmm0[6],zmm2[7],zmm0[7],zmm2[16],zmm0[16],zmm2[17],zmm0[17],zmm2[18],zmm0[18],zmm2[19],zmm0[19],zmm2[20],zmm0[20],zmm2[21],zmm0[21],zmm2[22],zmm0[22],zmm2[23],zmm0[23],zmm2[32],zmm0[32],zmm2[33],zmm0[33],zmm2[34],zmm0[34],zmm2[35],zmm0[35],zmm2[36],zmm0[36],zmm2[37],zmm0[37],zmm2[38],zmm0[38],zmm2[39],zmm0[39],zmm2[48],zmm0[48],zmm2[49],zmm0[49],zmm2[50],zmm0[50],zmm2[51],zmm0[51],zmm2[52],zmm0[52],zmm2[53],zmm0[53],zmm2[54],zmm0[54],zmm2[55],zmm0[55]
3108 ; AVX512BW-NEXT: vpmullw %zmm2, %zmm0, %zmm0
3109 ; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
31103110 ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
3111 ; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm3
3111 ; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm2
31123112 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
3113 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm3[0],zmm0[0],zmm3[1],zmm0[1],zmm3[2],zmm0[2],zmm3[3],zmm0[3],zmm3[4],zmm0[4],zmm3[5],zmm0[5],zmm3[6],zmm0[6],zmm3[7],zmm0[7],zmm3[16],zmm0[16],zmm3[17],zmm0[17],zmm3[18],zmm0[18],zmm3[19],zmm0[19],zmm3[20],zmm0[20],zmm3[21],zmm0[21],zmm3[22],zmm0[22],zmm3[23],zmm0[23],zmm3[32],zmm0[32],zmm3[33],zmm0[33],zmm3[34],zmm0[34],zmm3[35],zmm0[35],zmm3[36],zmm0[36],zmm3[37],zmm0[37],zmm3[38],zmm0[38],zmm3[39],zmm0[39],zmm3[48],zmm0[48],zmm3[49],zmm0[49],zmm3[50],zmm0[50],zmm3[51],zmm0[51],zmm3[52],zmm0[52],zmm3[53],zmm0[53],zmm3[54],zmm0[54],zmm3[55],zmm0[55]
3114 ; AVX512BW-NEXT: vpmullw %zmm3, %zmm0, %zmm0
3115 ; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
3113 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],zmm0[0],zmm2[1],zmm0[1],zmm2[2],zmm0[2],zmm2[3],zmm0[3],zmm2[4],zmm0[4],zmm2[5],zmm0[5],zmm2[6],zmm0[6],zmm2[7],zmm0[7],zmm2[16],zmm0[16],zmm2[17],zmm0[17],zmm2[18],zmm0[18],zmm2[19],zmm0[19],zmm2[20],zmm0[20],zmm2[21],zmm0[21],zmm2[22],zmm0[22],zmm2[23],zmm0[23],zmm2[32],zmm0[32],zmm2[33],zmm0[33],zmm2[34],zmm0[34],zmm2[35],zmm0[35],zmm2[36],zmm0[36],zmm2[37],zmm0[37],zmm2[38],zmm0[38],zmm2[39],zmm0[39],zmm2[48],zmm0[48],zmm2[49],zmm0[49],zmm2[50],zmm0[50],zmm2[51],zmm0[51],zmm2[52],zmm0[52],zmm2[53],zmm0[53],zmm2[54],zmm0[54],zmm2[55],zmm0[55]
3114 ; AVX512BW-NEXT: vpmullw %zmm2, %zmm0, %zmm0
3115 ; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
31163116 ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
31173117 ; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
31183118 ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
31233123 ; AVX512BWVL: # %bb.0:
31243124 ; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
31253125 ; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
3126 ; AVX512BWVL-NEXT: vpmullw %zmm2, %zmm3, %zmm3
3127 ; AVX512BWVL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3128 ; AVX512BWVL-NEXT: vpandq %zmm2, %zmm3, %zmm3
3126 ; AVX512BWVL-NEXT: vpmullw %zmm2, %zmm3, %zmm2
3127 ; AVX512BWVL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3128 ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm2, %zmm2
31293129 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
31303130 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
31313131 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
3132 ; AVX512BWVL-NEXT: vpandq %zmm2, %zmm0, %zmm0
3133 ; AVX512BWVL-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
3132 ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0
3133 ; AVX512BWVL-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
31343134 ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
3135 ; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
3135 ; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
31363136 ; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
3137 ; AVX512BWVL-NEXT: vpmullw %zmm4, %zmm3, %zmm3
3138 ; AVX512BWVL-NEXT: vpandq %zmm2, %zmm3, %zmm3
3137 ; AVX512BWVL-NEXT: vpmullw %zmm4, %zmm2, %zmm2
3138 ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm2, %zmm2
31393139 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
31403140 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
31413141 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
3142 ; AVX512BWVL-NEXT: vpandq %zmm2, %zmm0, %zmm0
3143 ; AVX512BWVL-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
3142 ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0
3143 ; AVX512BWVL-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
31443144 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
3145 ; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
3145 ; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
31463146 ; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
3147 ; AVX512BWVL-NEXT: vpmullw %zmm4, %zmm3, %zmm3
3148 ; AVX512BWVL-NEXT: vpandq %zmm2, %zmm3, %zmm3
3147 ; AVX512BWVL-NEXT: vpmullw %zmm4, %zmm2, %zmm2
3148 ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm2, %zmm2
31493149 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
31503150 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
31513151 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
3152 ; AVX512BWVL-NEXT: vpandq %zmm2, %zmm0, %zmm0
3153 ; AVX512BWVL-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
3152 ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0
3153 ; AVX512BWVL-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
31543154 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
31553155 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
31563156 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
31573157 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
3158 ; AVX512BWVL-NEXT: vpandq %zmm2, %zmm0, %zmm0
3158 ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0
31593159 ; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
31603160 ; AVX512BWVL-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
3161 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
3161 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
31623162 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
3163 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm3[0],zmm0[0],zmm3[1],zmm0[1],zmm3[2],zmm0[2],zmm3[3],zmm0[3],zmm3[4],zmm0[4],zmm3[5],zmm0[5],zmm3[6],zmm0[6],zmm3[7],zmm0[7],zmm3[16],zmm0[16],zmm3[17],zmm0[17],zmm3[18],zmm0[18],zmm3[19],zmm0[19],zmm3[20],zmm0[20],zmm3[21],zmm0[21],zmm3[22],zmm0[22],zmm3[23],zmm0[23],zmm3[32],zmm0[32],zmm3[33],zmm0[33],zmm3[34],zmm0[34],zmm3[35],zmm0[35],zmm3[36],zmm0[36],zmm3[37],zmm0[37],zmm3[38],zmm0[38],zmm3[39],zmm0[39],zmm3[48],zmm0[48],zmm3[49],zmm0[49],zmm3[50],zmm0[50],zmm3[51],zmm0[51],zmm3[52],zmm0[52],zmm3[53],zmm0[53],zmm3[54],zmm0[54],zmm3[55],zmm0[55]
3164 ; AVX512BWVL-NEXT: vpmullw %zmm3, %zmm0, %zmm0
3165 ; AVX512BWVL-NEXT: vpandq %zmm2, %zmm0, %zmm0
3163 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],zmm0[0],zmm2[1],zmm0[1],zmm2[2],zmm0[2],zmm2[3],zmm0[3],zmm2[4],zmm0[4],zmm2[5],zmm0[5],zmm2[6],zmm0[6],zmm2[7],zmm0[7],zmm2[16],zmm0[16],zmm2[17],zmm0[17],zmm2[18],zmm0[18],zmm2[19],zmm0[19],zmm2[20],zmm0[20],zmm2[21],zmm0[21],zmm2[22],zmm0[22],zmm2[23],zmm0[23],zmm2[32],zmm0[32],zmm2[33],zmm0[33],zmm2[34],zmm0[34],zmm2[35],zmm0[35],zmm2[36],zmm0[36],zmm2[37],zmm0[37],zmm2[38],zmm0[38],zmm2[39],zmm0[39],zmm2[48],zmm0[48],zmm2[49],zmm0[49],zmm2[50],zmm0[50],zmm2[51],zmm0[51],zmm2[52],zmm0[52],zmm2[53],zmm0[53],zmm2[54],zmm0[54],zmm2[55],zmm0[55]
3164 ; AVX512BWVL-NEXT: vpmullw %zmm2, %zmm0, %zmm0
3165 ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0
31663166 ; AVX512BWVL-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
3167 ; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm3
3167 ; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm2
31683168 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
3169 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm3[0],zmm0[0],zmm3[1],zmm0[1],zmm3[2],zmm0[2],zmm3[3],zmm0[3],zmm3[4],zmm0[4],zmm3[5],zmm0[5],zmm3[6],zmm0[6],zmm3[7],zmm0[7],zmm3[16],zmm0[16],zmm3[17],zmm0[17],zmm3[18],zmm0[18],zmm3[19],zmm0[19],zmm3[20],zmm0[20],zmm3[21],zmm0[21],zmm3[22],zmm0[22],zmm3[23],zmm0[23],zmm3[32],zmm0[32],zmm3[33],zmm0[33],zmm3[34],zmm0[34],zmm3[35],zmm0[35],zmm3[36],zmm0[36],zmm3[37],zmm0[37],zmm3[38],zmm0[38],zmm3[39],zmm0[39],zmm3[48],zmm0[48],zmm3[49],zmm0[49],zmm3[50],zmm0[50],zmm3[51],zmm0[51],zmm3[52],zmm0[52],zmm3[53],zmm0[53],zmm3[54],zmm0[54],zmm3[55],zmm0[55]
3170 ; AVX512BWVL-NEXT: vpmullw %zmm3, %zmm0, %zmm0
3171 ; AVX512BWVL-NEXT: vpandq %zmm2, %zmm0, %zmm0
3169 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],zmm0[0],zmm2[1],zmm0[1],zmm2[2],zmm0[2],zmm2[3],zmm0[3],zmm2[4],zmm0[4],zmm2[5],zmm0[5],zmm2[6],zmm0[6],zmm2[7],zmm0[7],zmm2[16],zmm0[16],zmm2[17],zmm0[17],zmm2[18],zmm0[18],zmm2[19],zmm0[19],zmm2[20],zmm0[20],zmm2[21],zmm0[21],zmm2[22],zmm0[22],zmm2[23],zmm0[23],zmm2[32],zmm0[32],zmm2[33],zmm0[33],zmm2[34],zmm0[34],zmm2[35],zmm0[35],zmm2[36],zmm0[36],zmm2[37],zmm0[37],zmm2[38],zmm0[38],zmm2[39],zmm0[39],zmm2[48],zmm0[48],zmm2[49],zmm0[49],zmm2[50],zmm0[50],zmm2[51],zmm0[51],zmm2[52],zmm0[52],zmm2[53],zmm0[53],zmm2[54],zmm0[54],zmm2[55],zmm0[55]
3170 ; AVX512BWVL-NEXT: vpmullw %zmm2, %zmm0, %zmm0
3171 ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0
31723172 ; AVX512BWVL-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
3173 ; AVX512BWVL-NEXT: vpsrlw $8, %xmm0, %xmm3
3173 ; AVX512BWVL-NEXT: vpsrlw $8, %xmm0, %xmm2
31743174 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
3175 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm3[0],zmm0[0],zmm3[1],zmm0[1],zmm3[2],zmm0[2],zmm3[3],zmm0[3],zmm3[4],zmm0[4],zmm3[5],zmm0[5],zmm3[6],zmm0[6],zmm3[7],zmm0[7],zmm3[16],zmm0[16],zmm3[17],zmm0[17],zmm3[18],zmm0[18],zmm3[19],zmm0[19],zmm3[20],zmm0[20],zmm3[21],zmm0[21],zmm3[22],zmm0[22],zmm3[23],zmm0[23],zmm3[32],zmm0[32],zmm3[33],zmm0[33],zmm3[34],zmm0[34],zmm3[35],zmm0[35],zmm3[36],zmm0[36],zmm3[37],zmm0[37],zmm3[38],zmm0[38],zmm3[39],zmm0[39],zmm3[48],zmm0[48],zmm3[49],zmm0[49],zmm3[50],zmm0[50],zmm3[51],zmm0[51],zmm3[52],zmm0[52],zmm3[53],zmm0[53],zmm3[54],zmm0[54],zmm3[55],zmm0[55]
3176 ; AVX512BWVL-NEXT: vpmullw %zmm3, %zmm0, %zmm0
3177 ; AVX512BWVL-NEXT: vpandq %zmm2, %zmm0, %zmm0
3175 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm2 = zmm2[0],zmm0[0],zmm2[1],zmm0[1],zmm2[2],zmm0[2],zmm2[3],zmm0[3],zmm2[4],zmm0[4],zmm2[5],zmm0[5],zmm2[6],zmm0[6],zmm2[7],zmm0[7],zmm2[16],zmm0[16],zmm2[17],zmm0[17],zmm2[18],zmm0[18],zmm2[19],zmm0[19],zmm2[20],zmm0[20],zmm2[21],zmm0[21],zmm2[22],zmm0[22],zmm2[23],zmm0[23],zmm2[32],zmm0[32],zmm2[33],zmm0[33],zmm2[34],zmm0[34],zmm2[35],zmm0[35],zmm2[36],zmm0[36],zmm2[37],zmm0[37],zmm2[38],zmm0[38],zmm2[39],zmm0[39],zmm2[48],zmm0[48],zmm2[49],zmm0[49],zmm2[50],zmm0[50],zmm2[51],zmm0[51],zmm2[52],zmm0[52],zmm2[53],zmm0[53],zmm2[54],zmm0[54],zmm2[55],zmm0[55]
3176 ; AVX512BWVL-NEXT: vpmullw %zmm2, %zmm0, %zmm0
3177 ; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0
31783178 ; AVX512BWVL-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
31793179 ; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax
31803180 ; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax
114114 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
115115 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
116116 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
117 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
117 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
118118 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
119119 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
120120 ; AVX512-NEXT: vmovq %xmm0, %rax
171171 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
172172 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
173173 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
174 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
174 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
175175 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
176176 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
177177 ; AVX512-NEXT: vmovq %xmm0, %rax
335335 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
336336 ; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
337337 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
338 ; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
339 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
340 ; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
338 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
339 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
340 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
341341 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
342342 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
343343 ; AVX512-NEXT: vmovd %xmm0, %eax
400400 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
401401 ; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
402402 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
403 ; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
404 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
405 ; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
403 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
404 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
405 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
406406 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
407407 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
408408 ; AVX512-NEXT: vmovd %xmm0, %eax
638638 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
639639 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
640640 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
641 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
642 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
643 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
644 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
645 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
641 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
642 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
643 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
644 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
645 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
646646 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
647647 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
648648 ; AVX512-NEXT: vmovd %xmm0, %eax
716716 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
717717 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
718718 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
719 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
720 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
721 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
722 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
723 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
719 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
720 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
721 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
722 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
723 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
724724 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
725725 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
726726 ; AVX512-NEXT: vmovd %xmm0, %eax
11101110 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
11111111 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
11121112 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
1113 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
1114 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1115 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
1116 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1117 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
1113 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
1114 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1115 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
1116 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1117 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
11181118 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
1119 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
1119 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
11201120 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
11211121 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
11221122 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
12201220 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
12211221 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
12221222 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
1223 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
1224 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1225 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
1226 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1227 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
1223 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
1224 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1225 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
1226 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1227 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
12281228 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
1229 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
1229 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
12301230 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
12311231 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
12321232 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
114114 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
115115 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
116116 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
117 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
117 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
118118 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
119119 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
120120 ; AVX512-NEXT: vmovq %xmm0, %rax
171171 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
172172 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
173173 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
174 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
174 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
175175 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
176176 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
177177 ; AVX512-NEXT: vmovq %xmm0, %rax
335335 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
336336 ; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
337337 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
338 ; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
339 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
340 ; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
338 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
339 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
340 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
341341 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
342342 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
343343 ; AVX512-NEXT: vmovd %xmm0, %eax
400400 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
401401 ; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
402402 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
403 ; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
404 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
405 ; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
403 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
404 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
405 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
406406 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
407407 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
408408 ; AVX512-NEXT: vmovd %xmm0, %eax
636636 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
637637 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
638638 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
639 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
640 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
641 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
642 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
643 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
639 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
640 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
641 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
642 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
643 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
644644 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
645645 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
646646 ; AVX512-NEXT: vmovd %xmm0, %eax
714714 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
715715 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
716716 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
717 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
718 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
719 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
720 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
721 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
717 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
718 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
719 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
720 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
721 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
722722 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
723723 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
724724 ; AVX512-NEXT: vmovd %xmm0, %eax
11001100 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
11011101 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
11021102 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
1103 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
1104 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1105 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
1106 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1107 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
1103 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
1104 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1105 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
1106 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1107 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
11081108 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
1109 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
1109 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
11101110 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
11111111 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
11121112 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
12101210 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
12111211 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
12121212 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
1213 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
1214 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1215 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
1216 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1217 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
1213 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
1214 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1215 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
1216 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1217 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
12181218 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
1219 ; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
1219 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
12201220 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
12211221 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
12221222 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
114114 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
115115 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
116116 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
117 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
117 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
118118 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
119119 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
120120 ; AVX512-NEXT: vmovq %xmm0, %rax
171171 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
172172 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
173173 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
174 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
174 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
175175 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
176176 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
177177 ; AVX512-NEXT: vmovq %xmm0, %rax
335335 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
336336 ; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0
337337 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
338 ; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0
339 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
340 ; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0
338 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
339 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
340 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
341341 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
342342 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
343343 ; AVX512-NEXT: vmovd %xmm0, %eax
400400 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
401401 ; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0
402402 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
403 ; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0
404 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
405 ; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0
403 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
404 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
405 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
406406 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
407407 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
408408 ; AVX512-NEXT: vmovd %xmm0, %eax
638638 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
639639 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
640640 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
641 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
642 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
643 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
644 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
645 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
641 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
642 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
643 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
644 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
645 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
646646 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
647647 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
648648 ; AVX512-NEXT: vmovd %xmm0, %eax
716716 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
717717 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
718718 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
719 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
720 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
721 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
722 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
723 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
719 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
720 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
721 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
722 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
723 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
724724 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
725725 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
726726 ; AVX512-NEXT: vmovd %xmm0, %eax
11101110 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
11111111 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
11121112 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
1113 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
1114 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1115 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
1116 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1117 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
1113 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
1114 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1115 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
1116 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1117 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
11181118 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
1119 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
1119 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
11201120 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
11211121 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
11221122 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
12201220 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
12211221 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
12221222 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
1223 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
1224 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1225 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
1226 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1227 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
1223 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
1224 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1225 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
1226 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1227 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
12281228 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
1229 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
1229 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
12301230 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
12311231 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
12321232 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
114114 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
115115 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
116116 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
117 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
117 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
118118 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
119119 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
120120 ; AVX512-NEXT: vmovq %xmm0, %rax
171171 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
172172 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
173173 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
174 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
174 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
175175 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
176176 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
177177 ; AVX512-NEXT: vmovq %xmm0, %rax
335335 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
336336 ; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0
337337 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
338 ; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0
339 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
340 ; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0
338 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
339 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
340 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
341341 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
342342 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
343343 ; AVX512-NEXT: vmovd %xmm0, %eax
400400 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
401401 ; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0
402402 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
403 ; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0
404 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
405 ; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0
403 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
404 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
405 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
406406 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
407407 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
408408 ; AVX512-NEXT: vmovd %xmm0, %eax
636636 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
637637 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
638638 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
639 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
640 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
641 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
642 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
643 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
639 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
640 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
641 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
642 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
643 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
644644 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
645645 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
646646 ; AVX512-NEXT: vmovd %xmm0, %eax
714714 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
715715 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
716716 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
717 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
718 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
719 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
720 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
721 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
717 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
718 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
719 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
720 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
721 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
722722 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
723723 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
724724 ; AVX512-NEXT: vmovd %xmm0, %eax
11001100 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
11011101 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
11021102 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
1103 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
1104 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1105 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
1106 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1107 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
1103 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
1104 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1105 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
1106 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1107 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
11081108 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
1109 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
1109 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
11101110 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
11111111 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
11121112 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
12101210 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
12111211 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
12121212 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
1213 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
1214 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1215 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
1216 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1217 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
1213 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
1214 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
1215 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
1216 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1217 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
12181218 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
1219 ; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
1219 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
12201220 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
12211221 ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
12221222 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
337337 define <8 x float> @test_v16f32_0_1_2_3_4_6_7_10 (<16 x float> %v) {
338338 ; ALL-LABEL: test_v16f32_0_1_2_3_4_6_7_10:
339339 ; ALL: # %bb.0:
340 ; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
340 ; ALL-NEXT: vextractf32x4 $2, %zmm0, %xmm1
341341 ; ALL-NEXT: vmovsldup {{.*#+}} xmm1 = xmm1[0,0,2,2]
342342 ; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
343343 ; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,7,u]