llvm.org GIT mirror, llvm / af156f2

[AVX-512] Remove patterns that select vmovdqu8/16 for unmasked loads. Prefer vmovdqa64/vmovdqu64 instead.

These were taking priority over the aligned load instructions since there is no vmovdqa8/16. I don't think there is really a difference between aligned and unaligned on newer CPUs, so I don't think it matters which instructions we use. But with this change we reduce the size of the isel table a little, and we allow the alignment information to pass through to the EVEX->VEX pass and produce the same output as AVX/AVX2 in some cases. I also generally dislike patterns rooted in a bitcast, which these were.

Differential Revision: https://reviews.llvm.org/D35977

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@309589 91177308-0d34-0410-b5e6-96231b3b80d8

Author: Craig Topper
38 changed files with 326 additions and 460 deletions.
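In effect, an unmasked byte/word vector load no longer matches a vmovdqu8/16 pattern at isel time; it falls through to the qword/FP move instructions, which carry the alignment distinction, and for 128/256-bit vectors those can then be compressed to VEX, matching AVX/AVX2 output. A minimal sketch of the observable change, assuming AVX512BW and an aligned pointer (hypothetical function, not one of the tests in this diff):

; llc -mattr=+avx512bw  (hypothetical example, not part of this commit)
define <64 x i8> @load_v64i8_aligned(<64 x i8>* %p) {
  ; before this commit: vmovdqu8  (%rdi), %zmm0
  ; after this commit:  vmovdqa64 (%rdi), %zmm0
  %v = load <64 x i8>, <64 x i8>* %p, align 64
  ret <64 x i8> %v
}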
32483248
32493249 multiclass avx512_load<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
32503250 PatFrag ld_frag, PatFrag mload,
3251 bit NoRMPattern = 0,
32513252 SDPatternOperator SelectOprr = vselect> {
32523253 let hasSideEffects = 0 in {
32533254 def rr : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src),
32623263 _.ImmAllZerosV)))], _.ExeDomain>,
32633264 EVEX, EVEX_KZ;
32643265
3265 let canFoldAsLoad = 1, isReMaterializable = 1,
3266 let mayLoad = 1, canFoldAsLoad = 1, isReMaterializable = 1,
32663267 SchedRW = [WriteLoad] in
32673268 def rm : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.MemOp:$src),
32683269 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
3269 [(set _.RC:$dst, (_.VT (bitconvert (ld_frag addr:$src))))],
3270 !if(NoRMPattern, [],
3271 [(set _.RC:$dst,
3272 (_.VT (bitconvert (ld_frag addr:$src))))]),
32703273 _.ExeDomain>, EVEX;
32713274
32723275 let Constraints = "$src0 = $dst", isConvertibleToThreeAddress = 1 in {
33263329 multiclass avx512_load_vl<bits<8> opc, string OpcodeStr,
33273330 AVX512VLVectorVTInfo _,
33283331 Predicate prd,
3332 bit NoRMPattern = 0,
33293333 SDPatternOperator SelectOprr = vselect> {
33303334 let Predicates = [prd] in
33313335 defm Z : avx512_load<opc, OpcodeStr, _.info512, _.info512.LdFrag,
3332 masked_load_unaligned, SelectOprr>, EVEX_V512;
3336 masked_load_unaligned, NoRMPattern,
3337 SelectOprr>, EVEX_V512;
33333338
33343339 let Predicates = [prd, HasVLX] in {
33353340 defm Z256 : avx512_load<opc, OpcodeStr, _.info256, _.info256.LdFrag,
3336 masked_load_unaligned, SelectOprr>, EVEX_V256;
3341 masked_load_unaligned, NoRMPattern,
3342 SelectOprr>, EVEX_V256;
33373343 defm Z128 : avx512_load<opc, OpcodeStr, _.info128, _.info128.LdFrag,
3338 masked_load_unaligned, SelectOprr>, EVEX_V128;
3344 masked_load_unaligned, NoRMPattern,
3345 SelectOprr>, EVEX_V128;
33393346 }
33403347 }
33413348
34153422 PD, VEX_W, EVEX_CD8<64, CD8VF>;
34163423
34173424 defm VMOVUPS : avx512_load_vl<0x10, "vmovups", avx512vl_f32_info, HasAVX512,
3418 null_frag>,
3425 0, null_frag>,
34193426 avx512_store_vl<0x11, "vmovups", avx512vl_f32_info, HasAVX512,
34203427 "VMOVUPS">,
34213428 PS, EVEX_CD8<32, CD8VF>;
34223429
34233430 defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512,
3424 null_frag>,
3431 0, null_frag>,
34253432 avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512,
34263433 "VMOVUPD">,
34273434 PD, VEX_W, EVEX_CD8<64, CD8VF>;
34383445 HasAVX512, "VMOVDQA64">,
34393446 PD, VEX_W, EVEX_CD8<64, CD8VF>;
34403447
3441 defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI>,
3448 defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI, 1>,
34423449 avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info,
34433450 HasBWI, "VMOVDQU8">,
34443451 XD, EVEX_CD8<8, CD8VF>;
34453452
3446 defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI>,
3453 defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI, 1>,
34473454 avx512_store_vl<0x7F, "vmovdqu16", avx512vl_i16_info,
34483455 HasBWI, "VMOVDQU16">,
34493456 XD, VEX_W, EVEX_CD8<16, CD8VF>;
34503457
34513458 defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info, HasAVX512,
3452 null_frag>,
3459 0, null_frag>,
34533460 avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info,
34543461 HasAVX512, "VMOVDQU32">,
34553462 XS, EVEX_CD8<32, CD8VF>;
34563463
34573464 defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512,
3458 null_frag>,
3465 0, null_frag>,
34593466 avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info,
34603467 HasAVX512, "VMOVDQU64">,
34613468 XS, VEX_W, EVEX_CD8<64, CD8VF>;
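Only VMOVDQU8 and VMOVDQU16 pass NoRMPattern = 1 here, so they lose just the unmasked load pattern; their masked forms are untouched (byte/word masking still requires them), while VMOVUPS/VMOVUPD/VMOVDQU32/VMOVDQU64 keep their patterns via the explicit 0. For an unaligned byte/word load the selector now picks vmovdqu64 or vmovups instead, as in this hypothetical sketch mirroring the avx512bw-mov tests below:

; llc -mattr=+avx512bw  (hypothetical example, not part of this commit)
define <32 x i16> @load_v32i16_unaligned(<32 x i16>* %p) {
  ; before this commit: vmovdqu16 (%rdi), %zmm0
  ; after this commit:  vmovups   (%rdi), %zmm0
  %v = load <32 x i16>, <32 x i16>* %p, align 1
  ret <32 x i16> %v
}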
709709 ;
710710 ; AVX512BW-LABEL: avg_v64i8:
711711 ; AVX512BW: # BB#0:
712 ; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0
712 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
713713 ; AVX512BW-NEXT: vpavgb (%rdi), %zmm0, %zmm0
714714 ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)
715715 ; AVX512BW-NEXT: vzeroupper
10981098 ;
10991099 ; AVX512BW-LABEL: avg_v32i16:
11001100 ; AVX512BW: # BB#0:
1101 ; AVX512BW-NEXT: vmovdqu16 (%rsi), %zmm0
1101 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
11021102 ; AVX512BW-NEXT: vpavgw (%rdi), %zmm0, %zmm0
11031103 ; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)
11041104 ; AVX512BW-NEXT: vzeroupper
17311731 ;
17321732 ; AVX512BW-LABEL: avg_v64i8_2:
17331733 ; AVX512BW: # BB#0:
1734 ; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0
1734 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
17351735 ; AVX512BW-NEXT: vpavgb %zmm0, %zmm0, %zmm0
17361736 ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)
17371737 ; AVX512BW-NEXT: vzeroupper
21212121 ;
21222122 ; AVX512BW-LABEL: avg_v32i16_2:
21232123 ; AVX512BW: # BB#0:
2124 ; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
2124 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
21252125 ; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0
21262126 ; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)
21272127 ; AVX512BW-NEXT: vzeroupper
26462646 ;
26472647 ; AVX512BW-LABEL: avg_v64i8_const:
26482648 ; AVX512BW: # BB#0:
2649 ; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0
2649 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
26502650 ; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %zmm0, %zmm0
26512651 ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)
26522652 ; AVX512BW-NEXT: vzeroupper
29542954 ;
29552955 ; AVX512BW-LABEL: avg_v32i16_const:
29562956 ; AVX512BW: # BB#0:
2957 ; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
2957 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
29582958 ; AVX512BW-NEXT: vpavgw {{.*}}(%rip), %zmm0, %zmm0
29592959 ; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)
29602960 ; AVX512BW-NEXT: vzeroupper
762762 ; AVX512VL-LABEL: test_x86_avx2_pmadd_ub_sw_load_op0:
763763 ; AVX512VL: ## BB#0:
764764 ; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
765 ; AVX512VL-NEXT: vmovdqu (%eax), %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x08]
765 ; AVX512VL-NEXT: vmovdqa (%eax), %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x08]
766766 ; AVX512VL-NEXT: vpmaddubsw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x04,0xc0]
767767 ; AVX512VL-NEXT: retl ## encoding: [0xc3]
768768 %a0 = load <32 x i8>, <32 x i8>* %ptr
12941294 ; SKX-NEXT: vpmovm2w %k0, %zmm0
12951295 ; SKX-NEXT: kmovd %eax, %k0
12961296 ; SKX-NEXT: vpmovm2w %k0, %zmm1
1297 ; SKX-NEXT: vmovdqu16 {{.*#+}} zmm2 = [0,1,2,3,32,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
1297 ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,32,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
12981298 ; SKX-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
12991299 ; SKX-NEXT: vpmovw2m %zmm2, %k0
13001300 ; SKX-NEXT: kmovd %k0, %eax
5252 define <32 x i16>@test_int_x86_avx512_mask_loadu_w_512(i8* %ptr, i8* %ptr2, <32 x i16> %x1, i32 %mask) {
5353 ; AVX512BW-LABEL: test_int_x86_avx512_mask_loadu_w_512:
5454 ; AVX512BW: ## BB#0:
55 ; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
55 ; AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0
5656 ; AVX512BW-NEXT: kmovd %edx, %k1
5757 ; AVX512BW-NEXT: vmovdqu16 (%rsi), %zmm0 {%k1}
5858 ; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm1 {%k1} {z}
6363 ; AVX512F-32: # BB#0:
6464 ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
6565 ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
66 ; AVX512F-32-NEXT: vmovdqu16 (%ecx), %zmm0
66 ; AVX512F-32-NEXT: vmovdqu64 (%ecx), %zmm0
6767 ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
6868 ; AVX512F-32-NEXT: vmovdqu16 (%eax), %zmm0 {%k1}
6969 ; AVX512F-32-NEXT: vmovdqu16 (%ecx), %zmm1 {%k1} {z}
8181 define <64 x i8>@test_int_x86_avx512_mask_loadu_b_512(i8* %ptr, i8* %ptr2, <64 x i8> %x1, i64 %mask) {
8282 ; AVX512BW-LABEL: test_int_x86_avx512_mask_loadu_b_512:
8383 ; AVX512BW: ## BB#0:
84 ; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0
84 ; AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0
8585 ; AVX512BW-NEXT: kmovq %rdx, %k1
8686 ; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0 {%k1}
8787 ; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm1 {%k1} {z}
9292 ; AVX512F-32: # BB#0:
9393 ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
9494 ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
95 ; AVX512F-32-NEXT: vmovdqu8 (%ecx), %zmm0
95 ; AVX512F-32-NEXT: vmovdqu64 (%ecx), %zmm0
9696 ; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
9797 ; AVX512F-32-NEXT: vmovdqu8 (%eax), %zmm0 {%k1}
9898 ; AVX512F-32-NEXT: vmovdqu8 (%ecx), %zmm1 {%k1} {z}
17091709 define <32 x i16>@test_int_x86_avx512_mask_psrav32_hi_const(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
17101710 ; AVX512BW-LABEL: test_int_x86_avx512_mask_psrav32_hi_const:
17111711 ; AVX512BW: ## BB#0:
1712 ; AVX512BW-NEXT: vmovdqu16 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51]
1712 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51]
17131713 ; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %zmm0, %zmm0
17141714 ; AVX512BW-NEXT: retq
17151715 ;
17161716 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrav32_hi_const:
17171717 ; AVX512F-32: # BB#0:
1718 ; AVX512F-32-NEXT: vmovdqu16 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51]
1718 ; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51]
17191719 ; AVX512F-32-NEXT: vpsravw {{\.LCPI.*}}, %zmm0, %zmm0
17201720 ; AVX512F-32-NEXT: retl
17211721 %res = call <32 x i16> @llvm.x86.avx512.mask.psrav32.hi(<32 x i16> ,
33 define <64 x i8> @test1(i8 * %addr) {
44 ; CHECK-LABEL: test1:
55 ; CHECK: ## BB#0:
6 ; CHECK-NEXT: vmovdqu8 (%rdi), %zmm0
6 ; CHECK-NEXT: vmovups (%rdi), %zmm0
77 ; CHECK-NEXT: retq
88 %vaddr = bitcast i8* %addr to <64 x i8>*
99 %res = load <64 x i8>, <64 x i8>* %vaddr, align 1
5151 define <32 x i16> @test5(i8 * %addr) {
5252 ; CHECK-LABEL: test5:
5353 ; CHECK: ## BB#0:
54 ; CHECK-NEXT: vmovdqu16 (%rdi), %zmm0
54 ; CHECK-NEXT: vmovups (%rdi), %zmm0
5555 ; CHECK-NEXT: retq
5656 %vaddr = bitcast i8* %addr to <32 x i16>*
5757 %res = load <32 x i16>, <32 x i16>* %vaddr, align 1
33 define <32 x i8> @test_256_1(i8 * %addr) {
44 ; CHECK-LABEL: test_256_1:
55 ; CHECK: ## BB#0:
6 ; CHECK-NEXT: vmovdqu (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x07]
6 ; CHECK-NEXT: vmovups (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x10,0x07]
77 ; CHECK-NEXT: retq ## encoding: [0xc3]
88 %vaddr = bitcast i8* %addr to <32 x i8>*
99 %res = load <32 x i8>, <32 x i8>* %vaddr, align 1
5151 define <16 x i16> @test_256_5(i8 * %addr) {
5252 ; CHECK-LABEL: test_256_5:
5353 ; CHECK: ## BB#0:
54 ; CHECK-NEXT: vmovdqu (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x07]
54 ; CHECK-NEXT: vmovups (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x10,0x07]
5555 ; CHECK-NEXT: retq ## encoding: [0xc3]
5656 %vaddr = bitcast i8* %addr to <16 x i16>*
5757 %res = load <16 x i16>, <16 x i16>* %vaddr, align 1
9999 define <16 x i8> @test_128_1(i8 * %addr) {
100100 ; CHECK-LABEL: test_128_1:
101101 ; CHECK: ## BB#0:
102 ; CHECK-NEXT: vmovdqu (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x07]
102 ; CHECK-NEXT: vmovups (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07]
103103 ; CHECK-NEXT: retq ## encoding: [0xc3]
104104 %vaddr = bitcast i8* %addr to <16 x i8>*
105105 %res = load <16 x i8>, <16 x i8>* %vaddr, align 1
147147 define <8 x i16> @test_128_5(i8 * %addr) {
148148 ; CHECK-LABEL: test_128_5:
149149 ; CHECK: ## BB#0:
150 ; CHECK-NEXT: vmovdqu (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x07]
150 ; CHECK-NEXT: vmovups (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07]
151151 ; CHECK-NEXT: retq ## encoding: [0xc3]
152152 %vaddr = bitcast i8* %addr to <8 x i16>*
153153 %res = load <8 x i16>, <8 x i16>* %vaddr, align 1
17491749 ;
17501750 ; AVX512BW-LABEL: test_unaligned_v32i16:
17511751 ; AVX512BW: # BB#0:
1752 ; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
1752 ; AVX512BW-NEXT: vmovups (%rdi), %zmm0
17531753 ; AVX512BW-NEXT: retq
17541754 ;
17551755 ; AVX512VL-LABEL: test_unaligned_v32i16:
17841784 ;
17851785 ; AVX512BW-LABEL: test_unaligned_v64i8:
17861786 ; AVX512BW: # BB#0:
1787 ; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0
1787 ; AVX512BW-NEXT: vmovups (%rdi), %zmm0
17881788 ; AVX512BW-NEXT: retq
17891789 ;
17901790 ; AVX512VL-LABEL: test_unaligned_v64i8:
920920 ; AVX512BW-LABEL: mul_v64i8c:
921921 ; AVX512BW: # BB#0: # %entry
922922 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm1
923 ; AVX512BW-NEXT: vmovdqu16 {{.*#+}} zmm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
923 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
924924 ; AVX512BW-NEXT: vpmullw %zmm2, %zmm1, %zmm1
925925 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
926926 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
813813 ; AVX512BW-NEXT: .p2align 4, 0x90
814814 ; AVX512BW-NEXT: .LBB2_1: # %vector.body
815815 ; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1
816 ; AVX512BW-NEXT: vmovdqu8 a+1024(%rax), %zmm2
816 ; AVX512BW-NEXT: vmovdqa64 a+1024(%rax), %zmm2
817817 ; AVX512BW-NEXT: vpsadbw b+1024(%rax), %zmm2, %zmm2
818818 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm2, %zmm1
819819 ; AVX512BW-NEXT: addq $4, %rax
5757 ;
5858 ; AVX512BWVL-LABEL: shuffle_v16i8_to_v8i8:
5959 ; AVX512BWVL: # BB#0:
60 ; AVX512BWVL-NEXT: vmovdqu (%rdi), %xmm0
60 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
6161 ; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
6262 ; AVX512BWVL-NEXT: retq
6363 %vec = load <16 x i8>, <16 x i8>* %L
112112 ;
113113 ; AVX512BWVL-LABEL: trunc_v8i16_to_v8i8:
114114 ; AVX512BWVL: # BB#0:
115 ; AVX512BWVL-NEXT: vmovdqu (%rdi), %xmm0
115 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
116116 ; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
117117 ; AVX512BWVL-NEXT: retq
118118 %vec = load <16 x i8>, <16 x i8>* %L
6060 ;
6161 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v16i8:
6262 ; AVX512BWVL: # BB#0:
63 ; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0
63 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
6464 ; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rsi)
6565 ; AVX512BWVL-NEXT: vzeroupper
6666 ; AVX512BWVL-NEXT: retq
121121 ;
122122 ; AVX512BWVL-LABEL: trunc_v16i16_to_v16i8:
123123 ; AVX512BWVL: # BB#0:
124 ; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0
124 ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
125125 ; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rsi)
126126 ; AVX512BWVL-NEXT: vzeroupper
127127 ; AVX512BWVL-NEXT: retq
3232 ;
3333 ; AVX512BW-LABEL: shuffle_v64i8_to_v32i8:
3434 ; AVX512BW: # BB#0:
35 ; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
35 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
3636 ; AVX512BW-NEXT: vpmovwb %zmm0, (%rsi)
3737 ; AVX512BW-NEXT: vzeroupper
3838 ; AVX512BW-NEXT: retq
3939 ;
4040 ; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8:
4141 ; AVX512BWVL: # BB#0:
42 ; AVX512BWVL-NEXT: vmovdqu16 (%rdi), %zmm0
42 ; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
4343 ; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rsi)
4444 ; AVX512BWVL-NEXT: vzeroupper
4545 ; AVX512BWVL-NEXT: retq
7474 ;
7575 ; AVX512BW-LABEL: trunc_v32i16_to_v32i8:
7676 ; AVX512BW: # BB#0:
77 ; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
77 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
7878 ; AVX512BW-NEXT: vpmovwb %zmm0, (%rsi)
7979 ; AVX512BW-NEXT: vzeroupper
8080 ; AVX512BW-NEXT: retq
8181 ;
8282 ; AVX512BWVL-LABEL: trunc_v32i16_to_v32i8:
8383 ; AVX512BWVL: # BB#0:
84 ; AVX512BWVL-NEXT: vmovdqu16 (%rdi), %zmm0
84 ; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
8585 ; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rsi)
8686 ; AVX512BWVL-NEXT: vzeroupper
8787 ; AVX512BWVL-NEXT: retq
5151 ; SKX: ## BB#0:
5252 ; SKX-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
5353 ; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
54 ; SKX-NEXT: vmovdqu (%eax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x00]
54 ; SKX-NEXT: vmovdqa (%eax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x00]
5555 ; SKX-NEXT: movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
5656 ; SKX-NEXT: movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
5757 ; SKX-NEXT: vpcmpestri $7, (%ecx), %xmm0 ## encoding: [0xc4,0xe3,0x79,0x61,0x01,0x07]
297297 ; SKX: ## BB#0:
298298 ; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
299299 ; SKX-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
300 ; SKX-NEXT: vmovdqu (%ecx), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x01]
300 ; SKX-NEXT: vmovdqa (%ecx), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x01]
301301 ; SKX-NEXT: vpcmpistri $7, (%eax), %xmm0 ## encoding: [0xc4,0xe3,0x79,0x63,0x00,0x07]
302302 ; SKX-NEXT: movl %ecx, %eax ## encoding: [0x89,0xc8]
303303 ; SKX-NEXT: retl ## encoding: [0xc3]
202202 ; SKX-LABEL: test_x86_ssse3_pmadd_ub_sw_128_load_op0:
203203 ; SKX: ## BB#0:
204204 ; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
205 ; SKX-NEXT: vmovdqu (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x08]
205 ; SKX-NEXT: vmovdqa (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x08]
206206 ; SKX-NEXT: vpmaddubsw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x04,0xc0]
207207 ; SKX-NEXT: retl ## encoding: [0xc3]
208208 %a0 = load <16 x i8>, <16 x i8>* %ptr
995995 ; X32-AVX512BW: ## BB#0:
996996 ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
997997 ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx
998 ; X32-AVX512BW-NEXT: vmovdqu (%ecx), %xmm0
998 ; X32-AVX512BW-NEXT: vmovdqa (%ecx), %xmm0
999999 ; X32-AVX512BW-NEXT: vmovdqu %xmm0, (%eax)
10001000 ; X32-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
10011001 ; X32-AVX512BW-NEXT: retl
10251025 ;
10261026 ; X64-AVX512BW-LABEL: test_broadcast_8i16_16i16_reuse:
10271027 ; X64-AVX512BW: ## BB#0:
1028 ; X64-AVX512BW-NEXT: vmovdqu (%rdi), %xmm0
1028 ; X64-AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
10291029 ; X64-AVX512BW-NEXT: vmovdqu %xmm0, (%rsi)
10301030 ; X64-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
10311031 ; X64-AVX512BW-NEXT: retq
10651065 ; X32-AVX512BW: ## BB#0:
10661066 ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
10671067 ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx
1068 ; X32-AVX512BW-NEXT: vmovdqu (%ecx), %xmm0
1068 ; X32-AVX512BW-NEXT: vmovdqa (%ecx), %xmm0
10691069 ; X32-AVX512BW-NEXT: vmovdqu %xmm0, (%eax)
10701070 ; X32-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
10711071 ; X32-AVX512BW-NEXT: retl
10951095 ;
10961096 ; X64-AVX512BW-LABEL: test_broadcast_16i8_32i8_reuse:
10971097 ; X64-AVX512BW: ## BB#0:
1098 ; X64-AVX512BW-NEXT: vmovdqu (%rdi), %xmm0
1098 ; X64-AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
10991099 ; X64-AVX512BW-NEXT: vmovdqu %xmm0, (%rsi)
11001100 ; X64-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
11011101 ; X64-AVX512BW-NEXT: retq
14341434 ; AVX512BW: # BB#0:
14351435 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
14361436 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
1437 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1437 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
14381438 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
14391439 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
14401440 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
1441 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1441 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
14421442 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
14431443 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
14441444 ; AVX512BW-NEXT: retq
17481748 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62]
17491749 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
17501750 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
1751 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1751 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
17521752 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
17531753 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
17541754 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
1755 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1755 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
17561756 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
17571757 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
17581758 ; AVX512BW-NEXT: retq
20682068 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60]
20692069 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
20702070 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
2071 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2071 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
20722072 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
20732073 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
20742074 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
2075 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2075 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
20762076 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
20772077 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
20782078 ; AVX512BW-NEXT: retq
24082408 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56]
24092409 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
24102410 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
2411 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2411 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
24122412 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
24132413 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
24142414 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
2415 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2415 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
24162416 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
24172417 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
24182418 ; AVX512BW-NEXT: retq
199199 ; AVX512VLBWDQ: # BB#0:
200200 ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
201201 ; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2
202 ; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
202 ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
203203 ; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2
204204 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4
205205 ; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1
464464 ; AVX512VLBWDQ: # BB#0:
465465 ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
466466 ; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2
467 ; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
467 ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
468468 ; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2
469469 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4
470470 ; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1
716716 ; AVX512VLBWDQ: # BB#0:
717717 ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
718718 ; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2
719 ; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
719 ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
720720 ; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2
721721 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4
722722 ; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1
957957 ; AVX512VLBWDQ: # BB#0:
958958 ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
959959 ; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2
960 ; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
960 ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
961961 ; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2
962962 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4
963963 ; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1
11691169 ; AVX512VLBWDQ: # BB#0:
11701170 ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
11711171 ; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2
1172 ; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
1172 ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
11731173 ; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2
11741174 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4
11751175 ; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1
13731373 ; AVX512VLBWDQ: # BB#0:
13741374 ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
13751375 ; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2
1376 ; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
1376 ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
13771377 ; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2
13781378 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4
13791379 ; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1
15541554 ; AVX512VLBWDQ: # BB#0:
15551555 ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
15561556 ; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2
1557 ; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
1557 ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
15581558 ; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2
15591559 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm0
15601560 ; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
17161716 ; AVX512VLBWDQ: # BB#0:
17171717 ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
17181718 ; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2
1719 ; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
1719 ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
17201720 ; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2
17211721 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm0
17221722 ; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
18731873 ;
18741874 ; AVX512VLBWDQ-LABEL: foldv8i16:
18751875 ; AVX512VLBWDQ: # BB#0:
1876 ; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
1876 ; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
18771877 ; AVX512VLBWDQ-NEXT: retq
18781878 ;
18791879 ; X32-SSE-LABEL: foldv8i16:
18971897 ;
18981898 ; AVX512VLBWDQ-LABEL: foldv8i16u:
18991899 ; AVX512VLBWDQ: # BB#0:
1900 ; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
1900 ; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
19011901 ; AVX512VLBWDQ-NEXT: retq
19021902 ;
19031903 ; X32-SSE-LABEL: foldv8i16u:
19211921 ;
19221922 ; AVX512VLBWDQ-LABEL: foldv16i8:
19231923 ; AVX512VLBWDQ: # BB#0:
1924 ; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
1924 ; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
19251925 ; AVX512VLBWDQ-NEXT: retq
19261926 ;
19271927 ; X32-SSE-LABEL: foldv16i8:
19451945 ;
19461946 ; AVX512VLBWDQ-LABEL: foldv16i8u:
19471947 ; AVX512VLBWDQ: # BB#0:
1948 ; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
1948 ; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
19491949 ; AVX512VLBWDQ-NEXT: retq
19501950 ;
19511951 ; X32-SSE-LABEL: foldv16i8u:
128128 ; AVX512VLBWDQ: # BB#0:
129129 ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
130130 ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
131 ; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
131 ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
132132 ; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
133133 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4
134134 ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1
320320 ; AVX512VLBWDQ: # BB#0:
321321 ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
322322 ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
323 ; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
323 ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
324324 ; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
325325 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4
326326 ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1
492492 ; AVX512VLBWDQ: # BB#0:
493493 ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
494494 ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
495 ; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
495 ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
496496 ; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
497497 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4
498498 ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1
654654 ; AVX512VLBWDQ: # BB#0:
655655 ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
656656 ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
657 ; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
657 ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
658658 ; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
659659 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4
660660 ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1
796796 ; AVX512VLBWDQ: # BB#0:
797797 ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
798798 ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
799 ; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
799 ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
800800 ; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
801801 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4
802802 ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1
923923 ; AVX512VLBWDQ: # BB#0:
924924 ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
925925 ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
926 ; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
926 ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
927927 ; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
928928 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4
929929 ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1
10301030 ; AVX512VLBWDQ: # BB#0:
10311031 ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
10321032 ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
1033 ; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
1033 ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
10341034 ; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
10351035 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
10361036 ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
11341134 ; AVX512VLBWDQ: # BB#0:
11351135 ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
11361136 ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
1137 ; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
1137 ; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
11381138 ; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
11391139 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
11401140 ; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
12351235 }
12361236
12371237 define <16 x i16> @foldv16i16() nounwind {
1238 ; NOBW-LABEL: foldv16i16:
1239 ; NOBW: # BB#0:
1240 ; NOBW-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10]
1241 ; NOBW-NEXT: retq
1242 ;
1243 ; AVX512VLBWDQ-LABEL: foldv16i16:
1244 ; AVX512VLBWDQ: # BB#0:
1245 ; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10]
1246 ; AVX512VLBWDQ-NEXT: retq
1238 ; X64-LABEL: foldv16i16:
1239 ; X64: # BB#0:
1240 ; X64-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10]
1241 ; X64-NEXT: retq
12471242 ;
12481243 ; X32-AVX-LABEL: foldv16i16:
12491244 ; X32-AVX: # BB#0:
12541249 }
12551250
12561251 define <16 x i16> @foldv16i16u() nounwind {
1257 ; NOBW-LABEL: foldv16i16u:
1258 ; NOBW: # BB#0:
1259 ; NOBW-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10]
1260 ; NOBW-NEXT: retq
1261 ;
1262 ; AVX512VLBWDQ-LABEL: foldv16i16u:
1263 ; AVX512VLBWDQ: # BB#0:
1264 ; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10]
1265 ; AVX512VLBWDQ-NEXT: retq
1252 ; X64-LABEL: foldv16i16u:
1253 ; X64: # BB#0:
1254 ; X64-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10]
1255 ; X64-NEXT: retq
12661256 ;
12671257 ; X32-AVX-LABEL: foldv16i16u:
12681258 ; X32-AVX: # BB#0:
12731263 }
12741264
12751265 define <32 x i8> @foldv32i8() nounwind {
1276 ; NOBW-LABEL: foldv32i8:
1277 ; NOBW: # BB#0:
1278 ; NOBW-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1]
1279 ; NOBW-NEXT: retq
1280 ;
1281 ; AVX512VLBWDQ-LABEL: foldv32i8:
1282 ; AVX512VLBWDQ: # BB#0:
1283 ; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1]
1284 ; AVX512VLBWDQ-NEXT: retq
1266 ; X64-LABEL: foldv32i8:
1267 ; X64: # BB#0:
1268 ; X64-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1]
1269 ; X64-NEXT: retq
12851270 ;
12861271 ; X32-AVX-LABEL: foldv32i8:
12871272 ; X32-AVX: # BB#0:
12921277 }
12931278
12941279 define <32 x i8> @foldv32i8u() nounwind {
1295 ; NOBW-LABEL: foldv32i8u:
1296 ; NOBW: # BB#0:
1297 ; NOBW-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1]
1298 ; NOBW-NEXT: retq
1299 ;
1300 ; AVX512VLBWDQ-LABEL: foldv32i8u:
1301 ; AVX512VLBWDQ: # BB#0:
1302 ; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1]
1303 ; AVX512VLBWDQ-NEXT: retq
1280 ; X64-LABEL: foldv32i8u:
1281 ; X64: # BB#0:
1282 ; X64-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1]
1283 ; X64-NEXT: retq
13041284 ;
13051285 ; X32-AVX-LABEL: foldv32i8u:
13061286 ; X32-AVX: # BB#0:
3030 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
3131 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
3232 ; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2
33 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
33 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
3434 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
3535 ; AVX512BW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4
3636 ; AVX512BW-NEXT: vpxorq %zmm4, %zmm0, %zmm0
109109 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
110110 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
111111 ; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2
112 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
112 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
113113 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
114114 ; AVX512BW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4
115115 ; AVX512BW-NEXT: vpxorq %zmm4, %zmm0, %zmm0
186186 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
187187 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
188188 ; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2
189 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
189 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
190190 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
191191 ; AVX512BW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4
192192 ; AVX512BW-NEXT: vpxorq %zmm4, %zmm0, %zmm0
273273 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
274274 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
275275 ; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2
276 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
276 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
277277 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
278278 ; AVX512BW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4
279279 ; AVX512BW-NEXT: vpxorq %zmm4, %zmm0, %zmm0
376376 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
377377 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
378378 ; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2
379 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
379 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
380380 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
381381 ; AVX512BW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4
382382 ; AVX512BW-NEXT: vpxorq %zmm4, %zmm0, %zmm0
466466 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
467467 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
468468 ; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2
469 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
469 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
470470 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
471471 ; AVX512BW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4
472472 ; AVX512BW-NEXT: vpxorq %zmm4, %zmm0, %zmm0
581581 ; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm1
582582 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
583583 ; AVX512BW-NEXT: vpandnq %zmm2, %zmm0, %zmm1
584 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
584 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
585585 ; AVX512BW-NEXT: vpshufb %zmm1, %zmm3, %zmm1
586586 ; AVX512BW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4
587587 ; AVX512BW-NEXT: vpxorq %zmm4, %zmm0, %zmm0
683683 ; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm1
684684 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
685685 ; AVX512BW-NEXT: vpandnq %zmm2, %zmm0, %zmm1
686 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
686 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
687687 ; AVX512BW-NEXT: vpshufb %zmm1, %zmm3, %zmm1
688688 ; AVX512BW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4
689689 ; AVX512BW-NEXT: vpxorq %zmm4, %zmm0, %zmm0
3131 ; AVX512BW: # BB#0:
3232 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
3333 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
34 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
34 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
3535 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
3636 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
3737 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
8585 ; AVX512BW: # BB#0:
8686 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
8787 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
88 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
88 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
8989 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
9090 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
9191 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
136136 ; AVX512BW: # BB#0:
137137 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
138138 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
139 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
139 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
140140 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
141141 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
142142 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
161161 ; AVX512VPOPCNTDQ-BW: # BB#0:
162162 ; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
163163 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
164 ; AVX512VPOPCNTDQ-BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
164 ; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
165165 ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
166166 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
167167 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
198198 ; AVX512BW: # BB#0:
199199 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
200200 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
201 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
201 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
202202 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
203203 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
204204 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
228228 ; AVX512VPOPCNTDQ-BW: # BB#0:
229229 ; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
230230 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
231 ; AVX512VPOPCNTDQ-BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
231 ; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
232232 ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
233233 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
234234 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
475475 ;
476476 ; AVX512VL-LABEL: var_rotate_v8i16:
477477 ; AVX512VL: # BB#0:
478 ; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
478 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
479479 ; AVX512VL-NEXT: vpsubw %xmm1, %xmm2, %xmm2
480480 ; AVX512VL-NEXT: vpsllvw %xmm1, %xmm0, %xmm1
481481 ; AVX512VL-NEXT: vpsrlvw %xmm2, %xmm0, %xmm0
700700 ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
701701 ; AVX-NEXT: retq
702702 ;
703 ; AVX512BW-LABEL: var_rotate_v16i8:
704 ; AVX512BW: # BB#0:
705 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
706 ; AVX512BW-NEXT: vpsubb %xmm1, %xmm2, %xmm2
707 ; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
708 ; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
709 ; AVX512BW-NEXT: vpsllvd %zmm1, %zmm0, %zmm1
710 ; AVX512BW-NEXT: vpmovdb %zmm1, %xmm1
711 ; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
712 ; AVX512BW-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0
713 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
714 ; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0
715 ; AVX512BW-NEXT: vzeroupper
716 ; AVX512BW-NEXT: retq
717 ;
718 ; AVX512VL-LABEL: var_rotate_v16i8:
719 ; AVX512VL: # BB#0:
720 ; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
721 ; AVX512VL-NEXT: vpsubb %xmm1, %xmm2, %xmm2
722 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
723 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
724 ; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm1
725 ; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1
726 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
727 ; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0
728 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
729 ; AVX512VL-NEXT: vpor %xmm0, %xmm1, %xmm0
730 ; AVX512VL-NEXT: vzeroupper
731 ; AVX512VL-NEXT: retq
703 ; AVX512-LABEL: var_rotate_v16i8:
704 ; AVX512: # BB#0:
705 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
706 ; AVX512-NEXT: vpsubb %xmm1, %xmm2, %xmm2
707 ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
708 ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
709 ; AVX512-NEXT: vpsllvd %zmm1, %zmm0, %zmm1
710 ; AVX512-NEXT: vpmovdb %zmm1, %xmm1
711 ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
712 ; AVX512-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0
713 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
714 ; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
715 ; AVX512-NEXT: vzeroupper
716 ; AVX512-NEXT: retq
732717 ;
733718 ; XOP-LABEL: var_rotate_v16i8:
734719 ; XOP: # BB#0:
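The v16i8 hunks above collapse the separate AVX512BW and AVX512VL blocks into a single AVX512 block, and the constant-pool load flips from vmovdqu to vmovdqa. A minimal sketch of the selection change behind this (a hypothetical test, not from this patch; the RUN line and function name are illustrative): with the unmasked byte/word load patterns no longer taking priority, a plain byte-vector load matches the qword-granular patterns, so an aligned load can use the aligned form.

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s
define <64 x i8> @load_v64i8_aligned(<64 x i8>* %p) {
; CHECK-LABEL: load_v64i8_aligned:
; CHECK: vmovdqa64 (%rdi), %zmm0
  %v = load <64 x i8>, <64 x i8>* %p, align 64
  ret <64 x i8> %v
}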
281281 ;
282282 ; AVX512VL-LABEL: var_rotate_v16i16:
283283 ; AVX512VL: # BB#0:
284 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
284 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
285285 ; AVX512VL-NEXT: vpsubw %ymm1, %ymm2, %ymm2
286286 ; AVX512VL-NEXT: vpsllvw %ymm1, %ymm0, %ymm1
287287 ; AVX512VL-NEXT: vpsrlvw %ymm2, %ymm0, %ymm0
406406 ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
407407 ; AVX2-NEXT: retq
408408 ;
409 ; AVX512BW-LABEL: var_rotate_v32i8:
410 ; AVX512BW: # BB#0:
411 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
412 ; AVX512BW-NEXT: vpsubb %ymm1, %ymm2, %ymm2
413 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
414 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
415 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
416 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
417 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
418 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
419 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
420 ; AVX512BW-NEXT: vpor %ymm0, %ymm1, %ymm0
421 ; AVX512BW-NEXT: retq
422 ;
423 ; AVX512VL-LABEL: var_rotate_v32i8:
424 ; AVX512VL: # BB#0:
425 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
426 ; AVX512VL-NEXT: vpsubb %ymm1, %ymm2, %ymm2
427 ; AVX512VL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
428 ; AVX512VL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
429 ; AVX512VL-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
430 ; AVX512VL-NEXT: vpmovwb %zmm1, %ymm1
431 ; AVX512VL-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
432 ; AVX512VL-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
433 ; AVX512VL-NEXT: vpmovwb %zmm0, %ymm0
434 ; AVX512VL-NEXT: vpor %ymm0, %ymm1, %ymm0
435 ; AVX512VL-NEXT: retq
409 ; AVX512-LABEL: var_rotate_v32i8:
410 ; AVX512: # BB#0:
411 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
412 ; AVX512-NEXT: vpsubb %ymm1, %ymm2, %ymm2
413 ; AVX512-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
414 ; AVX512-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
415 ; AVX512-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
416 ; AVX512-NEXT: vpmovwb %zmm1, %ymm1
417 ; AVX512-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
418 ; AVX512-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
419 ; AVX512-NEXT: vpmovwb %zmm0, %ymm0
420 ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
421 ; AVX512-NEXT: retq
436422 ;
437423 ; XOPAVX1-LABEL: var_rotate_v32i8:
438424 ; XOPAVX1: # BB#0:
8080 ;
8181 ; AVX512BW-LABEL: var_rotate_v32i16:
8282 ; AVX512BW: # BB#0:
83 ; AVX512BW-NEXT: vmovdqu16 {{.*#+}} zmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
83 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
8484 ; AVX512BW-NEXT: vpsubw %zmm1, %zmm2, %zmm2
8585 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
8686 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
8989 ;
9090 ; AVX512VLBW-LABEL: var_rotate_v32i16:
9191 ; AVX512VLBW: # BB#0:
92 ; AVX512VLBW-NEXT: vmovdqu16 {{.*#+}} zmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
92 ; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
9393 ; AVX512VLBW-NEXT: vpsubw %zmm1, %zmm2, %zmm2
9494 ; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
9595 ; AVX512VLBW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
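On the zmm side, the [16,16,...] splat feeding vpsubw now loads with vmovdqa64 instead of vmovdqu16: 64-byte vector constants are emitted 64-byte aligned in the constant pool, so the aligned pattern applies once the unaligned byte/word patterns stop shadowing it. A sketch of the kind of input that produces this (hypothetical, assuming AVX512BW; later compilers may match such a splat as a broadcast instead):

define <32 x i16> @sub_from_16(<32 x i16> %x) {
; CHECK: vmovdqa64 {{.*}}(%rip), %zmm1
; CHECK: vpsubw %zmm0, %zmm1, %zmm0
  %r = sub <32 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %x
  ret <32 x i16> %r
}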
225225 ;
226226 ; AVX512BW-LABEL: var_rotate_v64i8:
227227 ; AVX512BW: # BB#0:
228 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
228 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
229229 ; AVX512BW-NEXT: vpsubb %zmm1, %zmm2, %zmm2
230230 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm3
231231 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
260260 ;
261261 ; AVX512VLBW-LABEL: var_rotate_v64i8:
262262 ; AVX512VLBW: # BB#0:
263 ; AVX512VLBW-NEXT: vmovdqu8 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
263 ; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
264264 ; AVX512VLBW-NEXT: vpsubb %zmm1, %zmm2, %zmm2
265265 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm3
266266 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
482482 ;
483483 ; AVX512BW-LABEL: constant_rotate_v64i8:
484484 ; AVX512BW: # BB#0:
485 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
485 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
486486 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
487487 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
488488 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
495495 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
496496 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
497497 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1}
498 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536]
498 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536]
499499 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
500500 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm3
501501 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
515515 ;
516516 ; AVX512VLBW-LABEL: constant_rotate_v64i8:
517517 ; AVX512VLBW: # BB#0:
518 ; AVX512VLBW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
518 ; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
519519 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
520520 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm2
521521 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
528528 ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
529529 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
530530 ; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1}
531 ; AVX512VLBW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536]
531 ; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536]
532532 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
533533 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm3
534534 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
355355 ; AVX512BW: # BB#0:
356356 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
357357 ; AVX512BW-NEXT: vpsraw $4, %zmm1, %zmm2
358 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
358 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
359359 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8],zmm3[8],zmm0[9],zmm3[9],zmm0[10],zmm3[10],zmm0[11],zmm3[11],zmm0[12],zmm3[12],zmm0[13],zmm3[13],zmm0[14],zmm3[14],zmm0[15],zmm3[15],zmm0[24],zmm3[24],zmm0[25],zmm3[25],zmm0[26],zmm3[26],zmm0[27],zmm3[27],zmm0[28],zmm3[28],zmm0[29],zmm3[29],zmm0[30],zmm3[30],zmm0[31],zmm3[31],zmm0[40],zmm3[40],zmm0[41],zmm3[41],zmm0[42],zmm3[42],zmm0[43],zmm3[43],zmm0[44],zmm3[44],zmm0[45],zmm3[45],zmm0[46],zmm3[46],zmm0[47],zmm3[47],zmm0[56],zmm3[56],zmm0[57],zmm3[57],zmm0[58],zmm3[58],zmm0[59],zmm3[59],zmm0[60],zmm3[60],zmm0[61],zmm3[61],zmm0[62],zmm3[62],zmm0[63],zmm3[63]
360360 ; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
361361 ; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1}
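Note the unchanged context line just above: the merge-masked vmovdqu8 %zmm2, %zmm1 {%k1} stays byte-granular, because under a mask the element size is semantically visible; only the unmasked load patterns were dropped. A hedged sketch of a masked load that must keep vmovdqu8 (hypothetical function, assuming AVX512BW and the standard masked-load intrinsic):

define <64 x i8> @masked_load_v64i8(<64 x i8>* %p, <64 x i8> %passthru, i64 %m) {
; CHECK: kmovq %rsi, %k1
; CHECK: vmovdqu8 (%rdi), %zmm0 {%k1}
  %k = bitcast i64 %m to <64 x i1>
  %v = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* %p, i32 64, <64 x i1> %k, <64 x i8> %passthru)
  ret <64 x i8> %v
}
declare <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>*, i32, <64 x i1>, <64 x i8>)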
268268 ;
269269 ; AVX512BW-LABEL: constant_shift_v64i8:
270270 ; AVX512BW: # BB#0:
271 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
271 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
272272 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
273273 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm2
274274 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
251251 ;
252252 ; AVX512BW-LABEL: constant_shift_v64i8:
253253 ; AVX512BW: # BB#0:
254 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
254 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
255255 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
256256 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
257257 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
13831383 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
13841384 ; SSE41-NEXT: retq
13851385 ;
1386 ; AVX1OR2-LABEL: PR12412:
1387 ; AVX1OR2: # BB#0: # %entry
1388 ; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1389 ; AVX1OR2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1390 ; AVX1OR2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1391 ; AVX1OR2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1392 ; AVX1OR2-NEXT: retq
1393 ;
1394 ; AVX512VL-LABEL: PR12412:
1395 ; AVX512VL: # BB#0: # %entry
1396 ; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1397 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1398 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1399 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1400 ; AVX512VL-NEXT: retq
1386 ; AVX-LABEL: PR12412:
1387 ; AVX: # BB#0: # %entry
1388 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1389 ; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1390 ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1391 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1392 ; AVX-NEXT: retq
14011393 entry:
14021394 %0 = shufflevector <16 x i8> %inval1, <16 x i8> %inval2, <16 x i32>
14031395 ret <16 x i8> %0
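The PR12412 hunk folds the former AVX1OR2 and AVX512VL blocks into one AVX block because the AVX512VL output (vmovdqa after this change) is now byte-identical to AVX/AVX2. Illustrative RUN lines for how such shared check prefixes are wired up (hypothetical; the actual test's triples and prefix names may differ):

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512VL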
167167 ;
168168 ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
169169 ; AVX512VL: # BB#0:
170 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
170 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
171171 ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
172172 ; AVX512VL-NEXT: retq
173173 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
195195 ;
196196 ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00:
197197 ; AVX512VL: # BB#0:
198 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0]
198 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0]
199199 ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
200200 ; AVX512VL-NEXT: retq
201201 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
222222 ;
223223 ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
224224 ; AVX512VL: # BB#0:
225 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0]
225 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0]
226226 ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
227227 ; AVX512VL-NEXT: retq
228228 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
249249 ;
250250 ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
251251 ; AVX512VL: # BB#0:
252 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0]
252 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0]
253253 ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
254254 ; AVX512VL-NEXT: retq
255255 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
275275 ;
276276 ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00:
277277 ; AVX512VL: # BB#0:
278 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0]
278 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0]
279279 ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
280280 ; AVX512VL-NEXT: retq
281281 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
301301 ;
302302 ; AVX512VL-LABEL: shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00:
303303 ; AVX512VL: # BB#0:
304 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0]
304 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0]
305305 ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
306306 ; AVX512VL-NEXT: retq
307307 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
327327 ;
328328 ; AVX512VL-LABEL: shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
329329 ; AVX512VL: # BB#0:
330 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
330 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
331331 ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
332332 ; AVX512VL-NEXT: retq
333333 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
850850 ;
851851 ; AVX512VL-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24:
852852 ; AVX512VL: # BB#0:
853 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,16,0,16,0,16,0,16,8,24,8,24,8,24,8,24]
853 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,16,0,16,0,16,0,16,8,24,8,24,8,24,8,24]
854854 ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
855855 ; AVX512VL-NEXT: retq
856856 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
877877 ;
878878 ; AVX512VL-LABEL: shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15:
879879 ; AVX512VL: # BB#0:
880 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,0,0,0,20,21,22,23,8,8,8,8,28,29,30,31]
880 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,20,21,22,23,8,8,8,8,28,29,30,31]
881881 ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
882882 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
883883 ; AVX512VL-NEXT: retq
910910 ;
911911 ; AVX512VL-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12:
912912 ; AVX512VL: # BB#0:
913 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [3,2,1,0,23,22,21,20,11,10,9,8,31,30,29,28]
913 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,23,22,21,20,11,10,9,8,31,30,29,28]
914914 ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
915915 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
916916 ; AVX512VL-NEXT: retq
941941 ;
942942 ; AVX512VL-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08:
943943 ; AVX512VL: # BB#0:
944 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [3,2,1,0,19,18,17,16,11,10,9,8,27,26,25,24]
944 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,19,18,17,16,11,10,9,8,27,26,25,24]
945945 ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
946946 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
947947 ; AVX512VL-NEXT: retq
11301130 ;
11311131 ; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31:
11321132 ; AVX512VL: # BB#0:
1133 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,16,1,17,2,18,3,19,12,28,13,29,14,30,15,31]
1133 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,16,1,17,2,18,3,19,12,28,13,29,14,30,15,31]
11341134 ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
11351135 ; AVX512VL-NEXT: retq
11361136 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
11561156 ;
11571157 ; AVX512VL-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27:
11581158 ; AVX512VL: # BB#0:
1159 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27]
1159 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27]
11601160 ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
11611161 ; AVX512VL-NEXT: retq
11621162 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
14091409 ;
14101410 ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20:
14111411 ; AVX512VL: # BB#0:
1412 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,0,0,0,4,4,4,4,16,16,16,16,20,20,20,20]
1412 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,4,4,4,4,16,16,16,16,20,20,20,20]
14131413 ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
14141414 ; AVX512VL-NEXT: retq
14151415 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
14361436 ;
14371437 ; AVX512VL-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20:
14381438 ; AVX512VL: # BB#0:
1439 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20]
1439 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20]
14401440 ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
14411441 ; AVX512VL-NEXT: retq
14421442 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
14641464 ;
14651465 ; AVX512VL-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28:
14661466 ; AVX512VL: # BB#0:
1467 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [8,8,8,8,12,12,12,12,24,24,24,24,28,28,28,28]
1467 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,12,12,12,12,24,24,24,24,28,28,28,28]
14681468 ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
14691469 ; AVX512VL-NEXT: retq
14701470 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
14911491 ;
14921492 ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28:
14931493 ; AVX512VL: # BB#0:
1494 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,0,0,0,4,4,4,4,24,24,24,24,28,28,28,28]
1494 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,4,4,4,4,24,24,24,24,28,28,28,28]
14951495 ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
14961496 ; AVX512VL-NEXT: retq
14971497 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
15151515 ;
15161516 ; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23:
15171517 ; AVX512VL: # BB#0:
1518 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23]
1518 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23]
15191519 ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
15201520 ; AVX512VL-NEXT: retq
15211521 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
17041704 ;
17051705 ; AVX512VL-LABEL: shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_zz:
17061706 ; AVX512VL: # BB#0:
1707 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [28,1,2,3,29,5,6,7,30,9,10,11,31,13,14,15]
1707 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [28,1,2,3,29,5,6,7,30,9,10,11,31,13,14,15]
17081708 ; AVX512VL-NEXT: vpxor %ymm1, %ymm1, %ymm1
17091709 ; AVX512VL-NEXT: vpermt2w %ymm0, %ymm2, %ymm1
17101710 ; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
18011801 ;
18021802 ; AVX512VL-LABEL: shuffle_v16i16_01_02_03_04_05_06_07_00_17_18_19_20_21_22_23_16:
18031803 ; AVX512VL: # BB#0:
1804 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [1,2,3,4,5,6,7,0,17,18,19,20,21,22,23,16]
1804 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,3,4,5,6,7,0,17,18,19,20,21,22,23,16]
18051805 ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
18061806 ; AVX512VL-NEXT: retq
18071807 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
18241824 ;
18251825 ; AVX512VL-LABEL: shuffle_v16i16_07_00_01_02_03_04_05_06_23_16_17_18_19_20_21_22:
18261826 ; AVX512VL: # BB#0:
1827 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [7,0,1,2,3,4,5,6,23,16,17,18,19,20,21,22]
1827 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [7,0,1,2,3,4,5,6,23,16,17,18,19,20,21,22]
18281828 ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
18291829 ; AVX512VL-NEXT: retq
18301830 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
19011901 ;
19021902 ; AVX512VL-LABEL: shuffle_v16i16_04_05_06_07_16_17_18_27_12_13_14_15_24_25_26_27:
19031903 ; AVX512VL: # BB#0:
1904 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [4,5,6,7,16,17,18,27,12,13,14,15,24,25,26,27]
1904 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,6,7,16,17,18,27,12,13,14,15,24,25,26,27]
19051905 ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
19061906 ; AVX512VL-NEXT: retq
19071907 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
21402140 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
21412141 ; AVX1-NEXT: retq
21422142 ;
2143 ; AVX2-LABEL: shuffle_v16i16_07_05_06_04_03_01_02_08_15_13_14_12_11_09_10_08:
2144 ; AVX2: # BB#0:
2145 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2146 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1]
2147 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3
2148 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
2149 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2150 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
2151 ; AVX2-NEXT: retq
2152 ;
2153 ; AVX512VL-LABEL: shuffle_v16i16_07_05_06_04_03_01_02_08_15_13_14_12_11_09_10_08:
2154 ; AVX512VL: # BB#0:
2155 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
2156 ; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1]
2157 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3
2158 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
2159 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2160 ; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
2161 ; AVX512VL-NEXT: retq
2143 ; AVX2OR512VL-LABEL: shuffle_v16i16_07_05_06_04_03_01_02_08_15_13_14_12_11_09_10_08:
2144 ; AVX2OR512VL: # BB#0:
2145 ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
2146 ; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1]
2147 ; AVX2OR512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3
2148 ; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
2149 ; AVX2OR512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2150 ; AVX2OR512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
2151 ; AVX2OR512VL-NEXT: retq
21622152 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
21632153 ret <16 x i16> %shuffle
21642154 }
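These xmm/ymm hunks print plain vmovdqa rather than vmovdqa64: once isel picks the aligned EVEX load, the EVEX-to-VEX compression pass shrinks it to the VEX encoding when no mask or extended registers are involved, which is what lets the AVX512VL bodies match AVX2 and unify under AVX2OR512VL where the instruction mix is shared. A small sketch of a ymm case whose index-vector load compresses this way (hypothetical, assuming AVX512BW+VL; a full 16-element word reversal needs the cross-lane vpermw):

define <16 x i16> @reverse_v16i16(<16 x i16> %a) {
; CHECK: vmovdqa {{.*}}(%rip), %ymm1
; CHECK: vpermw %ymm0, %ymm1, %ymm0
  %s = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  ret <16 x i16> %s
}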
22942284 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
22952285 ; AVX1-NEXT: retq
22962286 ;
2297 ; AVX2-LABEL: shuffle_v16i16_02_06_04_00_05_01_07_11_10_14_12_08_13_09_15_11:
2298 ; AVX2: # BB#0:
2299 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2300 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7]
2301 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3
2302 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
2303 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2304 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
2305 ; AVX2-NEXT: retq
2306 ;
2307 ; AVX512VL-LABEL: shuffle_v16i16_02_06_04_00_05_01_07_11_10_14_12_08_13_09_15_11:
2308 ; AVX512VL: # BB#0:
2309 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
2310 ; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7]
2311 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3
2312 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
2313 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2314 ; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
2315 ; AVX512VL-NEXT: retq
2287 ; AVX2OR512VL-LABEL: shuffle_v16i16_02_06_04_00_05_01_07_11_10_14_12_08_13_09_15_11:
2288 ; AVX2OR512VL: # BB#0:
2289 ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
2290 ; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7]
2291 ; AVX2OR512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3
2292 ; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
2293 ; AVX2OR512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2294 ; AVX2OR512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
2295 ; AVX2OR512VL-NEXT: retq
23162296 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
23172297 ret <16 x i16> %shuffle
23182298 }
23282308 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
23292309 ; AVX1-NEXT: retq
23302310 ;
2331 ; AVX2-LABEL: shuffle_v16i16_02_00_06_04_05_01_07_11_10_08_14_12_13_09_15_11:
2332 ; AVX2: # BB#0:
2333 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2334 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7]
2335 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3
2336 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
2337 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2338 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
2339 ; AVX2-NEXT: retq
2340 ;
2341 ; AVX512VL-LABEL: shuffle_v16i16_02_00_06_04_05_01_07_11_10_08_14_12_13_09_15_11:
2342 ; AVX512VL: # BB#0:
2343 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
2344 ; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7]
2345 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3
2346 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
2347 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2348 ; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
2349 ; AVX512VL-NEXT: retq
2311 ; AVX2OR512VL-LABEL: shuffle_v16i16_02_00_06_04_05_01_07_11_10_08_14_12_13_09_15_11:
2312 ; AVX2OR512VL: # BB#0:
2313 ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
2314 ; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7]
2315 ; AVX2OR512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3
2316 ; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
2317 ; AVX2OR512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2318 ; AVX2OR512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
2319 ; AVX2OR512VL-NEXT: retq
23502320 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
23512321 ret <16 x i16> %shuffle
23522322 }
23622332 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
23632333 ; AVX1-NEXT: retq
23642334 ;
2365 ; AVX2-LABEL: shuffle_v16i16_02_06_04_00_01_03_07_13_10_14_12_08_09_11_15_13:
2366 ; AVX2: # BB#0:
2367 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2368 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11]
2369 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3
2370 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
2371 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2372 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
2373 ; AVX2-NEXT: retq
2374 ;
2375 ; AVX512VL-LABEL: shuffle_v16i16_02_06_04_00_01_03_07_13_10_14_12_08_09_11_15_13:
2376 ; AVX512VL: # BB#0:
2377 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
2378 ; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11]
2379 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3
2380 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
2381 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2382 ; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
2383 ; AVX512VL-NEXT: retq
2335 ; AVX2OR512VL-LABEL: shuffle_v16i16_02_06_04_00_01_03_07_13_10_14_12_08_09_11_15_13:
2336 ; AVX2OR512VL: # BB#0:
2337 ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
2338 ; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11]
2339 ; AVX2OR512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3
2340 ; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
2341 ; AVX2OR512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2342 ; AVX2OR512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
2343 ; AVX2OR512VL-NEXT: retq
23842344 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
23852345 ret <16 x i16> %shuffle
23862346 }
23962356 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
23972357 ; AVX1-NEXT: retq
23982358 ;
2399 ; AVX2-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11:
2400 ; AVX2: # BB#0:
2401 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2402 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7]
2403 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3
2404 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
2405 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2406 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
2407 ; AVX2-NEXT: retq
2408 ;
2409 ; AVX512VL-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11:
2410 ; AVX512VL: # BB#0:
2411 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
2412 ; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7]
2413 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3
2414 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
2415 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2416 ; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
2417 ; AVX512VL-NEXT: retq
2359 ; AVX2OR512VL-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11:
2360 ; AVX2OR512VL: # BB#0:
2361 ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
2362 ; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7]
2363 ; AVX2OR512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3
2364 ; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
2365 ; AVX2OR512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2366 ; AVX2OR512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
2367 ; AVX2OR512VL-NEXT: retq
24182368 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
24192369 ret <16 x i16> %shuffle
24202370 }
27942744 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
27952745 ; AVX1-NEXT: retq
27962746 ;
2797 ; AVX2-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13:
2798 ; AVX2: # BB#0:
2799 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2800 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11]
2801 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3
2802 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
2803 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2804 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
2805 ; AVX2-NEXT: retq
2806 ;
2807 ; AVX512VL-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13:
2808 ; AVX512VL: # BB#0:
2809 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
2810 ; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11]
2811 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3
2812 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
2813 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2814 ; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
2815 ; AVX512VL-NEXT: retq
2747 ; AVX2OR512VL-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13:
2748 ; AVX2OR512VL: # BB#0:
2749 ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
2750 ; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11]
2751 ; AVX2OR512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3
2752 ; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
2753 ; AVX2OR512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2754 ; AVX2OR512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
2755 ; AVX2OR512VL-NEXT: retq
28162756 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
28172757 ret <16 x i16> %shuffle
28182758 }
28432783 ;
28442784 ; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_27:
28452785 ; AVX512VL: # BB#0:
2846 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,16,1,17,2,18,3,27,8,24,9,25,10,26,11,27]
2786 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,16,1,17,2,18,3,27,8,24,9,25,10,26,11,27]
28472787 ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
28482788 ; AVX512VL-NEXT: retq
28492789 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
28762816 ;
28772817 ; AVX512VL-LABEL: shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_31:
28782818 ; AVX512VL: # BB#0:
2879 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,20,1,21,2,22,3,31,8,28,9,29,10,30,11,31]
2819 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,20,1,21,2,22,3,31,8,28,9,29,10,30,11,31]
28802820 ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
28812821 ; AVX512VL-NEXT: retq
28822822 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
29092849 ;
29102850 ; AVX512VL-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_31_12_28_13_29_14_30_15_31:
29112851 ; AVX512VL: # BB#0:
2912 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [4,20,5,21,6,22,7,31,12,28,13,29,14,30,15,31]
2852 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,20,5,21,6,22,7,31,12,28,13,29,14,30,15,31]
29132853 ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
29142854 ; AVX512VL-NEXT: retq
29152855 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
29422882 ;
29432883 ; AVX512VL-LABEL: shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_27:
29442884 ; AVX512VL: # BB#0:
2945 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [4,16,5,17,6,18,7,27,12,24,13,25,14,26,15,27]
2885 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,16,5,17,6,18,7,27,12,24,13,25,14,26,15,27]
29462886 ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
29472887 ; AVX512VL-NEXT: retq
29482888 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
29822922 ;
29832923 ; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31:
29842924 ; AVX512VL: # BB#0:
2985 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,16,1,17,6,22,7,31,8,24,9,25,14,30,15,31]
2925 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,16,1,17,6,22,7,31,8,24,9,25,14,30,15,31]
29862926 ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
29872927 ; AVX512VL-NEXT: retq
29882928 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
30192959 ;
30202960 ; AVX512VL-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25:
30212961 ; AVX512VL: # BB#0:
3022 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,20,1,21,6,16,7,25,8,28,9,29,14,24,15,25]
2962 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,20,1,21,6,16,7,25,8,28,9,29,14,24,15,25]
30232963 ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
30242964 ; AVX512VL-NEXT: retq
30252965 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
30552995 ;
30562996 ; AVX512VL-LABEL: shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26:
30572997 ; AVX512VL: # BB#0:
3058 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [1,0,17,16,3,2,19,26,9,8,25,24,11,10,27,26]
2998 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,17,16,3,2,19,26,9,8,25,24,11,10,27,26]
30592999 ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
30603000 ; AVX512VL-NEXT: retq
30613001 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
30883028 ;
30893029 ; AVX512VL-LABEL: shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_11:
30903030 ; AVX512VL: # BB#0:
3091 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,16,1,17,2,18,3,27,8,24,9,25,10,26,11,27]
3031 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,16,1,17,2,18,3,27,8,24,9,25,10,26,11,27]
30923032 ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
30933033 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
30943034 ; AVX512VL-NEXT: retq
31223062 ;
31233063 ; AVX512VL-LABEL: shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_15:
31243064 ; AVX512VL: # BB#0:
3125 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [4,20,5,21,6,22,7,31,12,28,13,29,14,30,15,31]
3065 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,20,5,21,6,22,7,31,12,28,13,29,14,30,15,31]
31263066 ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
31273067 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
31283068 ; AVX512VL-NEXT: retq
31603100 ;
31613101 ; AVX512VL-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31:
31623102 ; AVX512VL: # BB#0:
3163 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,2,1,3,20,22,21,31,8,10,9,11,28,30,29,31]
3103 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,1,3,20,22,21,31,8,10,9,11,28,30,29,31]
31643104 ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
31653105 ; AVX512VL-NEXT: retq
31663106 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
31903130 ;
31913131 ; AVX512VL-LABEL: shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_uu:
31923132 ; AVX512VL: # BB#0:
3193 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <4,4,3,18,u,u,u,u,12,12,11,26,u,u,u,u>
3133 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = <4,4,3,18,u,u,u,u,12,12,11,26,u,u,u,u>
31943134 ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
31953135 ; AVX512VL-NEXT: retq
31963136 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
32183158 ;
32193159 ; AVX512VL-LABEL: shuffle_v16i16_00_03_02_21_uu_uu_uu_uu_08_11_10_29_uu_uu_uu_uu:
32203160 ; AVX512VL: # BB#0:
3221 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <0,3,2,21,u,u,u,u,8,11,10,29,u,u,u,u>
3161 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,3,2,21,u,u,u,u,8,11,10,29,u,u,u,u>
32223162 ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
32233163 ; AVX512VL-NEXT: retq
32243164 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
32593199 ;
32603200 ; AVX512VL-LABEL: shuffle_v16i16_00_01_02_21_uu_uu_uu_uu_08_09_10_29_uu_uu_uu_uu:
32613201 ; AVX512VL: # BB#0:
3262 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <0,1,2,21,u,u,u,u,8,9,10,29,u,u,u,u>
3202 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,2,21,u,u,u,u,8,9,10,29,u,u,u,u>
32633203 ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
32643204 ; AVX512VL-NEXT: retq
32653205 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
32853225 ;
32863226 ; AVX512VL-LABEL: shuffle_v16i16_uu_uu_uu_uu_20_21_22_11_uu_uu_uu_uu_28_29_30_11:
32873227 ; AVX512VL: # BB#0:
3288 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <u,u,u,u,4,5,6,27,u,u,u,u,12,13,14,27>
3228 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,4,5,6,27,u,u,u,u,12,13,14,27>
32893229 ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
32903230 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
32913231 ; AVX512VL-NEXT: retq
33133253 ;
33143254 ; AVX512VL-LABEL: shuffle_v16i16_20_21_22_03_uu_uu_uu_uu_28_29_30_11_uu_uu_uu_uu:
33153255 ; AVX512VL: # BB#0:
3316 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <4,5,6,19,u,u,u,u,12,13,14,27,u,u,u,u>
3256 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = <4,5,6,19,u,u,u,u,12,13,14,27,u,u,u,u>
33173257 ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
33183258 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
33193259 ; AVX512VL-NEXT: retq
33473287 ;
33483288 ; AVX512VL-LABEL: shuffle_v16i16_00_01_02_21_20_21_22_11_08_09_10_29_28_29_30_11:
33493289 ; AVX512VL: # BB#0:
3350 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,1,2,21,20,21,22,11,8,9,10,29,28,29,30,11]
3290 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,21,20,21,22,11,8,9,10,29,28,29,30,11]
33513291 ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
33523292 ; AVX512VL-NEXT: retq
33533293 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
33733313 ;
33743314 ; AVX512VL-LABEL: shuffle_v16i16_00_17_02_03_20_21_22_15_08_25_10_11_28_29_30_15:
33753315 ; AVX512VL: # BB#0:
3376 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,17,2,3,20,21,22,15,8,25,10,11,28,29,30,15]
3316 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,17,2,3,20,21,22,15,8,25,10,11,28,29,30,15]
33773317 ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
33783318 ; AVX512VL-NEXT: retq
33793319 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
34063346 ;
34073347 ; AVX512VL-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25:
34083348 ; AVX512VL: # BB#0:
3409 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <u,u,u,1,u,5,7,25,u,u,u,9,u,13,15,25>
3349 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,1,u,5,7,25,u,u,u,9,u,13,15,25>
34103350 ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
34113351 ; AVX512VL-NEXT: retq
34123352 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
34373377 ;
34383378 ; AVX512VL-LABEL: shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_uu:
34393379 ; AVX512VL: # BB#0:
3440 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <u,u,20,u,0,2,4,u,u,u,28,u,8,10,12,u>
3380 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,20,u,0,2,4,u,u,u,28,u,8,10,12,u>
34413381 ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
34423382 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
34433383 ; AVX512VL-NEXT: retq
34683408 ;
34693409 ; AVX512VL-LABEL: shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_12:
34703410 ; AVX512VL: # BB#0:
3471 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [21,22,23,0,1,2,3,12,29,30,31,8,9,10,11,12]
3411 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [21,22,23,0,1,2,3,12,29,30,31,8,9,10,11,12]
34723412 ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
34733413 ; AVX512VL-NEXT: retq
34743414 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
35743514 ;
35753515 ; AVX512VL-LABEL: shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_10:
35763516 ; AVX512VL: # BB#0:
3577 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [3,4,5,6,7,16,17,26,11,12,13,14,15,24,25,26]
3517 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,4,5,6,7,16,17,26,11,12,13,14,15,24,25,26]
35783518 ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
35793519 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
35803520 ; AVX512VL-NEXT: retq
36813621 ;
36823622 ; AVX512VL-LABEL: shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_26:
36833623 ; AVX512VL: # BB#0:
3684 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [3,4,5,6,7,16,17,26,11,12,13,14,15,24,25,26]
3624 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,4,5,6,7,16,17,26,11,12,13,14,15,24,25,26]
36853625 ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
36863626 ; AVX512VL-NEXT: retq
36873627 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
37293669 ;
37303670 ; AVX512VL-LABEL: shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_28:
37313671 ; AVX512VL: # BB#0:
3732 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [21,22,23,0,1,2,3,12,29,30,31,8,9,10,11,12]
3672 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [21,22,23,0,1,2,3,12,29,30,31,8,9,10,11,12]
37333673 ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
37343674 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
37353675 ; AVX512VL-NEXT: retq
37773717 ;
37783718 ; AVX512VL-LABEL: shuffle_v16i16_23_uu_03_uu_20_20_05_uu_31_uu_11_uu_28_28_13_uu:
37793719 ; AVX512VL: # BB#0:
3780 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <7,u,19,u,4,4,21,u,15,u,27,u,12,12,29,u>
3720 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = <7,u,19,u,4,4,21,u,15,u,27,u,12,12,29,u>
37813721 ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
37823722 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
37833723 ; AVX512VL-NEXT: retq
38953835 ;
38963836 ; AVX512VL-LABEL: shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_25:
38973837 ; AVX512VL: # BB#0:
3898 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [2,18,3,19,0,16,1,17,10,26,11,27,8,24,9,25]
3838 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [2,18,3,19,0,16,1,17,10,26,11,27,8,24,9,25]
38993839 ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
39003840 ; AVX512VL-NEXT: retq
39013841 %1 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32>
39323872 ;
39333873 ; AVX512VL-LABEL: shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_25:
39343874 ; AVX512VL: # BB#0:
3935 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [2,18,3,19,0,16,1,17,10,26,11,27,8,24,9,25]
3875 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [2,18,3,19,0,16,1,17,10,26,11,27,8,24,9,25]
39363876 ; AVX512VL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
39373877 ; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,2,1,3]
39383878 ; AVX512VL-NEXT: retq
40223962 ;
40233963 ; AVX512VL-LABEL: PR24935:
40243964 ; AVX512VL: # BB#0:
4025 ; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [11,10,17,13,10,7,27,0,17,25,0,12,29,20,16,8]
3965 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [11,10,17,13,10,7,27,0,17,25,0,12,29,20,16,8]
40263966 ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
40273967 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
40283968 ; AVX512VL-NEXT: retq
5252 ;
5353 ; SKX-LABEL: shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f:
5454 ; SKX: ## BB#0:
55 ; SKX-NEXT: vmovdqu16 {{.*#+}} zmm1 = <2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1,2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,31>
55 ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = <2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1,2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,31>
5656 ; SKX-NEXT: vpermw %zmm0, %zmm1, %zmm0
5757 ; SKX-NEXT: retq
5858 %c = shufflevector <32 x i16> %a, <32 x i16> undef, <32 x i32>
8585 ;
8686 ; SKX-LABEL: shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38:
8787 ; SKX: ## BB#0:
88 ; SKX-NEXT: vmovdqu16 {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24,15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,56]
88 ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24,15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,56]
8989 ; SKX-NEXT: vpermt2w %zmm1, %zmm2, %zmm0
9090 ; SKX-NEXT: retq
9191 %c = shufflevector <32 x i16> %a, <32 x i16> %b, <32 x i32>
193193 ;
194194 ; AVX512VBMI-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00:
195195 ; AVX512VBMI: # BB#0:
196 ; AVX512VBMI-NEXT: vmovdqu8 {{.*#+}} zmm1 = [63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
196 ; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
197197 ; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
198198 ; AVX512VBMI-NEXT: retq
199199 %shuffle = shufflevector <64 x i8> %a, <64 x i8> undef, <64 x i32>
491491 ; AVX512VBMI-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz:
492492 ; AVX512VBMI: # BB#0:
493493 ; AVX512VBMI-NEXT: vpxord %zmm1, %zmm1, %zmm1
494 ; AVX512VBMI-NEXT: vmovdqu8 {{.*#+}} zmm2 = [63,65,61,67,59,69,57,71,55,73,53,75,51,77,49,79,47,81,45,83,43,85,41,87,39,89,37,91,35,93,33,95,31,97,29,99,27,101,25,103,23,105,21,107,19,109,17,111,15,113,13,115,11,117,9,119,7,121,5,123,3,125,1,127]
494 ; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm2 = [63,65,61,67,59,69,57,71,55,73,53,75,51,77,49,79,47,81,45,83,43,85,41,87,39,89,37,91,35,93,33,95,31,97,29,99,27,101,25,103,23,105,21,107,19,109,17,111,15,113,13,115,11,117,9,119,7,121,5,123,3,125,1,127]
495495 ; AVX512VBMI-NEXT: vpermt2b %zmm1, %zmm2, %zmm0
496496 ; AVX512VBMI-NEXT: retq
497497 %shuffle = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32>
548548 ;
549549 ; AVX512VBMI-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126:
550550 ; AVX512VBMI: # BB#0:
551 ; AVX512VBMI-NEXT: vmovdqu8 {{.*#+}} zmm2 = [63,64,61,66,59,68,57,70,55,72,53,74,51,76,49,78,47,80,45,82,43,84,41,86,39,88,37,90,35,92,33,94,31,96,29,98,27,100,25,102,23,104,21,106,19,108,17,110,15,112,13,114,11,116,9,118,7,120,5,122,3,124,1,126]
551 ; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm2 = [63,64,61,66,59,68,57,70,55,72,53,74,51,76,49,78,47,80,45,82,43,84,41,86,39,88,37,90,35,92,33,94,31,96,29,98,27,100,25,102,23,104,21,106,19,108,17,110,15,112,13,114,11,116,9,118,7,120,5,122,3,124,1,126]
552552 ; AVX512VBMI-NEXT: vpermt2b %zmm1, %zmm2, %zmm0
553553 ; AVX512VBMI-NEXT: retq
554554 %shuffle = shufflevector <64 x i8> %a, <64 x i8> %b, <64 x i32>
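For the 512-bit shuffles just above, no VEX form exists, so the index-vector load stays EVEX; the change is only that the unmasked constant load now uses the aligned, qword-granular mnemonic regardless of element type. Sketch (hypothetical, assuming AVX512BW):

define <32 x i16> @reverse_v32i16(<32 x i16> %a) {
; CHECK: vmovdqa64 {{.*}}(%rip), %zmm1
; CHECK: vpermw %zmm0, %zmm1, %zmm0
  %s = shufflevector <32 x i16> %a, <32 x i16> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  ret <32 x i16> %s
}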
546546 ; X32-LABEL: combine_vpermt2var_32i16_identity_mask:
547547 ; X32: # BB#0:
548548 ; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
549 ; X32-NEXT: vmovdqu16 {{.*#+}} zmm2 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
549 ; X32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
550550 ; X32-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 {%k1} {z}
551 ; X32-NEXT: vmovdqu16 {{.*#+}} zmm0 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32]
551 ; X32-NEXT: vmovdqa64 {{.*#+}} zmm0 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32]
552552 ; X32-NEXT: vpermi2w %zmm2, %zmm2, %zmm0 {%k1} {z}
553553 ; X32-NEXT: retl
554554 ;
555555 ; X64-LABEL: combine_vpermt2var_32i16_identity_mask:
556556 ; X64: # BB#0:
557557 ; X64-NEXT: kmovd %edi, %k1
558 ; X64-NEXT: vmovdqu16 {{.*#+}} zmm2 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
558 ; X64-NEXT: vmovdqa64 {{.*#+}} zmm2 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
559559 ; X64-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 {%k1} {z}
560 ; X64-NEXT: vmovdqu16 {{.*#+}} zmm0 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32]
560 ; X64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32]
561561 ; X64-NEXT: vpermi2w %zmm2, %zmm2, %zmm0 {%k1} {z}
562562 ; X64-NEXT: retq
563563 %res0 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> , <32 x i16> %x0, <32 x i16> %x1, i32 %m)
583583 ; X32-LABEL: combine_pshufb_identity_mask:
584584 ; X32: # BB#0:
585585 ; X32-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
586 ; X32-NEXT: vmovdqu8 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
586 ; X32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
587587 ; X32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
588588 ; X32-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3
589589 ; X32-NEXT: vpshufb %zmm2, %zmm0, %zmm3 {%k1}
594594 ; X64-LABEL: combine_pshufb_identity_mask:
595595 ; X64: # BB#0:
596596 ; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
597 ; X64-NEXT: vmovdqu8 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
597 ; X64-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
598598 ; X64-NEXT: kmovq %rdi, %k1
599599 ; X64-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3
600600 ; X64-NEXT: vpshufb %zmm2, %zmm0, %zmm3 {%k1}
10081008 define <32 x i16> @combine_vpermi2var_32i16_as_permw(<32 x i16> %x0, <32 x i16> %x1) {
10091009 ; X32-LABEL: combine_vpermi2var_32i16_as_permw:
10101010 ; X32: # BB#0:
1011 ; X32-NEXT: vmovdqu16 {{.*#+}} zmm1 = [15,16,14,17,13,18,12,19,11,20,10,21,9,22,8,23,7,24,6,25,5,26,4,27,3,28,2,29,1,30,0,31]
1011 ; X32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,16,14,17,13,18,12,19,11,20,10,21,9,22,8,23,7,24,6,25,5,26,4,27,3,28,2,29,1,30,0,31]
10121012 ; X32-NEXT: vpermw %zmm0, %zmm1, %zmm0
10131013 ; X32-NEXT: retl
10141014 ;
10151015 ; X64-LABEL: combine_vpermi2var_32i16_as_permw:
10161016 ; X64: # BB#0:
1017 ; X64-NEXT: vmovdqu16 {{.*#+}} zmm1 = [15,16,14,17,13,18,12,19,11,20,10,21,9,22,8,23,7,24,6,25,5,26,4,27,3,28,2,29,1,30,0,31]
1017 ; X64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,16,14,17,13,18,12,19,11,20,10,21,9,22,8,23,7,24,6,25,5,26,4,27,3,28,2,29,1,30,0,31]
10181018 ; X64-NEXT: vpermw %zmm0, %zmm1, %zmm0
10191019 ; X64-NEXT: retq
10201020 %res0 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> , <32 x i16> %x1, i32 -1)
10611061 define <32 x i16> @combine_vpermt2var_vpermi2var_32i16_as_permw(<32 x i16> %x0, <32 x i16> %x1) {
10621062 ; X32-LABEL: combine_vpermt2var_vpermi2var_32i16_as_permw:
10631063 ; X32: # BB#0:
1064 ; X32-NEXT: vmovdqu16 {{.*#+}} zmm2 = [17,39,19,38,21,37,23,36,25,35,27,34,29,33,31,32,1,47,3,46,5,45,7,44,9,43,11,42,13,41,15,40]
1064 ; X32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [17,39,19,38,21,37,23,36,25,35,27,34,29,33,31,32,1,47,3,46,5,45,7,44,9,43,11,42,13,41,15,40]
10651065 ; X32-NEXT: vpermi2w %zmm0, %zmm1, %zmm2
10661066 ; X32-NEXT: vmovdqa64 %zmm2, %zmm0
10671067 ; X32-NEXT: retl
10681068 ;
10691069 ; X64-LABEL: combine_vpermt2var_vpermi2var_32i16_as_permw:
10701070 ; X64: # BB#0:
1071 ; X64-NEXT: vmovdqu16 {{.*#+}} zmm2 = [17,39,19,38,21,37,23,36,25,35,27,34,29,33,31,32,1,47,3,46,5,45,7,44,9,43,11,42,13,41,15,40]
1071 ; X64-NEXT: vmovdqa64 {{.*#+}} zmm2 = [17,39,19,38,21,37,23,36,25,35,27,34,29,33,31,32,1,47,3,46,5,45,7,44,9,43,11,42,13,41,15,40]
10721072 ; X64-NEXT: vpermi2w %zmm0, %zmm1, %zmm2
10731073 ; X64-NEXT: vmovdqa64 %zmm2, %zmm0
10741074 ; X64-NEXT: retq
2121 ; X32-LABEL: combine_vpermt2var_16i16_identity_mask:
2222 ; X32: # BB#0:
2323 ; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
24 ; X32-NEXT: vmovdqu {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
24 ; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
2525 ; X32-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 {%k1} {z}
26 ; X32-NEXT: vmovdqu {{.*#+}} ymm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
26 ; X32-NEXT: vmovdqa {{.*#+}} ymm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
2727 ; X32-NEXT: vpermi2w %ymm2, %ymm2, %ymm0 {%k1} {z}
2828 ; X32-NEXT: retl
2929 ;
3030 ; X64-LABEL: combine_vpermt2var_16i16_identity_mask:
3131 ; X64: # BB#0:
3232 ; X64-NEXT: kmovd %edi, %k1
33 ; X64-NEXT: vmovdqu {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
33 ; X64-NEXT: vmovdqa {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
3434 ; X64-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 {%k1} {z}
35 ; X64-NEXT: vmovdqu {{.*#+}} ymm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
35 ; X64-NEXT: vmovdqa {{.*#+}} ymm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
3636 ; X64-NEXT: vpermi2w %ymm2, %ymm2, %ymm0 {%k1} {z}
3737 ; X64-NEXT: retq
3838 %res0 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> <i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <16 x i16> %x0, <16 x i16> %x1, i16 %m)
4343 define <16 x i16> @combine_vpermi2var_16i16_as_permw(<16 x i16> %x0, <16 x i16> %x1) {
4444 ; X32-LABEL: combine_vpermi2var_16i16_as_permw:
4545 ; X32: # BB#0:
46 ; X32-NEXT: vmovdqu {{.*#+}} ymm1 = [15,0,14,1,13,2,12,3,11,4,10,5,9,6,8,7]
46 ; X32-NEXT: vmovdqa {{.*#+}} ymm1 = [15,0,14,1,13,2,12,3,11,4,10,5,9,6,8,7]
4747 ; X32-NEXT: vpermw %ymm0, %ymm1, %ymm0
4848 ; X32-NEXT: retl
4949 ;
5050 ; X64-LABEL: combine_vpermi2var_16i16_as_permw:
5151 ; X64: # BB#0:
52 ; X64-NEXT: vmovdqu {{.*#+}} ymm1 = [15,0,14,1,13,2,12,3,11,4,10,5,9,6,8,7]
52 ; X64-NEXT: vmovdqa {{.*#+}} ymm1 = [15,0,14,1,13,2,12,3,11,4,10,5,9,6,8,7]
5353 ; X64-NEXT: vpermw %ymm0, %ymm1, %ymm0
5454 ; X64-NEXT: retq
5555 %res0 = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> , <16 x i16> %x1, i16 -1)
6060 define <16 x i16> @combine_vpermt2var_vpermi2var_16i16_as_vperm2(<16 x i16> %x0, <16 x i16> %x1) {
6161 ; X32-LABEL: combine_vpermt2var_vpermi2var_16i16_as_vperm2:
6262 ; X32: # BB#0:
63 ; X32-NEXT: vmovdqu {{.*#+}} ymm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19]
63 ; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19]
6464 ; X32-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
6565 ; X32-NEXT: retl
6666 ;
6767 ; X64-LABEL: combine_vpermt2var_vpermi2var_16i16_as_vperm2:
6868 ; X64: # BB#0:
69 ; X64-NEXT: vmovdqu {{.*#+}} ymm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19]
69 ; X64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19]
7070 ; X64-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
7171 ; X64-NEXT: retq
7272 %res0 = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> , <16 x i16> %x1, i16 -1)
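The 128-bit and 256-bit hunks above show the second effect mentioned in the commit message: with AVX512VL the unmasked aligned load now carries its alignment through to the EVEX-to-VEX pass, which compresses it to the plain AVX/AVX2 encoding. A minimal sketch (hypothetical name, not from this diff), built with llc -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl:

; A 32-byte-aligned <16 x i16> load; previously printed as vmovdqu,
; now expected as the aligned VEX form.
define <16 x i16> @load_v16i16_aligned(<16 x i16>* %p) {
  %v = load <16 x i16>, <16 x i16>* %p, align 32
  ret <16 x i16> %v
}
; expected output: vmovdqa (%rdi), %ymm0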
3636 ; X32-LABEL: combine_vpermt2var_16i8_identity_mask:
3737 ; X32: # BB#0:
3838 ; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
39 ; X32-NEXT: vmovdqu {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
39 ; X32-NEXT: vmovdqa {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
4040 ; X32-NEXT: vpermi2b %xmm1, %xmm0, %xmm2 {%k1} {z}
41 ; X32-NEXT: vmovdqu {{.*#+}} xmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
41 ; X32-NEXT: vmovdqa {{.*#+}} xmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
4242 ; X32-NEXT: vpermi2b %xmm2, %xmm2, %xmm0 {%k1} {z}
4343 ; X32-NEXT: retl
4444 ;
4545 ; X64-LABEL: combine_vpermt2var_16i8_identity_mask:
4646 ; X64: # BB#0:
4747 ; X64-NEXT: kmovd %edi, %k1
48 ; X64-NEXT: vmovdqu {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
48 ; X64-NEXT: vmovdqa {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
4949 ; X64-NEXT: vpermi2b %xmm1, %xmm0, %xmm2 {%k1} {z}
50 ; X64-NEXT: vmovdqu {{.*#+}} xmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
50 ; X64-NEXT: vmovdqa {{.*#+}} xmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
5151 ; X64-NEXT: vpermi2b %xmm2, %xmm2, %xmm0 {%k1} {z}
5252 ; X64-NEXT: retq
5353 %res0 = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <16 x i8> %x0, <16 x i8> %x1, i16 %m)
7272 define <32 x i8> @combine_vpermi2var_32i8_as_vpermb(<32 x i8> %x0, <32 x i8> %x1) {
7373 ; X32-LABEL: combine_vpermi2var_32i8_as_vpermb:
7474 ; X32: # BB#0:
75 ; X32-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
75 ; X32-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
7676 ; X32-NEXT: vpermb %ymm0, %ymm1, %ymm0
7777 ; X32-NEXT: retl
7878 ;
7979 ; X64-LABEL: combine_vpermi2var_32i8_as_vpermb:
8080 ; X64: # BB#0:
81 ; X64-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
81 ; X64-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
8282 ; X64-NEXT: vpermb %ymm0, %ymm1, %ymm0
8383 ; X64-NEXT: retq
8484 %res0 = shufflevector <32 x i8> %x0, <32 x i8> %x1, <32 x i32>
8888 define <64 x i8> @combine_vpermi2var_64i8_as_vpermb(<64 x i8> %x0, <64 x i8> %x1) {
8989 ; X32-LABEL: combine_vpermi2var_64i8_as_vpermb:
9090 ; X32: # BB#0:
91 ; X32-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
91 ; X32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
9292 ; X32-NEXT: vpermb %zmm0, %zmm1, %zmm0
9393 ; X32-NEXT: retl
9494 ;
9595 ; X64-LABEL: combine_vpermi2var_64i8_as_vpermb:
9696 ; X64: # BB#0:
97 ; X64-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
97 ; X64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
9898 ; X64-NEXT: vpermb %zmm0, %zmm1, %zmm0
9999 ; X64-NEXT: retq
100100 %res0 = shufflevector <64 x i8> %x0, <64 x i8> %x1, <64 x i32>
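Only unmasked loads are rewritten in this patch; a masked byte load must keep vmovdqu8, because masking is element-size sensitive (byte-granular mask and fault suppression have no vmovdqa64 equivalent). A sketch under that assumption (hypothetical function, era-appropriate typed-pointer IR, -mattr=+avx512bw):

; The mask makes vmovdqu8 semantically different from vmovdqa64, so this
; would still be expected to select the masked vmovdqu8 form.
define <64 x i8> @load_v64i8_masked(<64 x i8>* %p, <64 x i8> %src, <64 x i8> %x) {
  %m = icmp eq <64 x i8> %x, zeroinitializer
  %v = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* %p, i32 64, <64 x i1> %m, <64 x i8> %src)
  ret <64 x i8> %v
}
declare <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>*, i32, <64 x i1>, <64 x i8>)
; expected output: vptestnmb %zmm1, %zmm1, %k1
;                  vmovdqu8 (%rdi), %zmm0 {%k1}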
105105 define <16 x i8> @combine_vpermt2var_vpermi2var_16i8_as_vperm2(<16 x i8> %x0, <16 x i8> %x1) {
106106 ; X32-LABEL: combine_vpermt2var_vpermi2var_16i8_as_vperm2:
107107 ; X32: # BB#0:
108 ; X32-NEXT: vmovdqu {{.*#+}} xmm2 = [0,31,2,29,4,27,6,25,8,23,10,21,12,19,14,17]
108 ; X32-NEXT: vmovdqa {{.*#+}} xmm2 = [0,31,2,29,4,27,6,25,8,23,10,21,12,19,14,17]
109109 ; X32-NEXT: vpermi2b %xmm1, %xmm0, %xmm2
110 ; X32-NEXT: vmovdqu {{.*#+}} xmm0 = [0,17,2,18,4,19,6,21,8,23,10,25,12,27,14,29]
110 ; X32-NEXT: vmovdqa {{.*#+}} xmm0 = [0,17,2,18,4,19,6,21,8,23,10,25,12,27,14,29]
111111 ; X32-NEXT: vpermi2b %xmm2, %xmm2, %xmm0
112112 ; X32-NEXT: retl
113113 ;
114114 ; X64-LABEL: combine_vpermt2var_vpermi2var_16i8_as_vperm2:
115115 ; X64: # BB#0:
116 ; X64-NEXT: vmovdqu {{.*#+}} xmm2 = [0,31,2,29,4,27,6,25,8,23,10,21,12,19,14,17]
116 ; X64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,31,2,29,4,27,6,25,8,23,10,21,12,19,14,17]
117117 ; X64-NEXT: vpermi2b %xmm1, %xmm0, %xmm2
118 ; X64-NEXT: vmovdqu {{.*#+}} xmm0 = [0,17,2,18,4,19,6,21,8,23,10,25,12,27,14,29]
118 ; X64-NEXT: vmovdqa {{.*#+}} xmm0 = [0,17,2,18,4,19,6,21,8,23,10,25,12,27,14,29]
119119 ; X64-NEXT: vpermi2b %xmm2, %xmm2, %xmm0
120120 ; X64-NEXT: retq
121121 %res0 = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> , <16 x i8> %x1, i16 -1)
125125 define <32 x i8> @combine_vpermi2var_32i8_as_vperm2(<32 x i8> %x0, <32 x i8> %x1) {
126126 ; X32-LABEL: combine_vpermi2var_32i8_as_vperm2:
127127 ; X32: # BB#0:
128 ; X32-NEXT: vmovdqu {{.*#+}} ymm2 = [0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
128 ; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
129129 ; X32-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
130130 ; X32-NEXT: retl
131131 ;
132132 ; X64-LABEL: combine_vpermi2var_32i8_as_vperm2:
133133 ; X64: # BB#0:
134 ; X64-NEXT: vmovdqu {{.*#+}} ymm2 = [0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
134 ; X64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
135135 ; X64-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
136136 ; X64-NEXT: retq
137137 %res0 = shufflevector <32 x i8> %x0, <32 x i8> %x1, <32 x i32>
141141 define <64 x i8> @combine_vpermi2var_64i8_as_vperm2(<64 x i8> %x0, <64 x i8> %x1) {
142142 ; X32-LABEL: combine_vpermi2var_64i8_as_vperm2:
143143 ; X32: # BB#0:
144 ; X32-NEXT: vmovdqu8 {{.*#+}} zmm2 = [0,80,1,70,2,54,3,49,4,36,5,23,6,18,7,5,0,90,1,100,2,110,3,120,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
144 ; X32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,80,1,70,2,54,3,49,4,36,5,23,6,18,7,5,0,90,1,100,2,110,3,120,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
145145 ; X32-NEXT: vpermt2b %zmm1, %zmm2, %zmm0
146146 ; X32-NEXT: retl
147147 ;
148148 ; X64-LABEL: combine_vpermi2var_64i8_as_vperm2:
149149 ; X64: # BB#0:
150 ; X64-NEXT: vmovdqu8 {{.*#+}} zmm2 = [0,80,1,70,2,54,3,49,4,36,5,23,6,18,7,5,0,90,1,100,2,110,3,120,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
150 ; X64-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,80,1,70,2,54,3,49,4,36,5,23,6,18,7,5,0,90,1,100,2,110,3,120,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
151151 ; X64-NEXT: vpermt2b %zmm1, %zmm2, %zmm0
152152 ; X64-NEXT: retq
153153 %res0 = shufflevector <64 x i8> %x0, <64 x i8> %x1, <64 x i32>
143143 ; VL_BW_DQ-NEXT: vpsllw $7, %ymm0, %ymm0
144144 ; VL_BW_DQ-NEXT: vpmovb2m %ymm0, %k0
145145 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0
146 ; VL_BW_DQ-NEXT: vmovdqu16 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
146 ; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
147147 ; VL_BW_DQ-NEXT: vpermw %zmm0, %zmm1, %zmm0
148148 ; VL_BW_DQ-NEXT: vpmovw2m %zmm0, %k0
149149 ; VL_BW_DQ-NEXT: vpmovm2b %k0, %ymm0
11131113 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
11141114 ; AVX-NEXT: retq
11151115 ;
1116 ; AVX512F-LABEL: trunc2x8i16_16i8:
1117 ; AVX512F: # BB#0: # %entry
1118 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1119 ; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1120 ; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1121 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1122 ; AVX512F-NEXT: retq
1123 ;
1124 ; AVX512VL-LABEL: trunc2x8i16_16i8:
1125 ; AVX512VL: # BB#0: # %entry
1126 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1127 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1128 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1129 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1130 ; AVX512VL-NEXT: retq
1131 ;
1132 ; AVX512BW-LABEL: trunc2x8i16_16i8:
1133 ; AVX512BW: # BB#0: # %entry
1134 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1135 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1136 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1137 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1138 ; AVX512BW-NEXT: retq
1139 ;
1140 ; AVX512BWVL-LABEL: trunc2x8i16_16i8:
1141 ; AVX512BWVL: # BB#0: # %entry
1142 ; AVX512BWVL-NEXT: vmovdqu {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1143 ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1144 ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1145 ; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1146 ; AVX512BWVL-NEXT: retq
1116 ; AVX512-LABEL: trunc2x8i16_16i8:
1117 ; AVX512: # BB#0: # %entry
1118 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
1119 ; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1120 ; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1121 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1122 ; AVX512-NEXT: retq
11471123 entry:
11481124 %0 = trunc <8 x i16> %a to <8 x i8>
11491125 %1 = trunc <8 x i16> %b to <8 x i8>
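This trunc2x8i16_16i8 hunk differs from the others: the AVX512BWVL output now matches the other AVX512 configurations exactly, so the four per-subtarget CHECK blocks collapse into one shared AVX512 block. A sketch of the mechanism (RUN lines abbreviated and assumed, not copied from this diff): when utils/update_llc_test_checks.py sees identical output for several prefixes, it emits a single block under their common prefix:

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512BWVL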
4141 ; AVX512CDBW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
4242 ; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
4343 ; AVX512CDBW-NEXT: vpandq %zmm2, %zmm0, %zmm3
44 ; AVX512CDBW-NEXT: vmovdqu8 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
44 ; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
4545 ; AVX512CDBW-NEXT: vpshufb %zmm3, %zmm4, %zmm3
4646 ; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0
4747 ; AVX512CDBW-NEXT: vpandq %zmm2, %zmm0, %zmm0
5959 ; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
6060 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
6161 ; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm3
62 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
62 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
6363 ; AVX512BW-NEXT: vpshufb %zmm3, %zmm4, %zmm3
6464 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
6565 ; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
111111 ; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
112112 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
113113 ; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm3
114 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
114 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
115115 ; AVX512BW-NEXT: vpshufb %zmm3, %zmm4, %zmm3
116116 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
117117 ; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
179179 ; AVX512CDBW-NEXT: vpaddd %zmm2, %zmm0, %zmm0
180180 ; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
181181 ; AVX512CDBW-NEXT: vpandq %zmm2, %zmm0, %zmm3
182 ; AVX512CDBW-NEXT: vmovdqu8 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
182 ; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
183183 ; AVX512CDBW-NEXT: vpshufb %zmm3, %zmm4, %zmm3
184184 ; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0
185185 ; AVX512CDBW-NEXT: vpandq %zmm2, %zmm0, %zmm0
201201 ; AVX512BW-NEXT: vpaddd %zmm2, %zmm0, %zmm0
202202 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
203203 ; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm3
204 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
204 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
205205 ; AVX512BW-NEXT: vpshufb %zmm3, %zmm4, %zmm3
206206 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
207207 ; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
257257 ; AVX512BW-NEXT: vpaddd %zmm2, %zmm0, %zmm0
258258 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
259259 ; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm3
260 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
260 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
261261 ; AVX512BW-NEXT: vpshufb %zmm3, %zmm4, %zmm3
262262 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
263263 ; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
325325 ; AVX512CDBW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
326326 ; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
327327 ; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2
328 ; AVX512CDBW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
328 ; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
329329 ; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
330330 ; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0
331331 ; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
345345 ; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
346346 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
347347 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
348 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
348 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
349349 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
350350 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
351351 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
419419 ; AVX512CDBW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
420420 ; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
421421 ; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2
422 ; AVX512CDBW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
422 ; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
423423 ; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
424424 ; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0
425425 ; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
439439 ; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
440440 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
441441 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
442 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
442 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
443443 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
444444 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
445445 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
507507 ; AVX512CDBW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
508508 ; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
509509 ; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2
510 ; AVX512CDBW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
510 ; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
511511 ; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
512512 ; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0
513513 ; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
524524 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
525525 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
526526 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
527 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
527 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
528528 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
529529 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
530530 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
597597 ; AVX512CDBW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
598598 ; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
599599 ; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2
600 ; AVX512CDBW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
600 ; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
601601 ; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
602602 ; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0
603603 ; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
614614 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
615615 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
616616 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
617 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
617 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
618618 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
619619 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
620620 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
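These final hunks come from the vector bit-count tests; the vpshufb-LUT lowering they exercise is shared by ctpop and cttz, and the per-nibble count table [0,1,1,2,...] is an unmasked, 64-byte-aligned constant-pool load, which is exactly the load that switches from vmovdqu8 to vmovdqa64. A minimal sketch that reaches this lowering (hypothetical name, assuming -mattr=+avx512bw):

; The lowering masks out each nibble, looks its popcount up in the 64-byte
; LUT via vpshufb, and sums the halves; the LUT load is the vmovdqa64 above.
define <64 x i8> @popcnt_v64i8(<64 x i8> %a) {
  %r = call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %a)
  ret <64 x i8> %r
}
declare <64 x i8> @llvm.ctpop.v64i8(<64 x i8>)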