llvm.org GIT mirror: llvm / 201727a
[AVX-512] Teach EVEX to VEX conversion pass to handle VINSERT and VEXTRACT instructions.
Author: Craig Topper
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@290869 91177308-0d34-0410-b5e6-96231b3b80d8
21 changed files, 613 additions, 1200 deletions.
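The first hunk below adds the new VINSERT/VEXTRACT entries to the pass's EVEX-to-VEX opcode table: each 256-bit AVX-512VL opcode (e.g. X86::VINSERTI32x4Z256rr) is paired with the equivalent VEX-encoded AVX/AVX2 opcode (X86::VINSERTI128rr). The 32x4 and 64x2 element-width variants map to the same VEX instruction because the element width only matters for masking, which the VEX forms cannot express. The remaining hunks are test updates: unmasked vinserti32x4, vextractf64x2 and friends are now emitted as vinserti128/vextractf128, and the "## EVEX TO VEX Compression" comments show the shorter byte sequences. As a rough illustration of how such a table is consumed, here is a minimal, self-contained sketch of a sorted-table lookup; the type, function, and opcode names are hypothetical stand-ins for illustration only, not the pass's actual API.

  // Minimal sketch (not the in-tree pass): pair each EVEX-encoded AVX-512VL
  // opcode with its VEX equivalent and swap the opcode when the shorter
  // encoding is legal (no mask, no xmm16-xmm31/ymm16-ymm31 operands).
  #include <algorithm>
  #include <cstdint>
  #include <iterator>

  struct EvexToVexTableEntry {
    uint16_t EvexOpcode;
    uint16_t VexOpcode;
  };

  // Hypothetical numeric stand-ins for X86::VEXTRACTF32x4Z256rr and friends;
  // the real pass uses the generated X86 instruction enum.
  enum : uint16_t {
    EVEX_VEXTRACTF32x4Z256rr = 0x1000,
    EVEX_VINSERTI32x4Z256rr  = 0x1001,
    VEX_VEXTRACTF128rr       = 0x2000,
    VEX_VINSERTI128rr        = 0x2001,
  };

  // Kept sorted by EvexOpcode so it can be binary-searched.
  static const EvexToVexTableEntry OpcTable256[] = {
      {EVEX_VEXTRACTF32x4Z256rr, VEX_VEXTRACTF128rr},
      {EVEX_VINSERTI32x4Z256rr,  VEX_VINSERTI128rr},
  };

  // Returns the replacement VEX opcode, or 0 if the instruction must stay EVEX.
  static unsigned lookupVexOpcode(unsigned EvexOpc) {
    const auto *It = std::lower_bound(
        std::begin(OpcTable256), std::end(OpcTable256), EvexOpc,
        [](const EvexToVexTableEntry &E, unsigned Opc) {
          return E.EvexOpcode < Opc;
        });
    if (It != std::end(OpcTable256) && It->EvexOpcode == EvexOpc)
      return It->VexOpcode;
    return 0; // not compressible
  }

The in-tree table covers far more opcodes than the slice visible in the first hunk; the sketch only mirrors its shape.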
754754 { X86::VDIVPDZ256rr , X86::VDIVPDYrr },
755755 { X86::VDIVPSZ256rm , X86::VDIVPSYrm },
756756 { X86::VDIVPSZ256rr , X86::VDIVPSYrr },
757 { X86::VEXTRACTF32x4Z256mr , X86::VEXTRACTF128mr },
758 { X86::VEXTRACTF64x2Z256mr , X86::VEXTRACTF128mr },
759 { X86::VEXTRACTF32x4Z256rr , X86::VEXTRACTF128rr },
760 { X86::VEXTRACTF64x2Z256rr , X86::VEXTRACTF128rr },
761 { X86::VEXTRACTI32x4Z256mr , X86::VEXTRACTI128mr },
762 { X86::VEXTRACTI64x2Z256mr , X86::VEXTRACTI128mr },
763 { X86::VEXTRACTI32x4Z256rr , X86::VEXTRACTI128rr },
764 { X86::VEXTRACTI64x2Z256rr , X86::VEXTRACTI128rr },
757765 { X86::VFMADD132PDZ256m , X86::VFMADD132PDYm },
758766 { X86::VFMADD132PDZ256r , X86::VFMADD132PDYr },
759767 { X86::VFMADD132PSZ256m , X86::VFMADD132PSYm },
826834 { X86::VFNMSUB231PDZ256r , X86::VFNMSUB231PDYr },
827835 { X86::VFNMSUB231PSZ256m , X86::VFNMSUB231PSYm },
828836 { X86::VFNMSUB231PSZ256r , X86::VFNMSUB231PSYr },
837 { X86::VINSERTF32x4Z256rm , X86::VINSERTF128rm },
838 { X86::VINSERTF64x2Z256rm , X86::VINSERTF128rm },
839 { X86::VINSERTF32x4Z256rr , X86::VINSERTF128rr },
840 { X86::VINSERTF64x2Z256rr , X86::VINSERTF128rr },
841 { X86::VINSERTI32x4Z256rm , X86::VINSERTI128rm },
842 { X86::VINSERTI64x2Z256rm , X86::VINSERTI128rm },
843 { X86::VINSERTI32x4Z256rr , X86::VINSERTI128rr },
844 { X86::VINSERTI64x2Z256rr , X86::VINSERTI128rr },
829845 { X86::VMAXCPDZ256rm , X86::VMAXCPDYrm },
830846 { X86::VMAXCPDZ256rr , X86::VMAXCPDYrr },
831847 { X86::VMAXCPSZ256rm , X86::VMAXCPSYrm },
208208 }
209209
210210 define <4 x i64> @QQ64(i64* %ptr) nounwind uwtable readnone ssp {
211 ; X32-AVX2-LABEL: QQ64:
212 ; X32-AVX2: ## BB#0: ## %entry
213 ; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
214 ; X32-AVX2-NEXT: movl (%eax), %ecx
215 ; X32-AVX2-NEXT: movl 4(%eax), %eax
216 ; X32-AVX2-NEXT: vmovd %ecx, %xmm0
217 ; X32-AVX2-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
218 ; X32-AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
219 ; X32-AVX2-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
220 ; X32-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
221 ; X32-AVX2-NEXT: retl
211 ; X32-LABEL: QQ64:
212 ; X32: ## BB#0: ## %entry
213 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
214 ; X32-NEXT: movl (%eax), %ecx
215 ; X32-NEXT: movl 4(%eax), %eax
216 ; X32-NEXT: vmovd %ecx, %xmm0
217 ; X32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
218 ; X32-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
219 ; X32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
220 ; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
221 ; X32-NEXT: retl
222222 ;
223223 ; X64-LABEL: QQ64:
224224 ; X64: ## BB#0: ## %entry
225225 ; X64-NEXT: vbroadcastsd (%rdi), %ymm0
226226 ; X64-NEXT: retq
227 ;
228 ; X32-AVX512VL-LABEL: QQ64:
229 ; X32-AVX512VL: ## BB#0: ## %entry
230 ; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
231 ; X32-AVX512VL-NEXT: movl (%eax), %ecx
232 ; X32-AVX512VL-NEXT: movl 4(%eax), %eax
233 ; X32-AVX512VL-NEXT: vmovd %ecx, %xmm0
234 ; X32-AVX512VL-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
235 ; X32-AVX512VL-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
236 ; X32-AVX512VL-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
237 ; X32-AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
238 ; X32-AVX512VL-NEXT: retl
239227 entry:
240228 %q = load i64, i64* %ptr, align 4
241229 %q0 = insertelement <4 x i64> undef, i64 %q, i32 0
16821670 ; X32-AVX512VL-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
16831671 ; X32-AVX512VL-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
16841672 ; X32-AVX512VL-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
1685 ; X32-AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm1, %ymm1
1673 ; X32-AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1
16861674 ; X32-AVX512VL-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp)
16871675 ; X32-AVX512VL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%esp)
16881676 ; X32-AVX512VL-NEXT: movl %ebp, %esp
5959 define void @extract_subvector256_v8f64_store(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp {
6060 ; SKX-LABEL: extract_subvector256_v8f64_store:
6161 ; SKX: ## BB#0: ## %entry
62 ; SKX-NEXT: vextractf64x2 $1, %ymm0, (%rdi)
62 ; SKX-NEXT: vextractf128 $1, %ymm0, (%rdi)
6363 ; SKX-NEXT: retq
6464 entry:
6565 %0 = shufflevector <4 x double> %a, <4 x double> undef, <2 x i32>
7171 define void @extract_subvector256_v8f32_store(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp {
7272 ; SKX-LABEL: extract_subvector256_v8f32_store:
7373 ; SKX: ## BB#0: ## %entry
74 ; SKX-NEXT: vextractf32x4 $1, %ymm0, (%rdi)
74 ; SKX-NEXT: vextractf128 $1, %ymm0, (%rdi)
7575 ; SKX-NEXT: retq
7676 entry:
7777 %0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32>
8383 define void @extract_subvector256_v4i64_store(i64* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
8484 ; SKX-LABEL: extract_subvector256_v4i64_store:
8585 ; SKX: ## BB#0: ## %entry
86 ; SKX-NEXT: vextracti64x2 $1, %ymm0, (%rdi)
86 ; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi)
8787 ; SKX-NEXT: retq
8888 entry:
8989 %0 = shufflevector <4 x i64> %a, <4 x i64> undef, <2 x i32>
9595 define void @extract_subvector256_v8i32_store(i32* nocapture %addr, <8 x i32> %a) nounwind uwtable ssp {
9696 ; SKX-LABEL: extract_subvector256_v8i32_store:
9797 ; SKX: ## BB#0: ## %entry
98 ; SKX-NEXT: vextracti32x4 $1, %ymm0, (%rdi)
98 ; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi)
9999 ; SKX-NEXT: retq
100100 entry:
101101 %0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32>
107107 define void @extract_subvector256_v16i16_store(i16* nocapture %addr, <16 x i16> %a) nounwind uwtable ssp {
108108 ; SKX-LABEL: extract_subvector256_v16i16_store:
109109 ; SKX: ## BB#0: ## %entry
110 ; SKX-NEXT: vextracti32x4 $1, %ymm0, (%rdi)
110 ; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi)
111111 ; SKX-NEXT: retq
112112 entry:
113113 %0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32>
119119 define void @extract_subvector256_v32i8_store(i8* nocapture %addr, <32 x i8> %a) nounwind uwtable ssp {
120120 ; SKX-LABEL: extract_subvector256_v32i8_store:
121121 ; SKX: ## BB#0: ## %entry
122 ; SKX-NEXT: vextracti32x4 $1, %ymm0, (%rdi)
122 ; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi)
123123 ; SKX-NEXT: retq
124124 entry:
125125 %0 = shufflevector <32 x i8> %a, <32 x i8> undef, <16 x i32>
462462 ; SKX-LABEL: extract_v4i64:
463463 ; SKX: ## BB#0:
464464 ; SKX-NEXT: vpextrq $1, %xmm0, %rax
465 ; SKX-NEXT: vextracti64x2 $1, %ymm0, %xmm0
465 ; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
466466 ; SKX-NEXT: vpextrq $1, %xmm0, (%rdi)
467467 ; SKX-NEXT: retq
468468 %r1 = extractelement <4 x i64> %x, i32 1
520520 ; SKX-LABEL: extract_v8i32:
521521 ; SKX: ## BB#0:
522522 ; SKX-NEXT: vpextrd $1, %xmm0, %eax
523 ; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm0
523 ; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
524524 ; SKX-NEXT: vpextrd $1, %xmm0, (%rdi)
525525 ; SKX-NEXT: retq
526526 %r1 = extractelement <8 x i32> %x, i32 1
581581 ; SKX-LABEL: extract_v16i16:
582582 ; SKX: ## BB#0:
583583 ; SKX-NEXT: vpextrw $1, %xmm0, %eax
584 ; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm0
584 ; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
585585 ; SKX-NEXT: vpextrw $1, %xmm0, (%rdi)
586586 ; SKX-NEXT: ## kill: %AX %AX %EAX
587587 ; SKX-NEXT: retq
645645 ; SKX-LABEL: extract_v32i8:
646646 ; SKX: ## BB#0:
647647 ; SKX-NEXT: vpextrb $1, %xmm0, %eax
648 ; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm0
648 ; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
649649 ; SKX-NEXT: vpextrb $1, %xmm0, (%rdi)
650650 ; SKX-NEXT: ## kill: %AL %AL %EAX
651651 ; SKX-NEXT: retq
713713 ; SKX: ## BB#0:
714714 ; SKX-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1
715715 ; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
716 ; SKX-NEXT: vextracti64x2 $1, %ymm0, %xmm1
716 ; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
717717 ; SKX-NEXT: vpinsrq $1, %rdi, %xmm1, %xmm1
718 ; SKX-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0
718 ; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
719719 ; SKX-NEXT: retq
720720 %val = load i64, i64* %ptr
721721 %r1 = insertelement <4 x i64> %x, i64 %val, i32 1
779779 ; SKX: ## BB#0:
780780 ; SKX-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1
781781 ; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
782 ; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1
782 ; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
783783 ; SKX-NEXT: vpinsrd $1, %edi, %xmm1, %xmm1
784 ; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
784 ; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
785785 ; SKX-NEXT: retq
786786 %val = load i32, i32* %ptr
787787 %r1 = insertelement <8 x i32> %x, i32 %val, i32 1
845845 ; SKX: ## BB#0:
846846 ; SKX-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1
847847 ; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
848 ; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1
848 ; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
849849 ; SKX-NEXT: vpinsrw $1, %edi, %xmm1, %xmm1
850 ; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
850 ; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
851851 ; SKX-NEXT: retq
852852 %val = load i16, i16* %ptr
853853 %r1 = insertelement <16 x i16> %x, i16 %val, i32 1
911911 ; SKX: ## BB#0:
912912 ; SKX-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm1
913913 ; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
914 ; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1
914 ; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
915915 ; SKX-NEXT: vpinsrb $1, %edi, %xmm1, %xmm1
916 ; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
916 ; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
917917 ; SKX-NEXT: retq
918918 %val = load i8, i8* %ptr
919919 %r1 = insertelement <32 x i8> %x, i8 %val, i32 1
10131013 ;
10141014 ; SKX-LABEL: test_insert_128_v16i16:
10151015 ; SKX: ## BB#0:
1016 ; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1
1016 ; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
10171017 ; SKX-NEXT: vpinsrw $2, %edi, %xmm1, %xmm1
1018 ; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
1018 ; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
10191019 ; SKX-NEXT: retq
10201020 %r = insertelement <16 x i16> %x, i16 %y, i32 10
10211021 ret <16 x i16> %r
10311031 ;
10321032 ; SKX-LABEL: test_insert_128_v32i8:
10331033 ; SKX: ## BB#0:
1034 ; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1
1034 ; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
10351035 ; SKX-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1
1036 ; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
1036 ; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
10371037 ; SKX-NEXT: retq
10381038 %r = insertelement <32 x i8> %x, i8 %y, i32 20
10391039 ret <32 x i8> %r
236236 ; X64-AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
237237 ; X64-AVX512VL-NEXT: vpxor %ymm1, %ymm1, %ymm1
238238 ; X64-AVX512VL-NEXT: vmovdqa %ymm1, (%rsi)
239 ; X64-AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
239 ; X64-AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
240240 ; X64-AVX512VL-NEXT: retq
241241 ;
242242 ; X64-AVX512BWVL-LABEL: PR29088:
244244 ; X64-AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
245245 ; X64-AVX512BWVL-NEXT: vpxor %ymm1, %ymm1, %ymm1
246246 ; X64-AVX512BWVL-NEXT: vmovdqa %ymm1, (%rsi)
247 ; X64-AVX512BWVL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
247 ; X64-AVX512BWVL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
248248 ; X64-AVX512BWVL-NEXT: retq
249249 ;
250250 ; X64-AVX512DQVL-LABEL: PR29088:
252252 ; X64-AVX512DQVL-NEXT: vmovdqa (%rdi), %xmm0
253253 ; X64-AVX512DQVL-NEXT: vxorps %ymm1, %ymm1, %ymm1
254254 ; X64-AVX512DQVL-NEXT: vmovaps %ymm1, (%rsi)
255 ; X64-AVX512DQVL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
255 ; X64-AVX512DQVL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
256256 ; X64-AVX512DQVL-NEXT: retq
257257 %ld = load <4 x i32>, <4 x i32>* %p0
258258 store <8 x float> zeroinitializer, <8 x float>* %p1
2929 ; CHECK-NEXT: vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01]
3030 ; CHECK-NEXT: vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02]
3131 ; CHECK-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03]
32 ; CHECK-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0x75,0x28,0x38,0xc0,0x01]
32 ; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01]
3333 ; CHECK-NEXT: retq ## encoding: [0xc3]
3434 %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1)
3535 %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
7878 ; CHECK-NEXT: vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01]
7979 ; CHECK-NEXT: vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02]
8080 ; CHECK-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03]
81 ; CHECK-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0x75,0x28,0x38,0xc0,0x01]
81 ; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01]
8282 ; CHECK-NEXT: retq ## encoding: [0xc3]
8383 %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
8484 %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
128128 ; CHECK-NEXT: vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01]
129129 ; CHECK-NEXT: vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02]
130130 ; CHECK-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03]
131 ; CHECK-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0x75,0x28,0x38,0xc0,0x01]
131 ; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01]
132132 ; CHECK-NEXT: retq ## encoding: [0xc3]
133133 %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1)
134134 %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
177177 ; CHECK-NEXT: vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01]
178178 ; CHECK-NEXT: vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02]
179179 ; CHECK-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03]
180 ; CHECK-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0x75,0x28,0x38,0xc0,0x01]
180 ; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01]
181181 ; CHECK-NEXT: retq ## encoding: [0xc3]
182182 %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
183183 %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
15641564 define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_256(<4 x double> %x0, <2 x double> %x2, i8 %x3) {
15651565 ; CHECK-LABEL: test_int_x86_avx512_mask_vextractf64x2_256:
15661566 ; CHECK: ## BB#0:
1567 ; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm2 ## encoding: [0x62,0xf3,0xfd,0x28,0x19,0xc2,0x01]
1567 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01]
15681568 ; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
15691569 ; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x19,0xc1,0x01]
15701570 ; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc0,0x01]
15841584 define <4 x double>@test_int_x86_avx512_mask_insertf64x2_256(<4 x double> %x0, <2 x double> %x1, <4 x double> %x3, i8 %x4) {
15851585 ; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x2_256:
15861586 ; CHECK: ## BB#0:
1587 ; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm3 ## encoding: [0x62,0xf3,0xfd,0x28,0x18,0xd9,0x01]
1587 ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01]
15881588 ; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
15891589 ; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x18,0xd1,0x01]
15901590 ; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x18,0xc1,0x01]
16041604 define <4 x i64>@test_int_x86_avx512_mask_inserti64x2_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x3, i8 %x4) {
16051605 ; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x2_256:
16061606 ; CHECK: ## BB#0:
1607 ; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm3 ## encoding: [0x62,0xf3,0xfd,0x28,0x38,0xd9,0x01]
1607 ; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01]
16081608 ; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
16091609 ; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x38,0xd1,0x01]
16101610 ; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x38,0xc1,0x01]
47774777 define <4 x float>@test_int_x86_avx512_mask_vextractf32x4_256(<8 x float> %x0, <4 x float> %x2, i8 %x3) {
47784778 ; CHECK-LABEL: test_int_x86_avx512_mask_vextractf32x4_256:
47794779 ; CHECK: ## BB#0:
4780 ; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm2 ## encoding: [0x62,0xf3,0x7d,0x28,0x19,0xc2,0x01]
4780 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01]
47814781 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
47824782 ; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x19,0xc1,0x01]
47834783 ; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x19,0xc0,0x01]
47974797 define <8 x float>@test_int_x86_avx512_mask_insertf32x4_256(<8 x float> %x0, <4 x float> %x1, <8 x float> %x3, i8 %x4) {
47984798 ; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x4_256:
47994799 ; CHECK: ## BB#0:
4800 ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm3 ## encoding: [0x62,0xf3,0x7d,0x28,0x18,0xd9,0x01]
4800 ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01]
48014801 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
48024802 ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x18,0xd1,0x01]
48034803 ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xc1,0x01]
48174817 define <8 x i32>@test_int_x86_avx512_mask_inserti32x4_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x3, i8 %x4) {
48184818 ; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x4_256:
48194819 ; CHECK: ## BB#0:
4820 ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm3 ## encoding: [0x62,0xf3,0x7d,0x28,0x38,0xd9,0x01]
4820 ; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01]
48214821 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
48224822 ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x38,0xd1,0x01]
48234823 ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xc1,0x01]
10081008 ;
10091009 ; SKX-LABEL: one_mask_bit_set3:
10101010 ; SKX: ## BB#0:
1011 ; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm0
1011 ; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
10121012 ; SKX-NEXT: vmovq %xmm0, 16(%rdi)
10131013 ; SKX-NEXT: retq
10141014 call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> %val, <4 x i64>* %addr, i32 4, <4 x i1>)
10251025 ; AVX-NEXT: vzeroupper
10261026 ; AVX-NEXT: retq
10271027 ;
1028 ; AVX512F-LABEL: one_mask_bit_set4:
1029 ; AVX512F: ## BB#0:
1030 ; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
1031 ; AVX512F-NEXT: vmovhpd %xmm0, 24(%rdi)
1032 ; AVX512F-NEXT: retq
1033 ;
1034 ; SKX-LABEL: one_mask_bit_set4:
1035 ; SKX: ## BB#0:
1036 ; SKX-NEXT: vextractf32x4 $1, %ymm0, %xmm0
1037 ; SKX-NEXT: vmovhpd %xmm0, 24(%rdi)
1038 ; SKX-NEXT: retq
1028 ; AVX512-LABEL: one_mask_bit_set4:
1029 ; AVX512: ## BB#0:
1030 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
1031 ; AVX512-NEXT: vmovhpd %xmm0, 24(%rdi)
1032 ; AVX512-NEXT: retq
10391033 call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %val, <4 x double>* %addr, i32 4, <4 x i1>)
10401034 ret void
10411035 }
11081102 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
11091103 ; AVX2-NEXT: retq
11101104 ;
1111 ; AVX512F-LABEL: load_one_mask_bit_set3:
1112 ; AVX512F: ## BB#0:
1113 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
1114 ; AVX512F-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1
1115 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1116 ; AVX512F-NEXT: retq
1117 ;
1118 ; SKX-LABEL: load_one_mask_bit_set3:
1119 ; SKX: ## BB#0:
1120 ; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1
1121 ; SKX-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1
1122 ; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
1123 ; SKX-NEXT: retq
1105 ; AVX512-LABEL: load_one_mask_bit_set3:
1106 ; AVX512: ## BB#0:
1107 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
1108 ; AVX512-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1
1109 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1110 ; AVX512-NEXT: retq
11241111 %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1>, <4 x i64> %val)
11251112 ret <4 x i64> %res
11261113 }
11351122 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
11361123 ; AVX-NEXT: retq
11371124 ;
1138 ; AVX512F-LABEL: load_one_mask_bit_set4:
1139 ; AVX512F: ## BB#0:
1140 ; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1
1141 ; AVX512F-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
1142 ; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1143 ; AVX512F-NEXT: retq
1144 ;
1145 ; SKX-LABEL: load_one_mask_bit_set4:
1146 ; SKX: ## BB#0:
1147 ; SKX-NEXT: vextractf32x4 $1, %ymm0, %xmm1
1148 ; SKX-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
1149 ; SKX-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0
1150 ; SKX-NEXT: retq
1125 ; AVX512-LABEL: load_one_mask_bit_set4:
1126 ; AVX512: ## BB#0:
1127 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
1128 ; AVX512-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
1129 ; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1130 ; AVX512-NEXT: retq
11511131 %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1>, <4 x double> %val)
11521132 ret <4 x double> %res
11531133 }
487487
488488 define <4 x float> @stack_fold_extractf32x4(<8 x float> %a0, <8 x float> %a1) {
489489 ;CHECK-LABEL: stack_fold_extractf32x4
490 ;CHECK: vextractf32x4 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
490 ;CHECK: vextractf128 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
491491 %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <4 x i32>
492492 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
493493 ret <4 x float> %1
495495
496496 define <2 x double> @stack_fold_extractf64x2(<4 x double> %a0, <4 x double> %a1) {
497497 ;CHECK-LABEL: stack_fold_extractf64x2
498 ;CHECK: vextractf64x2 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
498 ;CHECK: vextractf128 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
499499 %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <2 x i32>
500500 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
501501 ret <2 x double> %1
503503
504504 define <8 x float> @stack_fold_insertf32x4(<4 x float> %a0, <4 x float> %a1) {
505505 ;CHECK-LABEL: stack_fold_insertf32x4
506 ;CHECK: vinsertf32x4 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
506 ;CHECK: vinsertf128 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
507507 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
508508 %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32>
509509 ret <8 x float> %2
511511
512512 define <4 x double> @stack_fold_insertf64x2(<2 x double> %a0, <2 x double> %a1) {
513513 ;CHECK-LABEL: stack_fold_insertf64x2
514 ;CHECK: vinsertf64x2 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
514 ;CHECK: vinsertf128 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
515515 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
516516 %2 = shufflevector <2 x double> %a0, <2 x double> %a1, <4 x i32>
517517 ret <4 x double> %2
444444
445445 define <4 x i32> @stack_fold_extracti32x4(<8 x i32> %a0, <8 x i32> %a1) {
446446 ;CHECK-LABEL: stack_fold_extracti32x4
447 ;CHECK: vextracti32x4 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
447 ;CHECK: vextracti128 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
448448 ; add forces execution domain
449449 %1 = add <8 x i32> %a0,
450450 %2 = shufflevector <8 x i32> %1, <8 x i32> %a1, <4 x i32>
454454
455455 define <2 x i64> @stack_fold_extracti64x2(<4 x i64> %a0, <4 x i64> %a1) {
456456 ;CHECK-LABEL: stack_fold_extracti64x2
457 ;CHECK: vextracti64x2 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
457 ;CHECK: vextracti128 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
458458 ; add forces execution domain
459459 %1 = add <4 x i64> %a0,
460460 %2 = shufflevector <4 x i64> %1, <4 x i64> %a1, <2 x i32>
464464
465465 define <8 x i32> @stack_fold_inserti32x4(<4 x i32> %a0, <4 x i32> %a1) {
466466 ;CHECK-LABEL: stack_fold_inserti32x4
467 ;CHECK: vinserti32x4 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
467 ;CHECK: vinserti128 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
468468 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
469469 %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32>
470470 ; add forces execution domain
474474
475475 define <4 x i64> @stack_fold_inserti64x2(<2 x i64> %a0, <2 x i64> %a1) {
476476 ;CHECK-LABEL: stack_fold_inserti64x2
477 ;CHECK: vinserti64x2 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
477 ;CHECK: vinserti128 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
478478 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
479479 %2 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <4 x i32>
480480 ; add forces execution domain
831831 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx
832832 ; X32-AVX512F-NEXT: vmovaps (%ecx), %xmm0
833833 ; X32-AVX512F-NEXT: vmovaps %xmm0, (%eax)
834 ; X32-AVX512F-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0
834 ; X32-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
835835 ; X32-AVX512F-NEXT: retl
836836 ;
837837 ; X32-AVX512BW-LABEL: test_broadcast_2f64_4f64_reuse:
840840 ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx
841841 ; X32-AVX512BW-NEXT: vmovaps (%ecx), %xmm0
842842 ; X32-AVX512BW-NEXT: vmovaps %xmm0, (%eax)
843 ; X32-AVX512BW-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0
843 ; X32-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
844844 ; X32-AVX512BW-NEXT: retl
845845 ;
846846 ; X32-AVX512DQ-LABEL: test_broadcast_2f64_4f64_reuse:
849849 ; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx
850850 ; X32-AVX512DQ-NEXT: vmovapd (%ecx), %xmm0
851851 ; X32-AVX512DQ-NEXT: vmovapd %xmm0, (%eax)
852 ; X32-AVX512DQ-NEXT: vinsertf64x2 $1, %xmm0, %ymm0, %ymm0
852 ; X32-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
853853 ; X32-AVX512DQ-NEXT: retl
854854 ;
855855 ; X64-AVX-LABEL: test_broadcast_2f64_4f64_reuse:
863863 ; X64-AVX512F: ## BB#0:
864864 ; X64-AVX512F-NEXT: vmovaps (%rdi), %xmm0
865865 ; X64-AVX512F-NEXT: vmovaps %xmm0, (%rsi)
866 ; X64-AVX512F-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0
866 ; X64-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
867867 ; X64-AVX512F-NEXT: retq
868868 ;
869869 ; X64-AVX512BW-LABEL: test_broadcast_2f64_4f64_reuse:
870870 ; X64-AVX512BW: ## BB#0:
871871 ; X64-AVX512BW-NEXT: vmovaps (%rdi), %xmm0
872872 ; X64-AVX512BW-NEXT: vmovaps %xmm0, (%rsi)
873 ; X64-AVX512BW-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0
873 ; X64-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
874874 ; X64-AVX512BW-NEXT: retq
875875 ;
876876 ; X64-AVX512DQ-LABEL: test_broadcast_2f64_4f64_reuse:
877877 ; X64-AVX512DQ: ## BB#0:
878878 ; X64-AVX512DQ-NEXT: vmovapd (%rdi), %xmm0
879879 ; X64-AVX512DQ-NEXT: vmovapd %xmm0, (%rsi)
880 ; X64-AVX512DQ-NEXT: vinsertf64x2 $1, %xmm0, %ymm0, %ymm0
880 ; X64-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
881881 ; X64-AVX512DQ-NEXT: retq
882882 %1 = load <2 x double>, <2 x double>* %p0
883883 store <2 x double> %1, <2 x double>* %p1
895895 ; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
896896 ; X32-AVX-NEXT: retl
897897 ;
898 ; X32-AVX512F-LABEL: test_broadcast_2i64_4i64_reuse:
899 ; X32-AVX512F: ## BB#0:
900 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
901 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx
902 ; X32-AVX512F-NEXT: vmovdqa (%ecx), %xmm0
903 ; X32-AVX512F-NEXT: vmovdqa %xmm0, (%eax)
904 ; X32-AVX512F-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
905 ; X32-AVX512F-NEXT: retl
906 ;
907 ; X32-AVX512BW-LABEL: test_broadcast_2i64_4i64_reuse:
908 ; X32-AVX512BW: ## BB#0:
909 ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
910 ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx
911 ; X32-AVX512BW-NEXT: vmovdqa (%ecx), %xmm0
912 ; X32-AVX512BW-NEXT: vmovdqa %xmm0, (%eax)
913 ; X32-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
914 ; X32-AVX512BW-NEXT: retl
915 ;
916 ; X32-AVX512DQ-LABEL: test_broadcast_2i64_4i64_reuse:
917 ; X32-AVX512DQ: ## BB#0:
918 ; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
919 ; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx
920 ; X32-AVX512DQ-NEXT: vmovdqa (%ecx), %xmm0
921 ; X32-AVX512DQ-NEXT: vmovdqa %xmm0, (%eax)
922 ; X32-AVX512DQ-NEXT: vinserti64x2 $1, %xmm0, %ymm0, %ymm0
923 ; X32-AVX512DQ-NEXT: retl
898 ; X32-AVX512-LABEL: test_broadcast_2i64_4i64_reuse:
899 ; X32-AVX512: ## BB#0:
900 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
901 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
902 ; X32-AVX512-NEXT: vmovdqa (%ecx), %xmm0
903 ; X32-AVX512-NEXT: vmovdqa %xmm0, (%eax)
904 ; X32-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
905 ; X32-AVX512-NEXT: retl
924906 ;
925907 ; X64-AVX-LABEL: test_broadcast_2i64_4i64_reuse:
926908 ; X64-AVX: ## BB#0:
929911 ; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
930912 ; X64-AVX-NEXT: retq
931913 ;
932 ; X64-AVX512F-LABEL: test_broadcast_2i64_4i64_reuse:
933 ; X64-AVX512F: ## BB#0:
934 ; X64-AVX512F-NEXT: vmovdqa (%rdi), %xmm0
935 ; X64-AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
936 ; X64-AVX512F-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
937 ; X64-AVX512F-NEXT: retq
938 ;
939 ; X64-AVX512BW-LABEL: test_broadcast_2i64_4i64_reuse:
940 ; X64-AVX512BW: ## BB#0:
941 ; X64-AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
942 ; X64-AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
943 ; X64-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
944 ; X64-AVX512BW-NEXT: retq
945 ;
946 ; X64-AVX512DQ-LABEL: test_broadcast_2i64_4i64_reuse:
947 ; X64-AVX512DQ: ## BB#0:
948 ; X64-AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
949 ; X64-AVX512DQ-NEXT: vmovdqa %xmm0, (%rsi)
950 ; X64-AVX512DQ-NEXT: vinserti64x2 $1, %xmm0, %ymm0, %ymm0
951 ; X64-AVX512DQ-NEXT: retq
914 ; X64-AVX512-LABEL: test_broadcast_2i64_4i64_reuse:
915 ; X64-AVX512: ## BB#0:
916 ; X64-AVX512-NEXT: vmovdqa (%rdi), %xmm0
917 ; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi)
918 ; X64-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
919 ; X64-AVX512-NEXT: retq
952920 %1 = load <2 x i64>, <2 x i64>* %p0
953921 store <2 x i64> %1, <2 x i64>* %p1
954922 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32>
956924 }
957925
958926 define <8 x float> @test_broadcast_4f32_8f32_reuse(<4 x float>* %p0, <4 x float>* %p1) {
959 ; X32-AVX-LABEL: test_broadcast_4f32_8f32_reuse:
960 ; X32-AVX: ## BB#0:
961 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
962 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
963 ; X32-AVX-NEXT: vmovaps (%ecx), %xmm0
964 ; X32-AVX-NEXT: vmovaps %xmm0, (%eax)
965 ; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
966 ; X32-AVX-NEXT: retl
967 ;
968 ; X32-AVX512-LABEL: test_broadcast_4f32_8f32_reuse:
969 ; X32-AVX512: ## BB#0:
970 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
971 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
972 ; X32-AVX512-NEXT: vmovaps (%ecx), %xmm0
973 ; X32-AVX512-NEXT: vmovaps %xmm0, (%eax)
974 ; X32-AVX512-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0
975 ; X32-AVX512-NEXT: retl
976 ;
977 ; X64-AVX-LABEL: test_broadcast_4f32_8f32_reuse:
978 ; X64-AVX: ## BB#0:
979 ; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
980 ; X64-AVX-NEXT: vmovaps %xmm0, (%rsi)
981 ; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
982 ; X64-AVX-NEXT: retq
983 ;
984 ; X64-AVX512-LABEL: test_broadcast_4f32_8f32_reuse:
985 ; X64-AVX512: ## BB#0:
986 ; X64-AVX512-NEXT: vmovaps (%rdi), %xmm0
987 ; X64-AVX512-NEXT: vmovaps %xmm0, (%rsi)
988 ; X64-AVX512-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0
989 ; X64-AVX512-NEXT: retq
927 ; X32-LABEL: test_broadcast_4f32_8f32_reuse:
928 ; X32: ## BB#0:
929 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
930 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
931 ; X32-NEXT: vmovaps (%ecx), %xmm0
932 ; X32-NEXT: vmovaps %xmm0, (%eax)
933 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
934 ; X32-NEXT: retl
935 ;
936 ; X64-LABEL: test_broadcast_4f32_8f32_reuse:
937 ; X64: ## BB#0:
938 ; X64-NEXT: vmovaps (%rdi), %xmm0
939 ; X64-NEXT: vmovaps %xmm0, (%rsi)
940 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
941 ; X64-NEXT: retq
990942 %1 = load <4 x float>, <4 x float>* %p0
991943 store <4 x float> %1, <4 x float>* %p1
992944 %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32>
1009961 ; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
1010962 ; X32-AVX512-NEXT: vmovdqa (%ecx), %xmm0
1011963 ; X32-AVX512-NEXT: vmovdqa %xmm0, (%eax)
1012 ; X32-AVX512-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
964 ; X32-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
1013965 ; X32-AVX512-NEXT: retl
1014966 ;
1015967 ; X64-AVX-LABEL: test_broadcast_4i32_8i32_reuse:
1023975 ; X64-AVX512: ## BB#0:
1024976 ; X64-AVX512-NEXT: vmovdqa (%rdi), %xmm0
1025977 ; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi)
1026 ; X64-AVX512-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
978 ; X64-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
1027979 ; X64-AVX512-NEXT: retq
1028980 %1 = load <4 x i32>, <4 x i32>* %p0
1029981 store <4 x i32> %1, <4 x i32>* %p1
1047999 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx
10481000 ; X32-AVX512F-NEXT: vmovdqa (%ecx), %xmm0
10491001 ; X32-AVX512F-NEXT: vmovdqa %xmm0, (%eax)
1050 ; X32-AVX512F-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
1002 ; X32-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
10511003 ; X32-AVX512F-NEXT: retl
10521004 ;
10531005 ; X32-AVX512BW-LABEL: test_broadcast_8i16_16i16_reuse:
10561008 ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx
10571009 ; X32-AVX512BW-NEXT: vmovdqu (%ecx), %xmm0
10581010 ; X32-AVX512BW-NEXT: vmovdqu %xmm0, (%eax)
1059 ; X32-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
1011 ; X32-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
10601012 ; X32-AVX512BW-NEXT: retl
10611013 ;
10621014 ; X32-AVX512DQ-LABEL: test_broadcast_8i16_16i16_reuse:
10651017 ; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx
10661018 ; X32-AVX512DQ-NEXT: vmovdqa (%ecx), %xmm0
10671019 ; X32-AVX512DQ-NEXT: vmovdqa %xmm0, (%eax)
1068 ; X32-AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
1020 ; X32-AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
10691021 ; X32-AVX512DQ-NEXT: retl
10701022 ;
10711023 ; X64-AVX-LABEL: test_broadcast_8i16_16i16_reuse:
10791031 ; X64-AVX512F: ## BB#0:
10801032 ; X64-AVX512F-NEXT: vmovdqa (%rdi), %xmm0
10811033 ; X64-AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
1082 ; X64-AVX512F-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
1034 ; X64-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
10831035 ; X64-AVX512F-NEXT: retq
10841036 ;
10851037 ; X64-AVX512BW-LABEL: test_broadcast_8i16_16i16_reuse:
10861038 ; X64-AVX512BW: ## BB#0:
10871039 ; X64-AVX512BW-NEXT: vmovdqu (%rdi), %xmm0
10881040 ; X64-AVX512BW-NEXT: vmovdqu %xmm0, (%rsi)
1089 ; X64-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
1041 ; X64-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
10901042 ; X64-AVX512BW-NEXT: retq
10911043 ;
10921044 ; X64-AVX512DQ-LABEL: test_broadcast_8i16_16i16_reuse:
10931045 ; X64-AVX512DQ: ## BB#0:
10941046 ; X64-AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
10951047 ; X64-AVX512DQ-NEXT: vmovdqa %xmm0, (%rsi)
1096 ; X64-AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
1048 ; X64-AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
10971049 ; X64-AVX512DQ-NEXT: retq
10981050 %1 = load <8 x i16>, <8 x i16> *%p0
10991051 store <8 x i16> %1, <8 x i16>* %p1
11171069 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx
11181070 ; X32-AVX512F-NEXT: vmovdqa (%ecx), %xmm0
11191071 ; X32-AVX512F-NEXT: vmovdqa %xmm0, (%eax)
1120 ; X32-AVX512F-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
1072 ; X32-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
11211073 ; X32-AVX512F-NEXT: retl
11221074 ;
11231075 ; X32-AVX512BW-LABEL: test_broadcast_16i8_32i8_reuse:
11261078 ; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx
11271079 ; X32-AVX512BW-NEXT: vmovdqu (%ecx), %xmm0
11281080 ; X32-AVX512BW-NEXT: vmovdqu %xmm0, (%eax)
1129 ; X32-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
1081 ; X32-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
11301082 ; X32-AVX512BW-NEXT: retl
11311083 ;
11321084 ; X32-AVX512DQ-LABEL: test_broadcast_16i8_32i8_reuse:
11351087 ; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx
11361088 ; X32-AVX512DQ-NEXT: vmovdqa (%ecx), %xmm0
11371089 ; X32-AVX512DQ-NEXT: vmovdqa %xmm0, (%eax)
1138 ; X32-AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
1090 ; X32-AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
11391091 ; X32-AVX512DQ-NEXT: retl
11401092 ;
11411093 ; X64-AVX-LABEL: test_broadcast_16i8_32i8_reuse:
11491101 ; X64-AVX512F: ## BB#0:
11501102 ; X64-AVX512F-NEXT: vmovdqa (%rdi), %xmm0
11511103 ; X64-AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
1152 ; X64-AVX512F-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
1104 ; X64-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
11531105 ; X64-AVX512F-NEXT: retq
11541106 ;
11551107 ; X64-AVX512BW-LABEL: test_broadcast_16i8_32i8_reuse:
11561108 ; X64-AVX512BW: ## BB#0:
11571109 ; X64-AVX512BW-NEXT: vmovdqu (%rdi), %xmm0
11581110 ; X64-AVX512BW-NEXT: vmovdqu %xmm0, (%rsi)
1159 ; X64-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
1111 ; X64-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
11601112 ; X64-AVX512BW-NEXT: retq
11611113 ;
11621114 ; X64-AVX512DQ-LABEL: test_broadcast_16i8_32i8_reuse:
11631115 ; X64-AVX512DQ: ## BB#0:
11641116 ; X64-AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
11651117 ; X64-AVX512DQ-NEXT: vmovdqa %xmm0, (%rsi)
1166 ; X64-AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
1118 ; X64-AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
11671119 ; X64-AVX512DQ-NEXT: retq
11681120 %1 = load <16 x i8>, <16 x i8> *%p0
11691121 store <16 x i8> %1, <16 x i8>* %p1
11931145 ; X32-AVX512F-NEXT: vmovdqa (%ecx), %xmm0
11941146 ; X32-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
11951147 ; X32-AVX512F-NEXT: vmovdqa %xmm1, (%eax)
1196 ; X32-AVX512F-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
1148 ; X32-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
11971149 ; X32-AVX512F-NEXT: retl
11981150 ;
11991151 ; X32-AVX512BW-LABEL: test_broadcast_4i32_8i32_chain:
12031155 ; X32-AVX512BW-NEXT: vmovdqa (%ecx), %xmm0
12041156 ; X32-AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
12051157 ; X32-AVX512BW-NEXT: vmovdqa %xmm1, (%eax)
1206 ; X32-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
1158 ; X32-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12071159 ; X32-AVX512BW-NEXT: retl
12081160 ;
12091161 ; X32-AVX512DQ-LABEL: test_broadcast_4i32_8i32_chain:
12131165 ; X32-AVX512DQ-NEXT: vmovdqa (%ecx), %xmm0
12141166 ; X32-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
12151167 ; X32-AVX512DQ-NEXT: vmovaps %xmm1, (%eax)
1216 ; X32-AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
1168 ; X32-AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12171169 ; X32-AVX512DQ-NEXT: retl
12181170 ;
12191171 ; X64-AVX-LABEL: test_broadcast_4i32_8i32_chain:
12291181 ; X64-AVX512F-NEXT: vmovdqa (%rdi), %xmm0
12301182 ; X64-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
12311183 ; X64-AVX512F-NEXT: vmovdqa %xmm1, (%rsi)
1232 ; X64-AVX512F-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
1184 ; X64-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12331185 ; X64-AVX512F-NEXT: retq
12341186 ;
12351187 ; X64-AVX512BW-LABEL: test_broadcast_4i32_8i32_chain:
12371189 ; X64-AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
12381190 ; X64-AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
12391191 ; X64-AVX512BW-NEXT: vmovdqa %xmm1, (%rsi)
1240 ; X64-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
1192 ; X64-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12411193 ; X64-AVX512BW-NEXT: retq
12421194 ;
12431195 ; X64-AVX512DQ-LABEL: test_broadcast_4i32_8i32_chain:
12451197 ; X64-AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
12461198 ; X64-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
12471199 ; X64-AVX512DQ-NEXT: vmovaps %xmm1, (%rsi)
1248 ; X64-AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
1200 ; X64-AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12491201 ; X64-AVX512DQ-NEXT: retq
12501202 %1 = load <4 x i32>, <4 x i32>* %p0
12511203 store <4 x float> zeroinitializer, <4 x float>* %p1
203203 ;
204204 ; AVX512VL-LABEL: fptosi_4f64_to_4i64:
205205 ; AVX512VL: # BB#0:
206 ; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm1
206 ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
207207 ; AVX512VL-NEXT: vcvttsd2si %xmm1, %rax
208208 ; AVX512VL-NEXT: vmovq %rax, %xmm2
209209 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
216216 ; AVX512VL-NEXT: vcvttsd2si %xmm0, %rax
217217 ; AVX512VL-NEXT: vmovq %rax, %xmm0
218218 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
219 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
219 ; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
220220 ; AVX512VL-NEXT: retq
221221 ;
222222 ; AVX512DQ-LABEL: fptosi_4f64_to_4i64:
718718 ;
719719 ; AVX512VL-LABEL: fptoui_4f64_to_4i64:
720720 ; AVX512VL: # BB#0:
721 ; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm1
721 ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
722722 ; AVX512VL-NEXT: vcvttsd2usi %xmm1, %rax
723723 ; AVX512VL-NEXT: vmovq %rax, %xmm2
724724 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
731731 ; AVX512VL-NEXT: vcvttsd2usi %xmm0, %rax
732732 ; AVX512VL-NEXT: vmovq %rax, %xmm0
733733 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
734 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
734 ; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
735735 ; AVX512VL-NEXT: retq
736736 ;
737737 ; AVX512DQ-LABEL: fptoui_4f64_to_4i64:
10961096 ; AVX512VL-NEXT: vcvttss2si %xmm0, %rax
10971097 ; AVX512VL-NEXT: vmovq %rax, %xmm0
10981098 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
1099 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
1099 ; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
11001100 ; AVX512VL-NEXT: retq
11011101 ;
11021102 ; AVX512DQ-LABEL: fptosi_4f32_to_4i64:
12041204 ; AVX512VL-NEXT: vmovq %rcx, %xmm1
12051205 ; AVX512VL-NEXT: vmovq %rax, %xmm2
12061206 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1207 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0
1207 ; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
12081208 ; AVX512VL-NEXT: retq
12091209 ;
12101210 ; AVX512DQ-LABEL: fptosi_8f32_to_4i64:
18211821 ; AVX512VL-NEXT: vcvttss2usi %xmm0, %rax
18221822 ; AVX512VL-NEXT: vmovq %rax, %xmm0
18231823 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
1824 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
1824 ; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
18251825 ; AVX512VL-NEXT: retq
18261826 ;
18271827 ; AVX512DQ-LABEL: fptoui_4f32_to_4i64:
19991999 ; AVX512VL-NEXT: vmovq %rcx, %xmm1
20002000 ; AVX512VL-NEXT: vmovq %rax, %xmm2
20012001 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
2002 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0
2002 ; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
20032003 ; AVX512VL-NEXT: retq
20042004 ;
20052005 ; AVX512DQ-LABEL: fptoui_8f32_to_4i64:
24082408 ; SSE-NEXT: popq %r14
24092409 ; SSE-NEXT: retq
24102410 ;
2411 ; VEX-LABEL: fptosi_2f128_to_4i32:
2412 ; VEX: # BB#0:
2413 ; VEX-NEXT: pushq %r14
2414 ; VEX-NEXT: pushq %rbx
2415 ; VEX-NEXT: subq $24, %rsp
2416 ; VEX-NEXT: movq %rsi, %r14
2417 ; VEX-NEXT: movq %rdi, %rbx
2418 ; VEX-NEXT: movq %rdx, %rdi
2419 ; VEX-NEXT: movq %rcx, %rsi
2420 ; VEX-NEXT: callq __fixtfdi
2421 ; VEX-NEXT: vmovq %rax, %xmm0
2422 ; VEX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
2423 ; VEX-NEXT: movq %rbx, %rdi
2424 ; VEX-NEXT: movq %r14, %rsi
2425 ; VEX-NEXT: callq __fixtfdi
2426 ; VEX-NEXT: vmovq %rax, %xmm0
2427 ; VEX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
2428 ; VEX-NEXT: # xmm0 = xmm0[0],mem[0]
2429 ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
2430 ; VEX-NEXT: addq $24, %rsp
2431 ; VEX-NEXT: popq %rbx
2432 ; VEX-NEXT: popq %r14
2433 ; VEX-NEXT: retq
2434 ;
2435 ; AVX512F-LABEL: fptosi_2f128_to_4i32:
2436 ; AVX512F: # BB#0:
2437 ; AVX512F-NEXT: pushq %r14
2438 ; AVX512F-NEXT: pushq %rbx
2439 ; AVX512F-NEXT: subq $24, %rsp
2440 ; AVX512F-NEXT: movq %rsi, %r14
2441 ; AVX512F-NEXT: movq %rdi, %rbx
2442 ; AVX512F-NEXT: movq %rdx, %rdi
2443 ; AVX512F-NEXT: movq %rcx, %rsi
2444 ; AVX512F-NEXT: callq __fixtfdi
2445 ; AVX512F-NEXT: vmovq %rax, %xmm0
2446 ; AVX512F-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
2447 ; AVX512F-NEXT: movq %rbx, %rdi
2448 ; AVX512F-NEXT: movq %r14, %rsi
2449 ; AVX512F-NEXT: callq __fixtfdi
2450 ; AVX512F-NEXT: vmovq %rax, %xmm0
2451 ; AVX512F-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
2452 ; AVX512F-NEXT: # xmm0 = xmm0[0],mem[0]
2453 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
2454 ; AVX512F-NEXT: addq $24, %rsp
2455 ; AVX512F-NEXT: popq %rbx
2456 ; AVX512F-NEXT: popq %r14
2457 ; AVX512F-NEXT: retq
2458 ;
2459 ; AVX512VL-LABEL: fptosi_2f128_to_4i32:
2460 ; AVX512VL: # BB#0:
2461 ; AVX512VL-NEXT: pushq %r14
2462 ; AVX512VL-NEXT: pushq %rbx
2463 ; AVX512VL-NEXT: subq $24, %rsp
2464 ; AVX512VL-NEXT: movq %rsi, %r14
2465 ; AVX512VL-NEXT: movq %rdi, %rbx
2466 ; AVX512VL-NEXT: movq %rdx, %rdi
2467 ; AVX512VL-NEXT: movq %rcx, %rsi
2468 ; AVX512VL-NEXT: callq __fixtfdi
2469 ; AVX512VL-NEXT: vmovq %rax, %xmm0
2470 ; AVX512VL-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
2471 ; AVX512VL-NEXT: movq %rbx, %rdi
2472 ; AVX512VL-NEXT: movq %r14, %rsi
2473 ; AVX512VL-NEXT: callq __fixtfdi
2474 ; AVX512VL-NEXT: vmovq %rax, %xmm0
2475 ; AVX512VL-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
2476 ; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0]
2477 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
2478 ; AVX512VL-NEXT: addq $24, %rsp
2479 ; AVX512VL-NEXT: popq %rbx
2480 ; AVX512VL-NEXT: popq %r14
2481 ; AVX512VL-NEXT: retq
2482 ;
2483 ; AVX512DQ-LABEL: fptosi_2f128_to_4i32:
2484 ; AVX512DQ: # BB#0:
2485 ; AVX512DQ-NEXT: pushq %r14
2486 ; AVX512DQ-NEXT: pushq %rbx
2487 ; AVX512DQ-NEXT: subq $24, %rsp
2488 ; AVX512DQ-NEXT: movq %rsi, %r14
2489 ; AVX512DQ-NEXT: movq %rdi, %rbx
2490 ; AVX512DQ-NEXT: movq %rdx, %rdi
2491 ; AVX512DQ-NEXT: movq %rcx, %rsi
2492 ; AVX512DQ-NEXT: callq __fixtfdi
2493 ; AVX512DQ-NEXT: vmovq %rax, %xmm0
2494 ; AVX512DQ-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
2495 ; AVX512DQ-NEXT: movq %rbx, %rdi
2496 ; AVX512DQ-NEXT: movq %r14, %rsi
2497 ; AVX512DQ-NEXT: callq __fixtfdi
2498 ; AVX512DQ-NEXT: vmovq %rax, %xmm0
2499 ; AVX512DQ-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
2500 ; AVX512DQ-NEXT: # xmm0 = xmm0[0],mem[0]
2501 ; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
2502 ; AVX512DQ-NEXT: addq $24, %rsp
2503 ; AVX512DQ-NEXT: popq %rbx
2504 ; AVX512DQ-NEXT: popq %r14
2505 ; AVX512DQ-NEXT: retq
2506 ;
2507 ; AVX512VLDQ-LABEL: fptosi_2f128_to_4i32:
2508 ; AVX512VLDQ: # BB#0:
2509 ; AVX512VLDQ-NEXT: pushq %r14
2510 ; AVX512VLDQ-NEXT: pushq %rbx
2511 ; AVX512VLDQ-NEXT: subq $24, %rsp
2512 ; AVX512VLDQ-NEXT: movq %rsi, %r14
2513 ; AVX512VLDQ-NEXT: movq %rdi, %rbx
2514 ; AVX512VLDQ-NEXT: movq %rdx, %rdi
2515 ; AVX512VLDQ-NEXT: movq %rcx, %rsi
2516 ; AVX512VLDQ-NEXT: callq __fixtfdi
2517 ; AVX512VLDQ-NEXT: vmovq %rax, %xmm0
2518 ; AVX512VLDQ-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
2519 ; AVX512VLDQ-NEXT: movq %rbx, %rdi
2520 ; AVX512VLDQ-NEXT: movq %r14, %rsi
2521 ; AVX512VLDQ-NEXT: callq __fixtfdi
2522 ; AVX512VLDQ-NEXT: vmovq %rax, %xmm0
2523 ; AVX512VLDQ-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
2524 ; AVX512VLDQ-NEXT: # xmm0 = xmm0[0],mem[0]
2525 ; AVX512VLDQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
2526 ; AVX512VLDQ-NEXT: addq $24, %rsp
2527 ; AVX512VLDQ-NEXT: popq %rbx
2528 ; AVX512VLDQ-NEXT: popq %r14
2529 ; AVX512VLDQ-NEXT: retq
2411 ; AVX-LABEL: fptosi_2f128_to_4i32:
2412 ; AVX: # BB#0:
2413 ; AVX-NEXT: pushq %r14
2414 ; AVX-NEXT: pushq %rbx
2415 ; AVX-NEXT: subq $24, %rsp
2416 ; AVX-NEXT: movq %rsi, %r14
2417 ; AVX-NEXT: movq %rdi, %rbx
2418 ; AVX-NEXT: movq %rdx, %rdi
2419 ; AVX-NEXT: movq %rcx, %rsi
2420 ; AVX-NEXT: callq __fixtfdi
2421 ; AVX-NEXT: vmovq %rax, %xmm0
2422 ; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
2423 ; AVX-NEXT: movq %rbx, %rdi
2424 ; AVX-NEXT: movq %r14, %rsi
2425 ; AVX-NEXT: callq __fixtfdi
2426 ; AVX-NEXT: vmovq %rax, %xmm0
2427 ; AVX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
2428 ; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
2429 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
2430 ; AVX-NEXT: addq $24, %rsp
2431 ; AVX-NEXT: popq %rbx
2432 ; AVX-NEXT: popq %r14
2433 ; AVX-NEXT: retq
25302434 %cvt = fptosi <2 x fp128> %a to <2 x i32>
25312435 %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32>
25322436 ret <4 x i32> %ext
287287 ;
288288 ; AVX512VL-LABEL: sitofp_4i64_to_4f64:
289289 ; AVX512VL: # BB#0:
290 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
290 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
291291 ; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
292292 ; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
293293 ; AVX512VL-NEXT: vmovq %xmm1, %rax
298298 ; AVX512VL-NEXT: vmovq %xmm0, %rax
299299 ; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
300300 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
301 ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0
301 ; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
302302 ; AVX512VL-NEXT: retq
303303 ;
304304 ; AVX512DQ-LABEL: sitofp_4i64_to_4f64:
820820 ;
821821 ; AVX512VL-LABEL: uitofp_4i64_to_4f64:
822822 ; AVX512VL: # BB#0:
823 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
823 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
824824 ; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
825825 ; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2
826826 ; AVX512VL-NEXT: vmovq %xmm1, %rax
831831 ; AVX512VL-NEXT: vmovq %xmm0, %rax
832832 ; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm0
833833 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
834 ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0
834 ; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
835835 ; AVX512VL-NEXT: retq
836836 ;
837837 ; AVX512DQ-LABEL: uitofp_4i64_to_4f64:
14291429 ; AVX512VL-NEXT: vmovq %xmm0, %rax
14301430 ; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
14311431 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
1432 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0
1432 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
14331433 ; AVX512VL-NEXT: vmovq %xmm0, %rax
14341434 ; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
14351435 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
23432343 ; AVX512VL-NEXT: vmovq %xmm0, %rax
23442344 ; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
23452345 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
2346 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0
2346 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
23472347 ; AVX512VL-NEXT: vmovq %xmm0, %rax
23482348 ; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2
23492349 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
27742774 ; AVX512VL-LABEL: sitofp_load_4i64_to_4f64:
27752775 ; AVX512VL: # BB#0:
27762776 ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
2777 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
2777 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
27782778 ; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
27792779 ; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
27802780 ; AVX512VL-NEXT: vmovq %xmm1, %rax
27852785 ; AVX512VL-NEXT: vmovq %xmm0, %rax
27862786 ; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
27872787 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
2788 ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0
2788 ; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
27892789 ; AVX512VL-NEXT: retq
27902790 ;
27912791 ; AVX512DQ-LABEL: sitofp_load_4i64_to_4f64:
31893189 ; AVX512VL-LABEL: uitofp_load_4i64_to_4f64:
31903190 ; AVX512VL: # BB#0:
31913191 ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
3192 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
3192 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
31933193 ; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
31943194 ; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2
31953195 ; AVX512VL-NEXT: vmovq %xmm1, %rax
32003200 ; AVX512VL-NEXT: vmovq %xmm0, %rax
32013201 ; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm0
32023202 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
3203 ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0
3203 ; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
32043204 ; AVX512VL-NEXT: retq
32053205 ;
32063206 ; AVX512DQ-LABEL: uitofp_load_4i64_to_4f64:
34253425 ; AVX512VL-NEXT: vmovq %xmm0, %rax
34263426 ; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
34273427 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
3428 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0
3428 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
34293429 ; AVX512VL-NEXT: vmovq %xmm0, %rax
34303430 ; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
34313431 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
36663666 ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
36673667 ; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm0
36683668 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
3669 ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0
3669 ; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
36703670 ; AVX512VL-NEXT: retq
36713671 ;
36723672 ; AVX512DQ-LABEL: sitofp_load_8i64_to_8f32:
40124012 ; AVX512VL-NEXT: vmovq %xmm0, %rax
40134013 ; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
40144014 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
4015 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0
4015 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
40164016 ; AVX512VL-NEXT: vmovq %xmm0, %rax
40174017 ; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2
40184018 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
45924592 ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
45934593 ; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm0
45944594 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
4595 ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0
4595 ; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
45964596 ; AVX512VL-NEXT: retq
45974597 ;
45984598 ; AVX512DQ-LABEL: uitofp_load_8i64_to_8f32:
460460 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
461461 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
462462 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
463 ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm4, %ymm0, %ymm0
463 ; AVX512VL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
464464 ; AVX512VL-NEXT: retq
465465 %1 = bitcast <8 x i16> %a0 to <8 x half>
466466 %2 = fpext <8 x half> %1 to <8 x float>
756756 ;
757757 ; AVX512VL-LABEL: cvt_16i16_to_16f32:
758758 ; AVX512VL: # BB#0:
759 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm10
759 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm10
760760 ; AVX512VL-NEXT: vmovq %xmm0, %rax
761761 ; AVX512VL-NEXT: movq %rax, %rcx
762762 ; AVX512VL-NEXT: shrq $48, %rcx
839839 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[2,3]
840840 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm0[0],xmm2[3]
841841 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
842 ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0
842 ; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
843843 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm16[0],xmm15[0],xmm16[2,3]
844844 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm14[0],xmm1[3]
845845 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[0]
846846 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[2,3]
847847 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
848848 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
849 ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm2, %ymm1
849 ; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
850850 ; AVX512VL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
851851 ; AVX512VL-NEXT: retq
852852 %1 = bitcast <16 x i16> %a0 to <16 x half>
12261226 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
12271227 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
12281228 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1229 ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm4, %ymm0, %ymm0
1229 ; AVX512VL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
12301230 ; AVX512VL-NEXT: retq
12311231 %1 = load <8 x i16>, <8 x i16>* %a0
12321232 %2 = bitcast <8 x i16> %1 to <8 x half>
14901490 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
14911491 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
14921492 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1493 ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm4, %ymm0, %ymm0
1493 ; AVX512VL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
14941494 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
14951495 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
14961496 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
14971497 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
14981498 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
14991499 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
1500 ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm2, %ymm1
1500 ; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
15011501 ; AVX512VL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
15021502 ; AVX512VL-NEXT: retq
15031503 %1 = load <16 x i16>, <16 x i16>* %a0
17371737 ; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
17381738 ; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
17391739 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1740 ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm2, %ymm0, %ymm0
1740 ; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
17411741 ; AVX512VL-NEXT: retq
17421742 %1 = bitcast <4 x i16> %a0 to <4 x half>
17431743 %2 = fpext <4 x half> %1 to <4 x double>
19281928 ; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
19291929 ; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
19301930 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1931 ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm2, %ymm0, %ymm0
1931 ; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
19321932 ; AVX512VL-NEXT: retq
19331933 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
19341934 %2 = bitcast <4 x i16> %1 to <4 x half>
21442144 ; AVX512VL-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
21452145 ; AVX512VL-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
21462146 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm5[0],xmm4[0]
2147 ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm6, %ymm4, %ymm4
2147 ; AVX512VL-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
21482148 ; AVX512VL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
21492149 ; AVX512VL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
21502150 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
21512151 ; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
21522152 ; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
21532153 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2154 ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm2, %ymm0, %ymm0
2154 ; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
21552155 ; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
21562156 ; AVX512VL-NEXT: retq
21572157 %1 = bitcast <8 x i16> %a0 to <8 x half>
23492349 ; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
23502350 ; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
23512351 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2352 ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm2, %ymm0, %ymm0
2352 ; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
23532353 ; AVX512VL-NEXT: retq
23542354 %1 = load <4 x i16>, <4 x i16>* %a0
23552355 %2 = bitcast <4 x i16> %1 to <4 x half>
24732473 ; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
24742474 ; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
24752475 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2476 ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm2, %ymm0, %ymm0
2476 ; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
24772477 ; AVX512VL-NEXT: retq
24782478 %1 = load <8 x i16>, <8 x i16>* %a0
24792479 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
26422642 ; AVX512VL-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
26432643 ; AVX512VL-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
26442644 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],xmm5[0]
2645 ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm6, %ymm4, %ymm4
2645 ; AVX512VL-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
26462646 ; AVX512VL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
26472647 ; AVX512VL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
26482648 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
26492649 ; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
26502650 ; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
26512651 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2652 ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm2, %ymm0, %ymm0
2652 ; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
26532653 ; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
26542654 ; AVX512VL-NEXT: retq
26552655 %1 = load <8 x i16>, <8 x i16>* %a0
31813181 ; AVX512VL-NEXT: orl %edx, %eax
31823182 ; AVX512VL-NEXT: shlq $32, %rax
31833183 ; AVX512VL-NEXT: orq %rcx, %rax
3184 ; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0
3184 ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
31853185 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
31863186 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
31873187 ; AVX512VL-NEXT: vmovd %xmm1, %ecx
34263426 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
34273427 ; AVX512VL-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
34283428 ; AVX512VL-NEXT: vmovd %xmm2, %eax
3429 ; AVX512VL-NEXT: vextractf32x4 $1, %ymm1, %xmm2
3429 ; AVX512VL-NEXT: vextractf128 $1, %ymm1, %xmm2
34303430 ; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
34313431 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
34323432 ; AVX512VL-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
34573457 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
34583458 ; AVX512VL-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
34593459 ; AVX512VL-NEXT: vmovd %xmm1, %eax
3460 ; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm1
3460 ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
34613461 ; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
34623462 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
34633463 ; AVX512VL-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
34783478 ; AVX512VL-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1
34793479 ; AVX512VL-NEXT: vmovd %xmm0, %eax
34803480 ; AVX512VL-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
3481 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm0
3481 ; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
34823482 ; AVX512VL-NEXT: retq
34833483 %1 = fptrunc <16 x float> %a0 to <16 x half>
34843484 %2 = bitcast <16 x half> %1 to <16 x i16>
39573957 ; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
39583958 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
39593959 ; AVX512VL-NEXT: vmovd %xmm1, %r10d
3960 ; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm1
3960 ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
39613961 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
39623962 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
39633963 ; AVX512VL-NEXT: vmovd %xmm2, %r11d
41904190 ;
41914191 ; AVX512VL-LABEL: store_cvt_16f32_to_16i16:
41924192 ; AVX512VL: # BB#0:
4193 ; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm1
4193 ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
41944194 ; AVX512VL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
4195 ; AVX512VL-NEXT: vextractf32x4 $1, %ymm2, %xmm3
4195 ; AVX512VL-NEXT: vextractf128 $1, %ymm2, %xmm3
41964196 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm3, %xmm4
41974197 ; AVX512VL-NEXT: vmovd %xmm4, %eax
41984198 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm4
44214421 ; AVX512VL-NEXT: movzwl %ax, %r14d
44224422 ; AVX512VL-NEXT: orl %ebx, %r14d
44234423 ; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
4424 ; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0
4424 ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
44254425 ; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
44264426 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
44274427 ; AVX512VL-NEXT: callq __truncdfhf2
45714571 ; AVX512VL-NEXT: movzwl %ax, %r14d
45724572 ; AVX512VL-NEXT: orl %ebx, %r14d
45734573 ; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
4574 ; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0
4574 ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
45754575 ; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
45764576 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
45774577 ; AVX512VL-NEXT: callq __truncdfhf2
47254725 ; AVX512VL-NEXT: movzwl %ax, %r14d
47264726 ; AVX512VL-NEXT: orl %ebx, %r14d
47274727 ; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
4728 ; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0
4728 ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
47294729 ; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
47304730 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
47314731 ; AVX512VL-NEXT: callq __truncdfhf2
49684968 ; AVX512VL-NEXT: movzwl %ax, %r15d
49694969 ; AVX512VL-NEXT: orl %ebx, %r15d
49704970 ; AVX512VL-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
4971 ; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0
4971 ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
49724972 ; AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
49734973 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
49744974 ; AVX512VL-NEXT: callq __truncdfhf2
49934993 ; AVX512VL-NEXT: movzwl %ax, %r15d
49944994 ; AVX512VL-NEXT: orl %ebx, %r15d
49954995 ; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
4996 ; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0
4996 ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
49974997 ; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
49984998 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
49994999 ; AVX512VL-NEXT: callq __truncdfhf2
51875187 ; AVX512VL-NEXT: callq __truncdfhf2
51885188 ; AVX512VL-NEXT: movl %eax, %r14d
51895189 ; AVX512VL-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
5190 ; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0
5190 ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
51915191 ; AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
51925192 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
51935193 ; AVX512VL-NEXT: callq __truncdfhf2
53565356 ; AVX512VL-NEXT: movzwl %ax, %ebx
53575357 ; AVX512VL-NEXT: orl %ebp, %ebx
53585358 ; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
5359 ; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0
5359 ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
53605360 ; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
53615361 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
53625362 ; AVX512VL-NEXT: callq __truncdfhf2
55275527 ; AVX512VL-NEXT: movzwl %ax, %ebx
55285528 ; AVX512VL-NEXT: orl %ebp, %ebx
55295529 ; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
5530 ; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0
5530 ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
55315531 ; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
55325532 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
55335533 ; AVX512VL-NEXT: callq __truncdfhf2
57745774 ; AVX512VL-NEXT: callq __truncdfhf2
57755775 ; AVX512VL-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
57765776 ; AVX512VL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
5777 ; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0
5777 ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
57785778 ; AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
57795779 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
57805780 ; AVX512VL-NEXT: callq __truncdfhf2
57865786 ; AVX512VL-NEXT: callq __truncdfhf2
57875787 ; AVX512VL-NEXT: movl %eax, %r12d
57885788 ; AVX512VL-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
5789 ; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0
5789 ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
57905790 ; AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
57915791 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
57925792 ; AVX512VL-NEXT: callq __truncdfhf2
709709 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
710710 ; AVX2-NEXT: retq
711711 ;
712 ; AVX512VLCD-LABEL: testv32i8:
713 ; AVX512VLCD: ## BB#0:
714 ; AVX512VLCD-NEXT: vextracti32x4 $1, %ymm0, %xmm1
715 ; AVX512VLCD-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
716 ; AVX512VLCD-NEXT: vplzcntd %zmm1, %zmm1
717 ; AVX512VLCD-NEXT: vpmovdb %zmm1, %xmm1
718 ; AVX512VLCD-NEXT: vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
719 ; AVX512VLCD-NEXT: vpsubb %xmm2, %xmm1, %xmm1
720 ; AVX512VLCD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
721 ; AVX512VLCD-NEXT: vplzcntd %zmm0, %zmm0
722 ; AVX512VLCD-NEXT: vpmovdb %zmm0, %xmm0
723 ; AVX512VLCD-NEXT: vpsubb %xmm2, %xmm0, %xmm0
724 ; AVX512VLCD-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
725 ; AVX512VLCD-NEXT: retq
726 ;
727 ; AVX512CD-LABEL: testv32i8:
728 ; AVX512CD: ## BB#0:
729 ; AVX512CD-NEXT: vextracti128 $1, %ymm0, %xmm1
730 ; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
731 ; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1
732 ; AVX512CD-NEXT: vpmovdb %zmm1, %xmm1
733 ; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
734 ; AVX512CD-NEXT: vpsubb %xmm2, %xmm1, %xmm1
735 ; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
736 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
737 ; AVX512CD-NEXT: vpmovdb %zmm0, %xmm0
738 ; AVX512CD-NEXT: vpsubb %xmm2, %xmm0, %xmm0
739 ; AVX512CD-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
740 ; AVX512CD-NEXT: retq
712 ; AVX512-LABEL: testv32i8:
713 ; AVX512: ## BB#0:
714 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
715 ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
716 ; AVX512-NEXT: vplzcntd %zmm1, %zmm1
717 ; AVX512-NEXT: vpmovdb %zmm1, %xmm1
718 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
719 ; AVX512-NEXT: vpsubb %xmm2, %xmm1, %xmm1
720 ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
721 ; AVX512-NEXT: vplzcntd %zmm0, %zmm0
722 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
723 ; AVX512-NEXT: vpsubb %xmm2, %xmm0, %xmm0
724 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
725 ; AVX512-NEXT: retq
741726 ;
742727 ; X32-AVX-LABEL: testv32i8:
743728 ; X32-AVX: # BB#0:
798783 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
799784 ; AVX2-NEXT: retq
800785 ;
801 ; AVX512VLCD-LABEL: testv32i8u:
802 ; AVX512VLCD: ## BB#0:
803 ; AVX512VLCD-NEXT: vextracti32x4 $1, %ymm0, %xmm1
804 ; AVX512VLCD-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
805 ; AVX512VLCD-NEXT: vplzcntd %zmm1, %zmm1
806 ; AVX512VLCD-NEXT: vpmovdb %zmm1, %xmm1
807 ; AVX512VLCD-NEXT: vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
808 ; AVX512VLCD-NEXT: vpsubb %xmm2, %xmm1, %xmm1
809 ; AVX512VLCD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
810 ; AVX512VLCD-NEXT: vplzcntd %zmm0, %zmm0
811 ; AVX512VLCD-NEXT: vpmovdb %zmm0, %xmm0
812 ; AVX512VLCD-NEXT: vpsubb %xmm2, %xmm0, %xmm0
813 ; AVX512VLCD-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
814 ; AVX512VLCD-NEXT: retq
815 ;
816 ; AVX512CD-LABEL: testv32i8u:
817 ; AVX512CD: ## BB#0:
818 ; AVX512CD-NEXT: vextracti128 $1, %ymm0, %xmm1
819 ; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
820 ; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1
821 ; AVX512CD-NEXT: vpmovdb %zmm1, %xmm1
822 ; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
823 ; AVX512CD-NEXT: vpsubb %xmm2, %xmm1, %xmm1
824 ; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
825 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
826 ; AVX512CD-NEXT: vpmovdb %zmm0, %xmm0
827 ; AVX512CD-NEXT: vpsubb %xmm2, %xmm0, %xmm0
828 ; AVX512CD-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
829 ; AVX512CD-NEXT: retq
786 ; AVX512-LABEL: testv32i8u:
787 ; AVX512: ## BB#0:
788 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
789 ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
790 ; AVX512-NEXT: vplzcntd %zmm1, %zmm1
791 ; AVX512-NEXT: vpmovdb %zmm1, %xmm1
792 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
793 ; AVX512-NEXT: vpsubb %xmm2, %xmm1, %xmm1
794 ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
795 ; AVX512-NEXT: vplzcntd %zmm0, %zmm0
796 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
797 ; AVX512-NEXT: vpsubb %xmm2, %xmm0, %xmm0
798 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
799 ; AVX512-NEXT: retq
830800 ;
831801 ; X32-AVX-LABEL: testv32i8u:
832802 ; X32-AVX: # BB#0:
17881788 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
17891789 ; AVX1-NEXT: retq
17901790 ;
1791 ; AVX2-LABEL: shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_11:
1792 ; AVX2: # BB#0:
1793 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1794 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1795 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,0,2,4,5,6,7]
1796 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
1797 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
1798 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1799 ; AVX2-NEXT: retq
1800 ;
1801 ; AVX512VL-LABEL: shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_11:
1802 ; AVX512VL: # BB#0:
1803 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
1804 ; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1805 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,0,2,4,5,6,7]
1806 ; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
1807 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
1808 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
1809 ; AVX512VL-NEXT: retq
1791 ; AVX2OR512VL-LABEL: shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_11:
1792 ; AVX2OR512VL: # BB#0:
1793 ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
1794 ; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1795 ; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,0,2,4,5,6,7]
1796 ; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
1797 ; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
1798 ; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1799 ; AVX2OR512VL-NEXT: retq
18101800 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 2, i32 3, i32 2, i32 11, i32 8, i32 9, i32 8, i32 9, i32 10, i32 11, i32 10, i32 11>
18111801 ret <16 x i16> %shuffle
18121802 }
18211811 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
18221812 ; AVX1-NEXT: retq
18231813 ;
1824 ; AVX2-LABEL: shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_09:
1825 ; AVX2: # BB#0:
1826 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1827 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
1828 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
1829 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
1830 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1831 ; AVX2-NEXT: retq
1832 ;
1833 ; AVX512VL-LABEL: shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_09:
1834 ; AVX512VL: # BB#0:
1835 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
1836 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
1837 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
1838 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
1839 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
1840 ; AVX512VL-NEXT: retq
1814 ; AVX2OR512VL-LABEL: shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_09:
1815 ; AVX2OR512VL: # BB#0:
1816 ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
1817 ; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
1818 ; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
1819 ; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
1820 ; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1821 ; AVX2OR512VL-NEXT: retq
18411822 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 9, i32 14, i32 15, i32 12, i32 13, i32 10, i32 11, i32 8, i32 9>
18421823 ret <16 x i16> %shuffle
18431824 }
18841865 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
18851866 ; AVX1-NEXT: retq
18861867 ;
1887 ; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08:
1888 ; AVX2: # BB#0:
1889 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1890 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1891 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,2,3]
1892 ; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
1893 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1894 ; AVX2-NEXT: retq
1895 ;
1896 ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08:
1897 ; AVX512VL: # BB#0:
1898 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
1899 ; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1900 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,2,3]
1901 ; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1
1902 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
1903 ; AVX512VL-NEXT: retq
1868 ; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08:
1869 ; AVX2OR512VL: # BB#0:
1870 ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
1871 ; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1872 ; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,2,3]
1873 ; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm1
1874 ; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1875 ; AVX2OR512VL-NEXT: retq
19041876 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
19051877 ret <16 x i16> %shuffle
19061878 }
19181890 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
19191891 ; AVX1-NEXT: retq
19201892 ;
1921 ; AVX2-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12:
1922 ; AVX2: # BB#0:
1923 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1924 ; AVX2-NEXT: vpsllq $48, %xmm1, %xmm2
1925 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1926 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,7]
1927 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
1928 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
1929 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
1930 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1931 ; AVX2-NEXT: retq
1932 ;
1933 ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12:
1934 ; AVX512VL: # BB#0:
1935 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
1936 ; AVX512VL-NEXT: vpsllq $48, %xmm1, %xmm2
1937 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1938 ; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,7]
1939 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
1940 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
1941 ; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
1942 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
1943 ; AVX512VL-NEXT: retq
1893 ; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12:
1894 ; AVX2OR512VL: # BB#0:
1895 ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
1896 ; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2
1897 ; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1898 ; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,7]
1899 ; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
1900 ; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
1901 ; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
1902 ; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1903 ; AVX2OR512VL-NEXT: retq
19441904 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 12, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
19451905 ret <16 x i16> %shuffle
19461906 }
19561916 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
19571917 ; AVX1-NEXT: retq
19581918 ;
1959 ; AVX2-LABEL: shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_11:
1960 ; AVX2: # BB#0:
1961 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1962 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1963 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1964 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
1965 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
1966 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
1967 ; AVX2-NEXT: retq
1968 ;
1969 ; AVX512VL-LABEL: shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_11:
1970 ; AVX512VL: # BB#0:
1971 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
1972 ; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1973 ; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1974 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
1975 ; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
1976 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm0
1977 ; AVX512VL-NEXT: retq
1919 ; AVX2OR512VL-LABEL: shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_11:
1920 ; AVX2OR512VL: # BB#0:
1921 ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
1922 ; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1923 ; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1924 ; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
1925 ; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
1926 ; AVX2OR512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
1927 ; AVX2OR512VL-NEXT: retq
19781928 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 11, i32 undef, i32 8, i32 undef, i32 9, i32 undef, i32 10, i32 undef, i32 11>
19791929 ret <16 x i16> %shuffle
19801930 }
19901940 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
19911941 ; AVX1-NEXT: retq
19921942 ;
1993 ; AVX2-LABEL: shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_15:
1994 ; AVX2: # BB#0:
1995 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1996 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1997 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1998 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
1999 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
2000 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
2001 ; AVX2-NEXT: retq
2002 ;
2003 ; AVX512VL-LABEL: shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_15:
2004 ; AVX512VL: # BB#0:
2005 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
2006 ; AVX512VL-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2007 ; AVX512VL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2008 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
2009 ; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
2010 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm0
2011 ; AVX512VL-NEXT: retq
1943 ; AVX2OR512VL-LABEL: shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_15:
1944 ; AVX2OR512VL: # BB#0:
1945 ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
1946 ; AVX2OR512VL-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1947 ; AVX2OR512VL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1948 ; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
1949 ; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
1950 ; AVX2OR512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
1951 ; AVX2OR512VL-NEXT: retq
20121952 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 15, i32 undef, i32 12, i32 undef, i32 13, i32 undef, i32 14, i32 undef, i32 15>
20131953 ret <16 x i16> %shuffle
20141954 }
20251965 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
20261966 ; AVX1-NEXT: retq
20271967 ;
2028 ; AVX2-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13:
2029 ; AVX2: # BB#0:
2030 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2031 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
2032 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
2033 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
2034 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7]
2035 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2]
2036 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2037 ; AVX2-NEXT: retq
2038 ;
2039 ; AVX512VL-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13:
2040 ; AVX512VL: # BB#0:
2041 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
2042 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
2043 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
2044 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
2045 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7]
2046 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2]
2047 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
2048 ; AVX512VL-NEXT: retq
1968 ; AVX2OR512VL-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13:
1969 ; AVX2OR512VL: # BB#0:
1970 ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
1971 ; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
1972 ; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
1973 ; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
1974 ; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7]
1975 ; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2]
1976 ; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1977 ; AVX2OR512VL-NEXT: retq
20491978 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 1, i32 2, i32 0, i32 6, i32 7, i32 4, i32 13, i32 11, i32 9, i32 10, i32 8, i32 14, i32 15, i32 12, i32 13>
20501979 ret <16 x i16> %shuffle
20511980 }
20611990 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
20621991 ; AVX1-NEXT: retq
20631992 ;
2064 ; AVX2-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08:
2065 ; AVX2: # BB#0:
2066 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2067 ; AVX2-NEXT: vpbroadcastw %xmm1, %xmm2
2068 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,14,15]
2069 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2070 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1]
2071 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2072 ; AVX2-NEXT: retq
2073 ;
2074 ; AVX512VL-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08:
2075 ; AVX512VL: # BB#0:
2076 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
2077 ; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm2
2078 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,14,15]
2079 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2080 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1]
2081 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
2082 ; AVX512VL-NEXT: retq
1993 ; AVX2OR512VL-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08:
1994 ; AVX2OR512VL: # BB#0:
1995 ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
1996 ; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm2
1997 ; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,14,15]
1998 ; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
1999 ; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1]
2000 ; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2001 ; AVX2OR512VL-NEXT: retq
20832002 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
20842003 ret <16 x i16> %shuffle
20852004 }
20942013 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
20952014 ; AVX1-NEXT: retq
20962015 ;
2097 ; AVX2-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_13:
2098 ; AVX2: # BB#0:
2099 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2100 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
2101 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
2102 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,3,2]
2103 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2104 ; AVX2-NEXT: retq
2105 ;
2106 ; AVX512VL-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_13:
2107 ; AVX512VL: # BB#0:
2108 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
2109 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
2110 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
2111 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,3,2]
2112 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
2113 ; AVX512VL-NEXT: retq
2016 ; AVX2OR512VL-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_13:
2017 ; AVX2OR512VL: # BB#0:
2018 ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
2019 ; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
2020 ; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
2021 ; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,3,2]
2022 ; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2023 ; AVX2OR512VL-NEXT: retq
21142024 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 13, i32 10, i32 11, i32 8, i32 9, i32 14, i32 15, i32 12, i32 13>
21152025 ret <16 x i16> %shuffle
21162026 }
21272037 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
21282038 ; AVX1-NEXT: retq
21292039 ;
2130 ; AVX2-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13:
2131 ; AVX2: # BB#0:
2132 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2133 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
2134 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7]
2135 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
2136 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,3,0,2,4,5,6,7]
2137 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2]
2138 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2139 ; AVX2-NEXT: retq
2140 ;
2141 ; AVX512VL-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13:
2142 ; AVX512VL: # BB#0:
2143 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
2144 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
2145 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7]
2146 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
2147 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,3,0,2,4,5,6,7]
2148 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2]
2149 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
2150 ; AVX512VL-NEXT: retq
2040 ; AVX2OR512VL-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13:
2041 ; AVX2OR512VL: # BB#0:
2042 ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
2043 ; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
2044 ; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7]
2045 ; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
2046 ; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,3,0,2,4,5,6,7]
2047 ; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2]
2048 ; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2049 ; AVX2OR512VL-NEXT: retq
21512050 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 3, i32 0, i32 2, i32 6, i32 7, i32 4, i32 13, i32 10, i32 11, i32 8, i32 10, i32 14, i32 15, i32 12, i32 13>
21522051 ret <16 x i16> %shuffle
21532052 }
21632062 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
21642063 ; AVX1-NEXT: retq
21652064 ;
2166 ; AVX2-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15:
2167 ; AVX2: # BB#0:
2168 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2169 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
2170 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
2171 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,3]
2172 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,4,7]
2173 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2174 ; AVX2-NEXT: retq
2175 ;
2176 ; AVX512VL-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15:
2177 ; AVX512VL: # BB#0:
2178 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
2179 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
2180 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
2181 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,3]
2182 ; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,4,7]
2183 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
2184 ; AVX512VL-NEXT: retq
2065 ; AVX2OR512VL-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15:
2066 ; AVX2OR512VL: # BB#0:
2067 ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
2068 ; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
2069 ; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
2070 ; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,3]
2071 ; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,4,7]
2072 ; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2073 ; AVX2OR512VL-NEXT: retq
21852074 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 15, i32 10, i32 11, i32 8, i32 9, i32 14, i32 15, i32 12, i32 15>
21862075 ret <16 x i16> %shuffle
21872076 }
22092098 ;
22102099 ; AVX512VL-LABEL: shuffle_v16i16_07_05_06_04_03_01_02_08_15_13_14_12_11_09_10_08:
22112100 ; AVX512VL: # BB#0:
2212 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
2101 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
22132102 ; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1]
22142103 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3
22152104 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
22162105 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2217 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm0
2106 ; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
22182107 ; AVX512VL-NEXT: retq
22192108 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 5, i32 6, i32 4, i32 3, i32 1, i32 2, i32 8, i32 15, i32 13, i32 14, i32 12, i32 11, i32 9, i32 10, i32 8>
22202109 ret <16 x i16> %shuffle
22312120 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
22322121 ; AVX1-NEXT: retq
22332122 ;
2234 ; AVX2-LABEL: shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08:
2235 ; AVX2: # BB#0:
2236 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2237 ; AVX2-NEXT: vpbroadcastw %xmm1, %xmm2
2238 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,2,3]
2239 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2240 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1]
2241 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2242 ; AVX2-NEXT: retq
2243 ;
2244 ; AVX512VL-LABEL: shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08:
2245 ; AVX512VL: # BB#0:
2246 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
2247 ; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm2
2248 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,2,3]
2249 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2250 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1]
2251 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
2252 ; AVX512VL-NEXT: retq
2123 ; AVX2OR512VL-LABEL: shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08:
2124 ; AVX2OR512VL: # BB#0:
2125 ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
2126 ; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm2
2127 ; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,2,3]
2128 ; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2129 ; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1]
2130 ; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2131 ; AVX2OR512VL-NEXT: retq
22532132 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 0, i32 5, i32 4, i32 5, i32 4, i32 1, i32 8, i32 9, i32 8, i32 13, i32 12, i32 13, i32 12, i32 9, i32 8>
22542133 ret <16 x i16> %shuffle
22552134 }
22652144 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
22662145 ; AVX1-NEXT: retq
22672146 ;
2268 ; AVX2-LABEL: shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08:
2269 ; AVX2: # BB#0:
2270 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2271 ; AVX2-NEXT: vpbroadcastw %xmm1, %xmm2
2272 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,2,3]
2273 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2274 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1]
2275 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2276 ; AVX2-NEXT: retq
2277 ;
2278 ; AVX512VL-LABEL: shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08:
2279 ; AVX512VL: # BB#0:
2280 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
2281 ; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm2
2282 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,2,3]
2283 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2284 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1]
2285 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
2286 ; AVX512VL-NEXT: retq
2147 ; AVX2OR512VL-LABEL: shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08:
2148 ; AVX2OR512VL: # BB#0:
2149 ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
2150 ; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm2
2151 ; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,2,3]
2152 ; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2153 ; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1]
2154 ; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2155 ; AVX2OR512VL-NEXT: retq
22872156 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 5, i32 4, i32 1, i32 0, i32 5, i32 4, i32 1, i32 8, i32 13, i32 12, i32 9, i32 8, i32 13, i32 12, i32 9, i32 8>
22882157 ret <16 x i16> %shuffle
22892158 }
22992168 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
23002169 ; AVX1-NEXT: retq
23012170 ;
2302 ; AVX2-LABEL: shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12:
2303 ; AVX2: # BB#0:
2304 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2305 ; AVX2-NEXT: vpsllq $48, %xmm1, %xmm2
2306 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,2,3]
2307 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2308 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9]
2309 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2310 ; AVX2-NEXT: retq
2311 ;
2312 ; AVX512VL-LABEL: shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12:
2313 ; AVX512VL: # BB#0:
2314 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
2315 ; AVX512VL-NEXT: vpsllq $48, %xmm1, %xmm2
2316 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,2,3]
2317 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2318 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9]
2319 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
2320 ; AVX512VL-NEXT: retq
2171 ; AVX2OR512VL-LABEL: shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12:
2172 ; AVX2OR512VL: # BB#0:
2173 ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
2174 ; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2
2175 ; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,2,3]
2176 ; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2177 ; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9]
2178 ; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2179 ; AVX2OR512VL-NEXT: retq
23212180 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 5, i32 4, i32 1, i32 0, i32 1, i32 0, i32 5, i32 12, i32 13, i32 12, i32 9, i32 8, i32 9, i32 8, i32 13, i32 12>
23222181 ret <16 x i16> %shuffle
23232182 }
23332192 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
23342193 ; AVX1-NEXT: retq
23352194 ;
2336 ; AVX2-LABEL: shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08:
2337 ; AVX2: # BB#0:
2338 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2339 ; AVX2-NEXT: vpbroadcastw %xmm1, %xmm2
2340 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,2,3]
2341 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2342 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1]
2343 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2344 ; AVX2-NEXT: retq
2345 ;
2346 ; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08:
2347 ; AVX512VL: # BB#0:
2348 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
2349 ; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm2
2350 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,2,3]
2351 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2352 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1]
2353 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
2354 ; AVX512VL-NEXT: retq
2195 ; AVX2OR512VL-LABEL: shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08:
2196 ; AVX2OR512VL: # BB#0:
2197 ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
2198 ; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm2
2199 ; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,2,3]
2200 ; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2201 ; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1]
2202 ; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2203 ; AVX2OR512VL-NEXT: retq
23552204 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 4, i32 4, i32 0, i32 0, i32 4, i32 4, i32 8, i32 8, i32 12, i32 12, i32 8, i32 8, i32 12, i32 12, i32 8>
23562205 ret <16 x i16> %shuffle
23572206 }
23672216 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
23682217 ; AVX1-NEXT: retq
23692218 ;
2370 ; AVX2-LABEL: shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12:
2371 ; AVX2: # BB#0:
2372 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2373 ; AVX2-NEXT: vpsllq $48, %xmm1, %xmm2
2374 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,2,3]
2375 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2376 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9]
2377 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2378 ; AVX2-NEXT: retq
2379 ;
2380 ; AVX512VL-LABEL: shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12:
2381 ; AVX512VL: # BB#0:
2382 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
2383 ; AVX512VL-NEXT: vpsllq $48, %xmm1, %xmm2
2384 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,2,3]
2385 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2386 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9]
2387 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
2388 ; AVX512VL-NEXT: retq
2219 ; AVX2OR512VL-LABEL: shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12:
2220 ; AVX2OR512VL: # BB#0:
2221 ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
2222 ; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2
2223 ; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,2,3]
2224 ; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2225 ; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9]
2226 ; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2227 ; AVX2OR512VL-NEXT: retq
23892228 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 0, i32 0, i32 4, i32 4, i32 0, i32 0, i32 12, i32 12, i32 8, i32 8, i32 12, i32 12, i32 8, i32 8, i32 12>
23902229 ret <16 x i16> %shuffle
23912230 }
24132252 ;
24142253 ; AVX512VL-LABEL: shuffle_v16i16_02_06_04_00_05_01_07_11_10_14_12_08_13_09_15_11:
24152254 ; AVX512VL: # BB#0:
2416 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
2255 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
24172256 ; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7]
24182257 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3
24192258 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
24202259 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2421 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm0
2260 ; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
24222261 ; AVX512VL-NEXT: retq
24232262 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 6, i32 4, i32 0, i32 5, i32 1, i32 7, i32 11, i32 10, i32 14, i32 12, i32 8, i32 13, i32 9, i32 15, i32 11>
24242263 ret <16 x i16> %shuffle
24472286 ;
24482287 ; AVX512VL-LABEL: shuffle_v16i16_02_00_06_04_05_01_07_11_10_08_14_12_13_09_15_11:
24492288 ; AVX512VL: # BB#0:
2450 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
2289 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
24512290 ; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7]
24522291 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3
24532292 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
24542293 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2455 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm0
2294 ; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
24562295 ; AVX512VL-NEXT: retq
24572296 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
24582297 ret <16 x i16> %shuffle
24812320 ;
24822321 ; AVX512VL-LABEL: shuffle_v16i16_02_06_04_00_01_03_07_13_10_14_12_08_09_11_15_13:
24832322 ; AVX512VL: # BB#0:
2484 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
2323 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
24852324 ; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11]
24862325 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3
24872326 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
24882327 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2489 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm0
2328 ; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
24902329 ; AVX512VL-NEXT: retq
24912330 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 6, i32 4, i32 0, i32 1, i32 3, i32 7, i32 13, i32 10, i32 14, i32 12, i32 8, i32 9, i32 11, i32 15, i32 13>
24922331 ret <16 x i16> %shuffle
25152354 ;
25162355 ; AVX512VL-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11:
25172356 ; AVX512VL: # BB#0:
2518 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
2357 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
25192358 ; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7]
25202359 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3
25212360 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
25222361 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2523 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm0
2362 ; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
25242363 ; AVX512VL-NEXT: retq
25252364 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
25262365 ret <16 x i16> %shuffle
25372376 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
25382377 ; AVX1-NEXT: retq
25392378 ;
2540 ; AVX2-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12:
2541 ; AVX2: # BB#0:
2542 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2543 ; AVX2-NEXT: vpsllq $48, %xmm1, %xmm2
2544 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,14,15]
2545 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2546 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9]
2547 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2548 ; AVX2-NEXT: retq
2549 ;
2550 ; AVX512VL-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12:
2551 ; AVX512VL: # BB#0:
2552 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
2553 ; AVX512VL-NEXT: vpsllq $48, %xmm1, %xmm2
2554 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,14,15]
2555 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2556 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9]
2557 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
2558 ; AVX512VL-NEXT: retq
2379 ; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12:
2380 ; AVX2OR512VL: # BB#0:
2381 ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
2382 ; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2
2383 ; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,14,15]
2384 ; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2385 ; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9]
2386 ; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2387 ; AVX2OR512VL-NEXT: retq
25592388 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 4, i32 12, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12>
25602389 ret <16 x i16> %shuffle
25612390 }
25712400 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
25722401 ; AVX1-NEXT: retq
25732402 ;
2574 ; AVX2-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12:
2575 ; AVX2: # BB#0:
2576 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2577 ; AVX2-NEXT: vpsllq $48, %xmm1, %xmm2
2578 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,14,15]
2579 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2580 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9]
2581 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2582 ; AVX2-NEXT: retq
2583 ;
2584 ; AVX512VL-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12:
2585 ; AVX512VL: # BB#0:
2586 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
2587 ; AVX512VL-NEXT: vpsllq $48, %xmm1, %xmm2
2588 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,14,15]
2589 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2590 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9]
2591 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
2592 ; AVX512VL-NEXT: retq
2403 ; AVX2OR512VL-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12:
2404 ; AVX2OR512VL: # BB#0:
2405 ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
2406 ; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2
2407 ; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,14,15]
2408 ; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2409 ; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9]
2410 ; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2411 ; AVX2OR512VL-NEXT: retq
25932412 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 0, i32 0, i32 4, i32 4, i32 4, i32 12, i32 12, i32 12, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
25942413 ret <16 x i16> %shuffle
25952414 }
26052424 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
26062425 ; AVX1-NEXT: retq
26072426 ;
2608 ; AVX2-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12:
2609 ; AVX2: # BB#0:
2610 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2611 ; AVX2-NEXT: vpsllq $48, %xmm1, %xmm2
2612 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15]
2613 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2614 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
2615 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2616 ; AVX2-NEXT: retq
2617 ;
2618 ; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12:
2619 ; AVX512VL: # BB#0:
2620 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
2621 ; AVX512VL-NEXT: vpsllq $48, %xmm1, %xmm2
2622 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15]
2623 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2624 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
2625 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
2626 ; AVX512VL-NEXT: retq
2427 ; AVX2OR512VL-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12:
2428 ; AVX2OR512VL: # BB#0:
2429 ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
2430 ; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2
2431 ; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15]
2432 ; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2433 ; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
2434 ; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2435 ; AVX2OR512VL-NEXT: retq
26272436 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 4, i32 4, i32 0, i32 4, i32 4, i32 4, i32 12, i32 8, i32 12, i32 12, i32 8, i32 12, i32 12, i32 12, i32 12>
26282437 ret <16 x i16> %shuffle
26292438 }
26392448 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
26402449 ; AVX1-NEXT: retq
26412450 ;
2642 ; AVX2-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08:
2643 ; AVX2: # BB#0:
2644 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2645 ; AVX2-NEXT: vpbroadcastw %xmm1, %xmm2
2646 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,14,15]
2647 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2648 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1]
2649 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2650 ; AVX2-NEXT: retq
2651 ;
2652 ; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08:
2653 ; AVX512VL: # BB#0:
2654 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
2655 ; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm2
2656 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,14,15]
2657 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2658 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1]
2659 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
2660 ; AVX512VL-NEXT: retq
2451 ; AVX2OR512VL-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08:
2452 ; AVX2OR512VL: # BB#0:
2453 ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
2454 ; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm2
2455 ; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,14,15]
2456 ; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2457 ; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1]
2458 ; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2459 ; AVX2OR512VL-NEXT: retq
26612460 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 4, i32 4, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 12, i32 12, i32 8, i32 8, i32 8, i32 8, i32 8>
26622461 ret <16 x i16> %shuffle
26632462 }
26742473 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
26752474 ; AVX1-NEXT: retq
26762475 ;
2677 ; AVX2-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15:
2678 ; AVX2: # BB#0:
2679 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2680 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2681 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
2682 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
2683 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2684 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,0,4,5,6,7]
2685 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2686 ; AVX2-NEXT: retq
2687 ;
2688 ; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15:
2689 ; AVX512VL: # BB#0:
2690 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
2691 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2692 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
2693 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
2694 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2695 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,0,4,5,6,7]
2696 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
2697 ; AVX512VL-NEXT: retq
2476 ; AVX2OR512VL-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15:
2477 ; AVX2OR512VL: # BB#0:
2478 ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
2479 ; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2480 ; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
2481 ; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
2482 ; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2483 ; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,0,4,5,6,7]
2484 ; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2485 ; AVX2OR512VL-NEXT: retq
26982486 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 4, i32 4, i32 0, i32 4, i32 5, i32 6, i32 15, i32 8, i32 12, i32 12, i32 8, i32 12, i32 13, i32 14, i32 15>
26992487 ret <16 x i16> %shuffle
27002488 }
27102498 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
27112499 ; AVX1-NEXT: retq
27122500 ;
2713 ; AVX2-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12:
2714 ; AVX2: # BB#0:
2715 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2716 ; AVX2-NEXT: vpsllq $48, %xmm1, %xmm2
2717 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,14,15]
2718 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2719 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9]
2720 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2721 ; AVX2-NEXT: retq
2722 ;
2723 ; AVX512VL-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12:
2724 ; AVX512VL: # BB#0:
2725 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
2726 ; AVX512VL-NEXT: vpsllq $48, %xmm1, %xmm2
2727 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,14,15]
2728 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2729 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9]
2730 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
2731 ; AVX512VL-NEXT: retq
2501 ; AVX2OR512VL-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12:
2502 ; AVX2OR512VL: # BB#0:
2503 ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
2504 ; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2
2505 ; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,14,15]
2506 ; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2507 ; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9]
2508 ; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2509 ; AVX2OR512VL-NEXT: retq
27322510 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 undef, i32 4, i32 4, i32 4, i32 4, i32 4, i32 12, i32 8, i32 undef, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12>
27332511 ret <16 x i16> %shuffle
27342512 }
27442522 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
27452523 ; AVX1-NEXT: retq
27462524 ;
2747 ; AVX2-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12:
2748 ; AVX2: # BB#0:
2749 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2750 ; AVX2-NEXT: vpsllq $48, %xmm1, %xmm2
2751 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,14,15]
2752 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2753 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
2754 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2755 ; AVX2-NEXT: retq
2756 ;
2757 ; AVX512VL-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12:
2758 ; AVX512VL: # BB#0:
2759 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
2760 ; AVX512VL-NEXT: vpsllq $48, %xmm1, %xmm2
2761 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,14,15]
2762 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2763 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
2764 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
2765 ; AVX512VL-NEXT: retq
2525 ; AVX2OR512VL-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12:
2526 ; AVX2OR512VL: # BB#0:
2527 ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
2528 ; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2
2529 ; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,14,15]
2530 ; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2531 ; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
2532 ; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2533 ; AVX2OR512VL-NEXT: retq
27662534 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 undef, i32 0, i32 4, i32 4, i32 4, i32 12, i32 12, i32 12, i32 undef, i32 8, i32 12, i32 12, i32 12, i32 12>
27672535 ret <16 x i16> %shuffle
27682536 }
27782546 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
27792547 ; AVX1-NEXT: retq
27802548 ;
2781 ; AVX2-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12:
2782 ; AVX2: # BB#0:
2783 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2784 ; AVX2-NEXT: vpsllq $48, %xmm1, %xmm2
2785 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15]
2786 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2787 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
2788 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2789 ; AVX2-NEXT: retq
2790 ;
2791 ; AVX512VL-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12:
2792 ; AVX512VL: # BB#0:
2793 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
2794 ; AVX512VL-NEXT: vpsllq $48, %xmm1, %xmm2
2795 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15]
2796 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2797 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
2798 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
2799 ; AVX512VL-NEXT: retq
2549 ; AVX2OR512VL-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12:
2550 ; AVX2OR512VL: # BB#0:
2551 ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
2552 ; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2
2553 ; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15]
2554 ; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
2555 ; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
2556 ; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
2557 ; AVX2OR512VL-NEXT: retq
28002558 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 4, i32 4, i32 0, i32 4, i32 4, i32 4, i32 12, i32 undef, i32 12, i32 12, i32 8, i32 12, i32 12, i32 12, i32 12>
28012559 ret <16 x i16> %shuffle
28022560 }
28472605 ;
28482606 ; AVX512VL-LABEL: shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_11:
28492607 ; AVX512VL: # BB#0:
2850 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
2608 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
28512609 ; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm2
28522610 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
28532611 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,0]
28542612 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
28552613 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
2856 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
2614 ; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
28572615 ; AVX512VL-NEXT: retq
28582616 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 11, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 11>
28592617 ret <16 x i16> %shuffle
29252683 ;
29262684 ; AVX512VL-LABEL: shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_11:
29272685 ; AVX512VL: # BB#0:
2928 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
2686 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
29292687 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
29302688 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
29312689 ; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
29332691 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
29342692 ; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
29352693 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,1,2]
2936 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
2694 ; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
29372695 ; AVX512VL-NEXT: retq
29382696 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 7, i32 4, i32 5, i32 6, i32 11, i32 8, i32 9, i32 10, i32 15, i32 12, i32 13, i32 14, i32 11>
29392697 ret <16 x i16> %shuffle
29602718 ;
29612719 ; AVX512VL-LABEL: shuffle_v16i16_04_05_06_03_00_01_02_15_12_13_14_11_08_09_10_15:
29622720 ; AVX512VL: # BB#0:
2963 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
2721 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
29642722 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,2,3]
29652723 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
29662724 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,0]
29672725 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
29682726 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,0,3,1]
2969 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
2727 ; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
29702728 ; AVX512VL-NEXT: retq
29712729 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 5, i32 6, i32 3, i32 0, i32 1, i32 2, i32 15, i32 12, i32 13, i32 14, i32 11, i32 8, i32 9, i32 10, i32 15>
29722730 ret <16 x i16> %shuffle
29952753 ;
29962754 ; AVX512VL-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13:
29972755 ; AVX512VL: # BB#0:
2998 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
2756 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
29992757 ; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11]
30002758 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3
30012759 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
30022760 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
3003 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm0
2761 ; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
30042762 ; AVX512VL-NEXT: retq
30052763 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 7, i32 1, i32 0, i32 2, i32 7, i32 3, i32 13, i32 11, i32 15, i32 9, i32 8, i32 10, i32 15, i32 11, i32 13>
30062764 ret <16 x i16> %shuffle
36923450 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
36933451 ; AVX1-NEXT: retq
36943452 ;
3695 ; AVX2-LABEL: shuffle_v16i16_05_06_07_00_01_02_03_12_13_14_15_08_09_10_11_12:
3696 ; AVX2: # BB#0:
3697 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3698 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7]
3699 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
3700 ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
3701 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
3702 ; AVX2-NEXT: retq
3703 ;
3704 ; AVX512VL-LABEL: shuffle_v16i16_05_06_07_00_01_02_03_12_13_14_15_08_09_10_11_12:
3705 ; AVX512VL: # BB#0:
3706 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
3707 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7]
3708 ; AVX512VL-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
3709 ; AVX512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
3710 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
3711 ; AVX512VL-NEXT: retq
3453 ; AVX2OR512VL-LABEL: shuffle_v16i16_05_06_07_00_01_02_03_12_13_14_15_08_09_10_11_12:
3454 ; AVX2OR512VL: # BB#0:
3455 ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
3456 ; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7]
3457 ; AVX2OR512VL-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
3458 ; AVX2OR512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
3459 ; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
3460 ; AVX2OR512VL-NEXT: retq
37123461 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12>
37133462 ret <16 x i16> %shuffle
37143463 }
38083557 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
38093558 ; AVX1-NEXT: retq
38103559 ;
3811 ; AVX2-LABEL: shuffle_v16i16_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_10:
3812 ; AVX2: # BB#0:
3813 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3814 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
3815 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
3816 ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
3817 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
3818 ; AVX2-NEXT: retq
3819 ;
3820 ; AVX512VL-LABEL: shuffle_v16i16_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_10:
3821 ; AVX512VL: # BB#0:
3822 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
3823 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
3824 ; AVX512VL-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
3825 ; AVX512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
3826 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
3827 ; AVX512VL-NEXT: retq
3560 ; AVX2OR512VL-LABEL: shuffle_v16i16_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_10:
3561 ; AVX2OR512VL: # BB#0:
3562 ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
3563 ; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
3564 ; AVX2OR512VL-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
3565 ; AVX2OR512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
3566 ; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
3567 ; AVX2OR512VL-NEXT: retq
38283568 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10>
38293569 ret <16 x i16> %shuffle
38303570 }
39993739 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
40003740 ; AVX1-NEXT: retq
40013741 ;
4002 ; AVX2-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19:
4003 ; AVX2: # BB#0:
4004 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
4005 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4006 ; AVX2-NEXT: retq
4007 ;
4008 ; AVX512VL-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19:
4009 ; AVX512VL: # BB#0:
4010 ; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
4011 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
4012 ; AVX512VL-NEXT: retq
3742 ; AVX2OR512VL-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19:
3743 ; AVX2OR512VL: # BB#0:
3744 ; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3745 ; AVX2OR512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
3746 ; AVX2OR512VL-NEXT: retq
40133747 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19>
40143748 ret <16 x i16> %shuffle
40153749 }
40223756 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
40233757 ; AVX1-NEXT: retq
40243758 ;
4025 ; AVX2-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
4026 ; AVX2: # BB#0:
4027 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
4028 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
4029 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4030 ; AVX2-NEXT: retq
4031 ;
4032 ; AVX512VL-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
4033 ; AVX512VL: # BB#0:
4034 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
4035 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
4036 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
4037 ; AVX512VL-NEXT: retq
3759 ; AVX2OR512VL-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
3760 ; AVX2OR512VL: # BB#0:
3761 ; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
3762 ; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
3763 ; AVX2OR512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
3764 ; AVX2OR512VL-NEXT: retq
40383765 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
40393766 ret <16 x i16> %shuffle
40403767 }
40483775 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
40493776 ; AVX1-NEXT: retq
40503777 ;
4051 ; AVX2-LABEL: shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8:
4052 ; AVX2: # BB#0:
4053 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
4054 ; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
4055 ; AVX2-NEXT: retq
4056 ;
4057 ; AVX512VL-LABEL: shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8:
4058 ; AVX512VL: # BB#0:
4059 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0
4060 ; AVX512VL-NEXT: vpbroadcastw %xmm0, %ymm0
4061 ; AVX512VL-NEXT: retq
3778 ; AVX2OR512VL-LABEL: shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8:
3779 ; AVX2OR512VL: # BB#0:
3780 ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
3781 ; AVX2OR512VL-NEXT: vpbroadcastw %xmm0, %ymm0
3782 ; AVX2OR512VL-NEXT: retq
40623783 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
40633784 ret <16 x i16> %shuffle
40643785 }
40903811 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
40913812 ; AVX1-NEXT: retq
40923813 ;
4093 ; AVX2-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
4094 ; AVX2: # BB#0:
4095 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
4096 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
4097 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
4098 ; AVX2-NEXT: retq
4099 ;
4100 ; AVX512VL-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
4101 ; AVX512VL: # BB#0:
4102 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0
4103 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
4104 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
4105 ; AVX512VL-NEXT: retq
3814 ; AVX2OR512VL-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
3815 ; AVX2OR512VL: # BB#0:
3816 ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
3817 ; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
3818 ; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
3819 ; AVX2OR512VL-NEXT: retq
41063820 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
41073821 ret <16 x i16> %shuffle
41083822 }
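Throughout these v16i16 tests the previously separate AVX2 and AVX512VL check blocks collapse into the shared AVX2OR512VL prefix: once the 128-bit lane extracts and inserts print as vextracti128/vinserti128 rather than vextracti32x4/vinserti32x4, the two configurations emit identical assembly, so one block can check both. A minimal sketch of such a test follows; the RUN lines and -mattr strings are illustrative assumptions rather than quotes from this file, and the expected assembly is copied from the element-8 splat test above.

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2OR512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX2OR512VL

; Splat element 8 (the first element of the upper 128-bit lane); both targets
; should now select the same VEX-encoded extract/broadcast pair.
define <16 x i16> @splat_upper_lane_elt0(<16 x i16> %a) {
; AVX2OR512VL-LABEL: splat_upper_lane_elt0:
; AVX2OR512VL:       vextracti128 $1, %ymm0, %xmm0
; AVX2OR512VL-NEXT:  vpbroadcastw %xmm0, %ymm0
; AVX2OR512VL-NEXT:  retq
  %s = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
  ret <16 x i16> %s
}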
16921692 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
16931693 ; AVX1-NEXT: retq
16941694 ;
1695 ; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40:
1696 ; AVX2: # BB#0:
1697 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1698 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
1699 ; AVX2-NEXT: retq
1700 ;
1701 ; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40:
1702 ; AVX512VL: # BB#0:
1703 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
1704 ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
1705 ; AVX512VL-NEXT: retq
1695 ; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40:
1696 ; AVX2OR512VL: # BB#0:
1697 ; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1698 ; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
1699 ; AVX2OR512VL-NEXT: retq
17061700 %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40>
17071701 ret <32 x i8> %shuffle
17081702 }
17731767 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
17741768 ; AVX1-NEXT: retq
17751769 ;
1776 ; AVX2-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47:
1777 ; AVX2: # BB#0:
1778 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
1779 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1780 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
1781 ; AVX2-NEXT: retq
1782 ;
1783 ; AVX512VL-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47:
1784 ; AVX512VL: # BB#0:
1785 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
1786 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1787 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm0
1788 ; AVX512VL-NEXT: retq
1770 ; AVX2OR512VL-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47:
1771 ; AVX2OR512VL: # BB#0:
1772 ; AVX2OR512VL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
1773 ; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1774 ; AVX2OR512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
1775 ; AVX2OR512VL-NEXT: retq
17891776 %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47>
17901777 ret <32 x i8> %shuffle
17911778 }
21742161 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
21752162 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
21762163 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
2177 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
2164 ; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
21782165 ; AVX512VL-NEXT: retq
21792166 %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32>
21802167 ret <32 x i8> %shuffle
21892176 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
21902177 ; AVX1-NEXT: retq
21912178 ;
2192 ; AVX2-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
2193 ; AVX2: # BB#0:
2194 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
2195 ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
2196 ; AVX2-NEXT: retq
2197 ;
2198 ; AVX512VL-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
2199 ; AVX512VL: # BB#0:
2200 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0
2201 ; AVX512VL-NEXT: vpbroadcastb %xmm0, %ymm0
2202 ; AVX512VL-NEXT: retq
2179 ; AVX2OR512VL-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
2180 ; AVX2OR512VL: # BB#0:
2181 ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
2182 ; AVX2OR512VL-NEXT: vpbroadcastb %xmm0, %ymm0
2183 ; AVX2OR512VL-NEXT: retq
22032184 %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
22042185 ret <32 x i8> %shuffle
22052186 }
22632244 ;
22642245 ; AVX512VL-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
22652246 ; AVX512VL: # BB#0:
2266 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0
2247 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
22672248 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
22682249 ; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
22692250 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
319319 }
320320
321321 define <4 x double> @shuffle_v4f64_0145(<4 x double> %a, <4 x double> %b) {
322 ; AVX1-LABEL: shuffle_v4f64_0145:
323 ; AVX1: # BB#0:
324 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
325 ; AVX1-NEXT: retq
326 ;
327 ; AVX2-LABEL: shuffle_v4f64_0145:
328 ; AVX2: # BB#0:
329 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
330 ; AVX2-NEXT: retq
331 ;
332 ; AVX512VL-LABEL: shuffle_v4f64_0145:
333 ; AVX512VL: # BB#0:
334 ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0
335 ; AVX512VL-NEXT: retq
322 ; ALL-LABEL: shuffle_v4f64_0145:
323 ; ALL: # BB#0:
324 ; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
325 ; ALL-NEXT: retq
336326 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
337327 ret <4 x double> %shuffle
338328 }
339329
340330 define <4 x double> @shuffle_v4f64_4501(<4 x double> %a, <4 x double> %b) {
341 ; AVX1-LABEL: shuffle_v4f64_4501:
342 ; AVX1: # BB#0:
343 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
344 ; AVX1-NEXT: retq
345 ;
346 ; AVX2-LABEL: shuffle_v4f64_4501:
347 ; AVX2: # BB#0:
348 ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
349 ; AVX2-NEXT: retq
350 ;
351 ; AVX512VL-LABEL: shuffle_v4f64_4501:
352 ; AVX512VL: # BB#0:
353 ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm0, %ymm1, %ymm0
354 ; AVX512VL-NEXT: retq
331 ; ALL-LABEL: shuffle_v4f64_4501:
332 ; ALL: # BB#0:
333 ; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
334 ; ALL-NEXT: retq
355335 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
356336 ret <4 x double> %shuffle
357337 }
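The floating-point insert is rewritten the same way (vinsertf32x4 on a ymm becomes vinsertf128), so for these v4f64 lane concatenations AVX1, AVX2 and AVX512VL all produce the same single VEX instruction and the three per-target blocks fold into one ALL block. The usual motivation for the compression is code size: the EVEX prefix is four bytes where VEX needs only two or three, so the VEX form is never longer. Below is a hedged sketch mirroring shuffle_v4f64_0145 above; the RUN lines are illustrative, not taken from the actual file.

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=ALL

; Concatenate the low 128-bit halves of %a and %b; every configuration should
; select a plain VEX vinsertf128.
define <4 x double> @concat_low_halves(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: concat_low_halves:
; ALL:       vinsertf128 $1, %xmm1, %ymm0, %ymm0
; ALL-NEXT:  retq
  %s = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %s
}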
366346 }
367347
368348 define <4 x double> @shuffle_v4f64_1054(<4 x double> %a, <4 x double> %b) {
369 ; AVX1-LABEL: shuffle_v4f64_1054:
370 ; AVX1: # BB#0:
371 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
372 ; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
373 ; AVX1-NEXT: retq
374 ;
375 ; AVX2-LABEL: shuffle_v4f64_1054:
376 ; AVX2: # BB#0:
377 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
378 ; AVX2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
379 ; AVX2-NEXT: retq
380 ;
381 ; AVX512VL-LABEL: shuffle_v4f64_1054:
382 ; AVX512VL: # BB#0:
383 ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0
384 ; AVX512VL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
385 ; AVX512VL-NEXT: retq
349 ; ALL-LABEL: shuffle_v4f64_1054:
350 ; ALL: # BB#0:
351 ; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
352 ; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
353 ; ALL-NEXT: retq
386354 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 5, i32 4>
387355 ret <4 x double> %shuffle
388356 }
734702 ;
735703 ; AVX512VL-LABEL: shuffle_v4i64_0142:
736704 ; AVX512VL: # BB#0:
737 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm1
705 ; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
738706 ; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2]
739707 ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
740708 ; AVX512VL-NEXT: retq
807775 ;
808776 ; AVX512VL-LABEL: shuffle_v4i64_0145:
809777 ; AVX512VL: # BB#0:
810 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
778 ; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
811779 ; AVX512VL-NEXT: retq
812780 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
813781 ret <4 x i64> %shuffle
851819 ;
852820 ; AVX512VL-LABEL: shuffle_v4i64_4501:
853821 ; AVX512VL: # BB#0:
854 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0
822 ; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
855823 ; AVX512VL-NEXT: retq
856824 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
857825 ret <4 x i64> %shuffle
947915 ;
948916 ; AVX512VL-LABEL: shuffle_v4i64_1054:
949917 ; AVX512VL: # BB#0:
950 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
918 ; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
951919 ; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
952920 ; AVX512VL-NEXT: retq
953921 %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 5, i32 4>
14231391 ;
14241392 ; AVX512VL-LABEL: concat_v4i64_0145_bc:
14251393 ; AVX512VL: # BB#0:
1426 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
1394 ; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
14271395 ; AVX512VL-NEXT: retq
14281396 %a0lo = shufflevector <4 x i64> %a0, <4 x i64> %a1, <2 x i32> <i32 0, i32 1>
14291397 %a1lo = shufflevector <4 x i64> %a0, <4 x i64> %a1, <2 x i32> <i32 4, i32 5>
752752 }
753753
754754 define <8 x float> @shuffle_v8f32_3210ba98(<8 x float> %a, <8 x float> %b) {
755 ; AVX1OR2-LABEL: shuffle_v8f32_3210ba98:
756 ; AVX1OR2: # BB#0:
757 ; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
758 ; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
759 ; AVX1OR2-NEXT: retq
760 ;
761 ; AVX512VL-LABEL: shuffle_v8f32_3210ba98:
762 ; AVX512VL: # BB#0:
763 ; AVX512VL-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm0
764 ; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
765 ; AVX512VL-NEXT: retq
755 ; ALL-LABEL: shuffle_v8f32_3210ba98:
756 ; ALL: # BB#0:
757 ; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
758 ; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
759 ; ALL-NEXT: retq
766760 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 11, i32 10, i32 9, i32 8>
767761 ret <8 x float> %shuffle
768762 }
828822 }
829823
830824 define <8 x float> @shuffle_v8f32_ba983210(<8 x float> %a, <8 x float> %b) {
831 ; AVX1OR2-LABEL: shuffle_v8f32_ba983210:
832 ; AVX1OR2: # BB#0:
833 ; AVX1OR2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
834 ; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
835 ; AVX1OR2-NEXT: retq
836 ;
837 ; AVX512VL-LABEL: shuffle_v8f32_ba983210:
838 ; AVX512VL: # BB#0:
839 ; AVX512VL-NEXT: vinsertf64x2 $1, %xmm0, %ymm1, %ymm0
840 ; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
841 ; AVX512VL-NEXT: retq
825 ; ALL-LABEL: shuffle_v8f32_ba983210:
826 ; ALL: # BB#0:
827 ; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
828 ; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
829 ; ALL-NEXT: retq
842830 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 3, i32 2, i32 1, i32 0>
843831 ret <8 x float> %shuffle
844832 }
862850 }
863851
864852 define <8 x float> @shuffle_v8f32_uuuu1111(<8 x float> %a, <8 x float> %b) {
865 ; AVX1OR2-LABEL: shuffle_v8f32_uuuu1111:
866 ; AVX1OR2: # BB#0:
867 ; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
868 ; AVX1OR2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
869 ; AVX1OR2-NEXT: retq
870 ;
871 ; AVX512VL-LABEL: shuffle_v8f32_uuuu1111:
872 ; AVX512VL: # BB#0:
873 ; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
874 ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0
875 ; AVX512VL-NEXT: retq
853 ; ALL-LABEL: shuffle_v8f32_uuuu1111:
854 ; ALL: # BB#0:
855 ; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
856 ; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
857 ; ALL-NEXT: retq
876858 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 1, i32 1, i32 1>
877859 ret <8 x float> %shuffle
878860 }
884866 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
885867 ; AVX1-NEXT: retq
886868 ;
887 ; AVX2-LABEL: shuffle_v8f32_44444444:
888 ; AVX2: # BB#0:
889 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
890 ; AVX2-NEXT: vbroadcastss %xmm0, %ymm0
891 ; AVX2-NEXT: retq
892 ;
893 ; AVX512VL-LABEL: shuffle_v8f32_44444444:
894 ; AVX512VL: # BB#0:
895 ; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0
896 ; AVX512VL-NEXT: vbroadcastss %xmm0, %ymm0
897 ; AVX512VL-NEXT: retq
869 ; AVX2OR512VL-LABEL: shuffle_v8f32_44444444:
870 ; AVX2OR512VL: # BB#0:
871 ; AVX2OR512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
872 ; AVX2OR512VL-NEXT: vbroadcastss %xmm0, %ymm0
873 ; AVX2OR512VL-NEXT: retq
898874 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
899875 ret <8 x float> %shuffle
900876 }
909885 }
910886
911887 define <8 x float> @shuffle_v8f32_uuuu3210(<8 x float> %a, <8 x float> %b) {
912 ; AVX1OR2-LABEL: shuffle_v8f32_uuuu3210:
913 ; AVX1OR2: # BB#0:
914 ; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
915 ; AVX1OR2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
916 ; AVX1OR2-NEXT: retq
917 ;
918 ; AVX512VL-LABEL: shuffle_v8f32_uuuu3210:
919 ; AVX512VL: # BB#0:
920 ; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
921 ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0
922 ; AVX512VL-NEXT: retq
888 ; ALL-LABEL: shuffle_v8f32_uuuu3210:
889 ; ALL: # BB#0:
890 ; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
891 ; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
892 ; ALL-NEXT: retq
923893 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 3, i32 2, i32 1, i32 0>
924894 ret <8 x float> %shuffle
925895 }
926896
927897 define <8 x float> @shuffle_v8f32_uuuu1188(<8 x float> %a, <8 x float> %b) {
928 ; AVX1OR2-LABEL: shuffle_v8f32_uuuu1188:
929 ; AVX1OR2: # BB#0:
930 ; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,0]
931 ; AVX1OR2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
932 ; AVX1OR2-NEXT: retq
933 ;
934 ; AVX512VL-LABEL: shuffle_v8f32_uuuu1188:
935 ; AVX512VL: # BB#0:
936 ; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,0]
937 ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0
938 ; AVX512VL-NEXT: retq
898 ; ALL-LABEL: shuffle_v8f32_uuuu1188:
899 ; ALL: # BB#0:
900 ; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,0]
901 ; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
902 ; ALL-NEXT: retq
939903 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 1, i32 8, i32 8>
940904 ret <8 x float> %shuffle
941905 }
950914 }
951915
952916 define <8 x float> @shuffle_v8f32_5555uuuu(<8 x float> %a, <8 x float> %b) {
953 ; AVX1OR2-LABEL: shuffle_v8f32_5555uuuu:
954 ; AVX1OR2: # BB#0:
955 ; AVX1OR2-NEXT: vextractf128 $1, %ymm0, %xmm0
956 ; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
957 ; AVX1OR2-NEXT: retq
958 ;
959 ; AVX512VL-LABEL: shuffle_v8f32_5555uuuu:
960 ; AVX512VL: # BB#0:
961 ; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0
962 ; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
963 ; AVX512VL-NEXT: retq
917 ; ALL-LABEL: shuffle_v8f32_5555uuuu:
918 ; ALL: # BB#0:
919 ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
920 ; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
921 ; ALL-NEXT: retq
964922 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
965923 ret <8 x float> %shuffle
966924 }
18991857 ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
19001858 ; AVX1-NEXT: retq
19011859 ;
1902 ; AVX2-LABEL: shuffle_v8i32_3210ba98:
1903 ; AVX2: # BB#0:
1904 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1905 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
1906 ; AVX2-NEXT: retq
1907 ;
1908 ; AVX512VL-LABEL: shuffle_v8i32_3210ba98:
1909 ; AVX512VL: # BB#0:
1910 ; AVX512VL-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0
1911 ; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
1912 ; AVX512VL-NEXT: retq
1860 ; AVX2OR512VL-LABEL: shuffle_v8i32_3210ba98:
1861 ; AVX2OR512VL: # BB#0:
1862 ; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1863 ; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
1864 ; AVX2OR512VL-NEXT: retq
19131865 %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 11, i32 10, i32 9, i32 8>
19141866 ret <8 x i32> %shuffle
19151867 }
20461998 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
20471999 ; AVX1-NEXT: retq
20482000 ;
2049 ; AVX2-LABEL: shuffle_v8i32_uuuu1111:
2050 ; AVX2: # BB#0:
2051 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
2052 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
2053 ; AVX2-NEXT: retq
2054 ;
2055 ; AVX512VL-LABEL: shuffle_v8i32_uuuu1111:
2056 ; AVX512VL: # BB#0:
2057 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
2058 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
2059 ; AVX512VL-NEXT: retq
2001 ; AVX2OR512VL-LABEL: shuffle_v8i32_uuuu1111:
2002 ; AVX2OR512VL: # BB#0:
2003 ; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
2004 ; AVX2OR512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
2005 ; AVX2OR512VL-NEXT: retq
20602006 %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 1, i32 1, i32 1>
20612007 ret <8 x i32> %shuffle
20622008 }
20942040 ;
20952041 ; AVX512VL-LABEL: shuffle_v8i32_44444444:
20962042 ; AVX512VL: # BB#0:
2097 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0
2043 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
20982044 ; AVX512VL-NEXT: vpbroadcastd %xmm0, %ymm0
20992045 ; AVX512VL-NEXT: retq
21002046 %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
21082054 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
21092055 ; AVX1-NEXT: retq
21102056 ;
2111 ; AVX2-LABEL: shuffle_v8i32_5555uuuu:
2112 ; AVX2: # BB#0:
2113 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
2114 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
2115 ; AVX2-NEXT: retq
2116 ;
2117 ; AVX512VL-LABEL: shuffle_v8i32_5555uuuu:
2118 ; AVX512VL: # BB#0:
2119 ; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0
2120 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
2121 ; AVX512VL-NEXT: retq
2057 ; AVX2OR512VL-LABEL: shuffle_v8i32_5555uuuu:
2058 ; AVX2OR512VL: # BB#0:
2059 ; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
2060 ; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
2061 ; AVX2OR512VL-NEXT: retq
21222062 %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
21232063 ret <8 x i32> %shuffle
21242064 }
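None of these check blocks are maintained by hand; the shuffle and truncation tests in this directory carry autogenerated assertions, so a backend change like this one is normally followed by rerunning the update script over the affected files, which is what produces a diff of this shape. Below is a sketch of what a regenerated file header and an AVX512VL check now look like; the script name utils/update_llc_test_checks.py, the triple and the feature string are the usual ones and are assumed here rather than quoted from this commit.

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512VL

; With the conversion pass handling VEXTRACT, the broadcast of element 4 is
; checked against the VEX form of the 128-bit extract.
define <8 x i32> @splat_upper_lane_elt0_i32(<8 x i32> %a) {
; AVX512VL-LABEL: splat_upper_lane_elt0_i32:
; AVX512VL:       vextracti128 $1, %ymm0, %xmm0
; AVX512VL-NEXT:  vpbroadcastd %xmm0, %ymm0
; AVX512VL-NEXT:  retq
  %s = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  ret <8 x i32> %s
}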
642642 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
643643 ; AVX512VL-NEXT: vpmovsxwd %ymm1, %zmm1
644644 ; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1
645 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
645 ; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
646646 ; AVX512VL-NEXT: vmovdqu %ymm0, (%rax)
647647 ; AVX512VL-NEXT: retq
648648 ;
700700 ; AVX512VL: # BB#0: # %entry
701701 ; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0
702702 ; AVX512VL-NEXT: vpmovqd %ymm1, %xmm1
703 ; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
703 ; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
704704 ; AVX512VL-NEXT: retq
705705 ;
706706 ; AVX512BW-LABEL: trunc2x4i64_8i32:
716716 ; AVX512BWVL: # BB#0: # %entry
717717 ; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0
718718 ; AVX512BWVL-NEXT: vpmovqd %ymm1, %xmm1
719 ; AVX512BWVL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
719 ; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
720720 ; AVX512BWVL-NEXT: retq
721721 entry:
722722 %0 = trunc <4 x i64> %a to <4 x i32>