llvm / acf8758
[X86] Make FeatureAVX512 imply FeatureF16C.

The EVEX to VEX pass already assumes this is true under AVX512VL. We had special patterns to use zmm instructions if VLX and F16C weren't available. Instead, make AVX512 imply F16C so the EVEX to VEX behavior is explicitly legal, and remove the extra patterns.

All known CPUs with AVX512 have F16C, so this should be safe for now.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317521 91177308-0d34-0410-b5e6-96231b3b80d8

Craig Topper, 3 years ago
6 changed files with 732 additions and 2003 deletions.
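For context on the mechanics: SubtargetFeature implications in X86.td resolve transitively, so enabling a feature also enables everything it implies, recursively. The standalone C++ model below (an illustrative sketch with made-up names such as enableWithImplied, not LLVM's actual MCSubtargetInfo logic) shows the net effect of the X86.td change that follows: requesting avx512f now pulls in f16c as well.

    #include <map>
    #include <set>
    #include <string>
    #include <vector>

    using FeatureSet = std::set<std::string>;

    // Enable F plus everything it transitively implies.
    void enableWithImplied(
        const std::string &F,
        const std::map<std::string, std::vector<std::string>> &Implies,
        FeatureSet &Enabled) {
      if (!Enabled.insert(F).second)
        return; // already enabled; also guards against cycles
      auto It = Implies.find(F);
      if (It == Implies.end())
        return;
      for (const std::string &Dep : It->second)
        enableWithImplied(Dep, Implies, Enabled);
    }

    int main() {
      std::map<std::string, std::vector<std::string>> Implies = {
          {"avx512f", {"avx2", "fma", "f16c"}}, // f16c newly implied
          {"avx2", {"avx"}},
          {"fma", {"avx"}},
          {"f16c", {"avx"}},
      };
      FeatureSet Enabled;
      enableWithImplied("avx512f", Implies, Enabled);
      // Enabled == {avx, avx2, avx512f, f16c, fma}
    }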
118118 def FeatureFMA : SubtargetFeature<"fma", "HasFMA", "true",
119119 "Enable three-operand fused multiple-add",
120120 [FeatureAVX]>;
121 def FeatureF16C : SubtargetFeature<"f16c", "HasF16C", "true",
122 "Support 16-bit floating point conversion instructions",
123 [FeatureAVX]>;
121124 def FeatureAVX512 : SubtargetFeature<"avx512f", "X86SSELevel", "AVX512F",
122125 "Enable AVX-512 instructions",
123 [FeatureAVX2, FeatureFMA]>;
126 [FeatureAVX2, FeatureFMA, FeatureF16C]>;
124127 def FeatureERI : SubtargetFeature<"avx512er", "HasERI", "true",
125128 "Enable AVX-512 Exponential and Reciprocal Instructions",
126129 [FeatureAVX512]>;
176179 "Support MOVBE instruction">;
177180 def FeatureRDRAND : SubtargetFeature<"rdrnd", "HasRDRAND", "true",
178181 "Support RDRAND instruction">;
179 def FeatureF16C : SubtargetFeature<"f16c", "HasF16C", "true",
180 "Support 16-bit floating point conversion instructions",
181 [FeatureAVX]>;
182182 def FeatureFSGSBase : SubtargetFeature<"fsgsbase", "HasFSGSBase", "true",
183183 "Support FS/GS Base instructions">;
184184 def FeatureLZCNT : SubtargetFeature<"lzcnt", "HasLZCNT", "true",
379379 // Special handling for half-precision floating point conversions.
380380 // If we don't have F16C support, then lower half float conversions
381381 // into library calls.
382 if (Subtarget.useSoftFloat() ||
383 (!Subtarget.hasF16C() && !Subtarget.hasAVX512())) {
382 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
384383 setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
385384 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
386385 }
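The dropped clause is redundant rather than a behavior change: once hasAVX512() implies hasF16C(), the state "AVX512 but no F16C" is unreachable, and the old guard collapses to !hasF16C(). A tiny exhaustive check of that equivalence (a sketch, not compiler code):

    #include <cassert>

    int main() {
      for (int F16C = 0; F16C <= 1; ++F16C)
        for (int AVX512 = 0; AVX512 <= 1; ++AVX512) {
          if (AVX512 && !F16C)
            continue; // unreachable now: AVX512 implies F16C
          bool OldGuard = !F16C && !AVX512; // previous condition
          bool NewGuard = !F16C;            // simplified condition
          assert(OldGuard == NewGuard);
        }
    }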
72647264 def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32X:$src))),
72657265 (f32 (COPY_TO_REGCLASS (VCVTPH2PSZ128rr
72667266 (VCVTPS2PHZ128rr (COPY_TO_REGCLASS FR32X:$src, VR128X), 4)), FR32X)) >;
7267 }
7268
7269 // Patterns for matching float to half-float conversion when AVX512 is supported
7270 // but F16C isn't. In that case we have to use 512-bit vectors.
7271 let Predicates = [HasAVX512, NoVLX, NoF16C] in {
7272 def : Pat<(fp_to_f16 FR32X:$src),
7273 (i16 (EXTRACT_SUBREG
7274 (VMOVPDI2DIZrr
7275 (v8i16 (EXTRACT_SUBREG
7276 (VCVTPS2PHZrr
7277 (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
7278 (v4f32 (COPY_TO_REGCLASS FR32X:$src, VR128X)),
7279 sub_xmm), 4), sub_xmm))), sub_16bit))>;
7280
7281 def : Pat<(f16_to_fp GR16:$src),
7282 (f32 (COPY_TO_REGCLASS
7283 (v4f32 (EXTRACT_SUBREG
7284 (VCVTPH2PSZrr
7285 (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)),
7286 (v8i16 (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128X)),
7287 sub_xmm)), sub_xmm)), FR32X))>;
7288
7289 def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32X:$src))),
7290 (f32 (COPY_TO_REGCLASS
7291 (v4f32 (EXTRACT_SUBREG
7292 (VCVTPH2PSZrr
7293 (VCVTPS2PHZrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
7294 (v4f32 (COPY_TO_REGCLASS FR32X:$src, VR128X)),
7295 sub_xmm), 4)), sub_xmm)), FR32X))>;
72967267 }
72977268
72987269 // Unordered/Ordered scalar fp compare with Sete and set EFLAGS
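The deleted block covered targets with AVX512F but neither VLX nor F16C, where only the 512-bit EVEX forms VCVTPS2PHZrr/VCVTPH2PSZrr exist, so a scalar conversion had to bounce through a zmm register. Roughly what those patterns expanded to, written with AVX512F intrinsics (an illustrative sketch of the now-dead path with hypothetical function names, not code from this commit; the immediate 4 is _MM_FROUND_CUR_DIRECTION, matching the rounding operand in the patterns above):

    #include <immintrin.h>

    // f32 -> f16 via a 512-bit round trip (the VCVTPS2PHZrr pattern).
    unsigned short fp_to_f16_zmm_only(float x) {
      __m512 v = _mm512_castps128_ps512(_mm_set_ss(x));         // INSERT_SUBREG sub_xmm
      __m256i h = _mm512_cvtps_ph(v, _MM_FROUND_CUR_DIRECTION); // VCVTPS2PHZrr ..., 4
      return (unsigned short)_mm_extract_epi16(
          _mm256_castsi256_si128(h), 0);                        // EXTRACT_SUBREG sub_16bit
    }

    // f16 -> f32 via a 512-bit round trip (the VCVTPH2PSZrr pattern).
    // Upper lanes stay undefined, mirroring the IMPLICIT_DEF above.
    float f16_to_fp_zmm_only(unsigned short h) {
      __m256i v = _mm256_castsi128_si256(_mm_cvtsi32_si128(h));
      __m512 f = _mm512_cvtph_ps(v);                            // VCVTPH2PSZrr
      return _mm_cvtss_f32(_mm512_castps512_ps128(f));
    }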
849849 def HasMOVBE : Predicate<"Subtarget->hasMOVBE()">;
850850 def HasRDRAND : Predicate<"Subtarget->hasRDRAND()">;
851851 def HasF16C : Predicate<"Subtarget->hasF16C()">;
852 def NoF16C : Predicate<"!Subtarget->hasF16C()">;
853852 def HasFSGSBase : Predicate<"Subtarget->hasFSGSBase()">;
854853 def HasLZCNT : Predicate<"Subtarget->hasLZCNT()">;
855854 def HasBMI : Predicate<"Subtarget->hasBMI()">;
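With NoF16C gone, the invariant the commit message relies on can be stated directly: the EVEX-to-VEX pass compresses 128/256-bit EVEX instructions such as VCVTPH2PSZ128rr into their shorter VEX forms (here vcvtph2ps, an F16C encoding), which is only sound if every AVX512VL target also has F16C. A minimal sketch of that feature-level side condition (hypothetical names; the real pass also consults opcode tables and operand constraints):

    // Only the feature-level legality of compressing an EVEX half-float
    // conversion to its VEX/F16C form.
    struct Features {
      bool AVX512F, VLX, F16C;
    };

    bool canCompressPh2PsToVEX(const Features &F) {
      // FeatureAVX512 implying FeatureF16C guarantees F16C whenever
      // AVX512F (and hence VLX) is set, so this check can no longer fail.
      return F.VLX && F.F16C;
    }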
22872287 ; VEX-NEXT: popq %rax
22882288 ; VEX-NEXT: retq
22892289 ;
2290 ; AVX512F-LABEL: fptosi_2f16_to_4i32:
2291 ; AVX512F: # BB#0:
2292 ; AVX512F-NEXT: # kill: %XMM1 %XMM1 %ZMM1
2293 ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %ZMM0
2294 ; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
2295 ; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
2296 ; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
2297 ; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
2298 ; AVX512F-NEXT: vcvttss2si %xmm1, %rax
2299 ; AVX512F-NEXT: vmovq %rax, %xmm1
2300 ; AVX512F-NEXT: vcvttss2si %xmm0, %rax
2301 ; AVX512F-NEXT: vmovq %rax, %xmm0
2302 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2303 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
2304 ; AVX512F-NEXT: vzeroupper
2305 ; AVX512F-NEXT: retq
2306 ;
2307 ; AVX512VL-LABEL: fptosi_2f16_to_4i32:
2308 ; AVX512VL: # BB#0:
2309 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2310 ; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
2311 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2312 ; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
2313 ; AVX512VL-NEXT: vcvttss2si %xmm1, %rax
2314 ; AVX512VL-NEXT: vmovq %rax, %xmm1
2315 ; AVX512VL-NEXT: vcvttss2si %xmm0, %rax
2316 ; AVX512VL-NEXT: vmovq %rax, %xmm0
2317 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2318 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
2319 ; AVX512VL-NEXT: retq
2320 ;
2321 ; AVX512DQ-LABEL: fptosi_2f16_to_4i32:
2322 ; AVX512DQ: # BB#0:
2323 ; AVX512DQ-NEXT: # kill: %XMM1 %XMM1 %ZMM1
2324 ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0
2325 ; AVX512DQ-NEXT: vcvtps2ph $4, %zmm0, %ymm0
2326 ; AVX512DQ-NEXT: vcvtph2ps %ymm0, %zmm0
2327 ; AVX512DQ-NEXT: vcvtps2ph $4, %zmm1, %ymm1
2328 ; AVX512DQ-NEXT: vcvtph2ps %ymm1, %zmm1
2329 ; AVX512DQ-NEXT: vcvttss2si %xmm1, %rax
2330 ; AVX512DQ-NEXT: vmovq %rax, %xmm1
2331 ; AVX512DQ-NEXT: vcvttss2si %xmm0, %rax
2332 ; AVX512DQ-NEXT: vmovq %rax, %xmm0
2333 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2334 ; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
2335 ; AVX512DQ-NEXT: vzeroupper
2336 ; AVX512DQ-NEXT: retq
2337 ;
2338 ; AVX512VLDQ-LABEL: fptosi_2f16_to_4i32:
2339 ; AVX512VLDQ: # BB#0:
2340 ; AVX512VLDQ-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2341 ; AVX512VLDQ-NEXT: vcvtph2ps %xmm0, %xmm0
2342 ; AVX512VLDQ-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2343 ; AVX512VLDQ-NEXT: vcvtph2ps %xmm1, %xmm1
2344 ; AVX512VLDQ-NEXT: vcvttss2si %xmm1, %rax
2345 ; AVX512VLDQ-NEXT: vmovq %rax, %xmm1
2346 ; AVX512VLDQ-NEXT: vcvttss2si %xmm0, %rax
2347 ; AVX512VLDQ-NEXT: vmovq %rax, %xmm0
2348 ; AVX512VLDQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2349 ; AVX512VLDQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
2350 ; AVX512VLDQ-NEXT: retq
2290 ; AVX512-LABEL: fptosi_2f16_to_4i32:
2291 ; AVX512: # BB#0:
2292 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2293 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
2294 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2295 ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
2296 ; AVX512-NEXT: vcvttss2si %xmm1, %rax
2297 ; AVX512-NEXT: vmovq %rax, %xmm1
2298 ; AVX512-NEXT: vcvttss2si %xmm0, %rax
2299 ; AVX512-NEXT: vmovq %rax, %xmm0
2300 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2301 ; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
2302 ; AVX512-NEXT: retq
23512303 %cvt = fptosi <2 x half> %a to <2 x i32>
23522304 %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32>
23532305 ret <4 x i32> %ext
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
22 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,-f16c -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
44 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512VL
55
66 ;
88 ;
99
1010 define float @cvt_i16_to_f32(i16 %a0) nounwind {
11 ; AVX1-LABEL: cvt_i16_to_f32:
12 ; AVX1: # BB#0:
13 ; AVX1-NEXT: movswl %di, %eax
14 ; AVX1-NEXT: vmovd %eax, %xmm0
15 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
16 ; AVX1-NEXT: retq
17 ;
18 ; AVX2-LABEL: cvt_i16_to_f32:
19 ; AVX2: # BB#0:
20 ; AVX2-NEXT: movswl %di, %eax
21 ; AVX2-NEXT: vmovd %eax, %xmm0
22 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
23 ; AVX2-NEXT: retq
24 ;
25 ; AVX512F-LABEL: cvt_i16_to_f32:
26 ; AVX512F: # BB#0:
27 ; AVX512F-NEXT: movswl %di, %eax
28 ; AVX512F-NEXT: vmovd %eax, %xmm0
29 ; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
30 ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %ZMM0
31 ; AVX512F-NEXT: vzeroupper
32 ; AVX512F-NEXT: retq
33 ;
34 ; AVX512VL-LABEL: cvt_i16_to_f32:
35 ; AVX512VL: # BB#0:
36 ; AVX512VL-NEXT: movswl %di, %eax
37 ; AVX512VL-NEXT: vmovd %eax, %xmm0
38 ; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
39 ; AVX512VL-NEXT: retq
11 ; ALL-LABEL: cvt_i16_to_f32:
12 ; ALL: # BB#0:
13 ; ALL-NEXT: movswl %di, %eax
14 ; ALL-NEXT: vmovd %eax, %xmm0
15 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
16 ; ALL-NEXT: retq
4017 %1 = bitcast i16 %a0 to half
4118 %2 = fpext half %1 to float
4219 ret float %2
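All four configurations now agree on this scalar lowering, which is why the per-config prefixes collapse into ALL. The checked sequence corresponds to the following F16C-intrinsics sketch (illustrative only, not part of the commit; the short-to-int promotion performs the same sign extension as movswl):

    #include <immintrin.h>

    float cvt_i16_to_f32(short a0) {
      __m128i v = _mm_cvtsi32_si128(a0); // movswl %di, %eax ; vmovd %eax, %xmm0
      __m128 f = _mm_cvtph_ps(v);        // vcvtph2ps %xmm0, %xmm0
      return _mm_cvtss_f32(f);           // low lane holds the float result
    }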
11087 ; AVX512F-NEXT: shrq $48, %rdx
11188 ; AVX512F-NEXT: movswl %dx, %edx
11289 ; AVX512F-NEXT: vmovd %edx, %xmm0
113 ; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
90 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
11491 ; AVX512F-NEXT: movswl %cx, %ecx
11592 ; AVX512F-NEXT: vmovd %ecx, %xmm1
116 ; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
93 ; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
11794 ; AVX512F-NEXT: cwtl
11895 ; AVX512F-NEXT: vmovd %eax, %xmm2
119 ; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
96 ; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
12097 ; AVX512F-NEXT: vmovd %esi, %xmm3
121 ; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
98 ; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
12299 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
123100 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
124101 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
125 ; AVX512F-NEXT: vzeroupper
126102 ; AVX512F-NEXT: retq
127103 ;
128104 ; AVX512VL-LABEL: cvt_4i16_to_4f32:
221197 ; AVX512F-NEXT: shrq $48, %rdx
222198 ; AVX512F-NEXT: movswl %dx, %edx
223199 ; AVX512F-NEXT: vmovd %edx, %xmm0
224 ; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
200 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
225201 ; AVX512F-NEXT: movswl %cx, %ecx
226202 ; AVX512F-NEXT: vmovd %ecx, %xmm1
227 ; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
203 ; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
228204 ; AVX512F-NEXT: cwtl
229205 ; AVX512F-NEXT: vmovd %eax, %xmm2
230 ; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
206 ; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
231207 ; AVX512F-NEXT: vmovd %esi, %xmm3
232 ; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
208 ; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
233209 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
234210 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
235211 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
236 ; AVX512F-NEXT: vzeroupper
237212 ; AVX512F-NEXT: retq
238213 ;
239214 ; AVX512VL-LABEL: cvt_8i16_to_4f32:
270245 }
271246
272247 define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) nounwind {
273 ; AVX1-LABEL: cvt_8i16_to_8f32:
274 ; AVX1: # BB#0:
275 ; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
276 ; AVX1-NEXT: movq %rdx, %r8
277 ; AVX1-NEXT: movq %rdx, %r10
278 ; AVX1-NEXT: movswl %dx, %r9d
279 ; AVX1-NEXT: # kill: %EDX %EDX %RDX
280 ; AVX1-NEXT: shrl $16, %edx
281 ; AVX1-NEXT: shrq $32, %r8
282 ; AVX1-NEXT: shrq $48, %r10
283 ; AVX1-NEXT: vmovq %xmm0, %rdi
284 ; AVX1-NEXT: movq %rdi, %rax
285 ; AVX1-NEXT: movq %rdi, %rsi
286 ; AVX1-NEXT: movswl %di, %ecx
287 ; AVX1-NEXT: # kill: %EDI %EDI %RDI
288 ; AVX1-NEXT: shrl $16, %edi
289 ; AVX1-NEXT: shrq $32, %rax
290 ; AVX1-NEXT: shrq $48, %rsi
291 ; AVX1-NEXT: movswl %si, %esi
292 ; AVX1-NEXT: vmovd %esi, %xmm0
293 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
294 ; AVX1-NEXT: cwtl
295 ; AVX1-NEXT: vmovd %eax, %xmm1
296 ; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
297 ; AVX1-NEXT: movswl %di, %eax
298 ; AVX1-NEXT: vmovd %eax, %xmm2
299 ; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
300 ; AVX1-NEXT: vmovd %ecx, %xmm3
301 ; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
302 ; AVX1-NEXT: movswl %r10w, %eax
303 ; AVX1-NEXT: vmovd %eax, %xmm4
304 ; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4
305 ; AVX1-NEXT: movswl %r8w, %eax
306 ; AVX1-NEXT: vmovd %eax, %xmm5
307 ; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
308 ; AVX1-NEXT: movswl %dx, %eax
309 ; AVX1-NEXT: vmovd %eax, %xmm6
310 ; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
311 ; AVX1-NEXT: vmovd %r9d, %xmm7
312 ; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
313 ; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
314 ; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
315 ; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
316 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
317 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
318 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
319 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
320 ; AVX1-NEXT: retq
321 ;
322 ; AVX2-LABEL: cvt_8i16_to_8f32:
323 ; AVX2: # BB#0:
324 ; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
325 ; AVX2-NEXT: movq %rdx, %r8
326 ; AVX2-NEXT: movq %rdx, %r10
327 ; AVX2-NEXT: movswl %dx, %r9d
328 ; AVX2-NEXT: # kill: %EDX %EDX %RDX
329 ; AVX2-NEXT: shrl $16, %edx
330 ; AVX2-NEXT: shrq $32, %r8
331 ; AVX2-NEXT: shrq $48, %r10
332 ; AVX2-NEXT: vmovq %xmm0, %rdi
333 ; AVX2-NEXT: movq %rdi, %rax
334 ; AVX2-NEXT: movq %rdi, %rsi
335 ; AVX2-NEXT: movswl %di, %ecx
336 ; AVX2-NEXT: # kill: %EDI %EDI %RDI
337 ; AVX2-NEXT: shrl $16, %edi
338 ; AVX2-NEXT: shrq $32, %rax
339 ; AVX2-NEXT: shrq $48, %rsi
340 ; AVX2-NEXT: movswl %si, %esi
341 ; AVX2-NEXT: vmovd %esi, %xmm0
342 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
343 ; AVX2-NEXT: cwtl
344 ; AVX2-NEXT: vmovd %eax, %xmm1
345 ; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
346 ; AVX2-NEXT: movswl %di, %eax
347 ; AVX2-NEXT: vmovd %eax, %xmm2
348 ; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
349 ; AVX2-NEXT: vmovd %ecx, %xmm3
350 ; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
351 ; AVX2-NEXT: movswl %r10w, %eax
352 ; AVX2-NEXT: vmovd %eax, %xmm4
353 ; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4
354 ; AVX2-NEXT: movswl %r8w, %eax
355 ; AVX2-NEXT: vmovd %eax, %xmm5
356 ; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
357 ; AVX2-NEXT: movswl %dx, %eax
358 ; AVX2-NEXT: vmovd %eax, %xmm6
359 ; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
360 ; AVX2-NEXT: vmovd %r9d, %xmm7
361 ; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
362 ; AVX2-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
363 ; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
364 ; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
365 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
366 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
367 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
368 ; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
369 ; AVX2-NEXT: retq
370 ;
371 ; AVX512F-LABEL: cvt_8i16_to_8f32:
372 ; AVX512F: # BB#0:
373 ; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx
374 ; AVX512F-NEXT: movq %rdx, %r8
375 ; AVX512F-NEXT: movq %rdx, %r9
376 ; AVX512F-NEXT: movswl %dx, %r10d
377 ; AVX512F-NEXT: # kill: %EDX %EDX %RDX
378 ; AVX512F-NEXT: shrl $16, %edx
379 ; AVX512F-NEXT: shrq $32, %r8
380 ; AVX512F-NEXT: shrq $48, %r9
381 ; AVX512F-NEXT: vmovq %xmm0, %rdi
382 ; AVX512F-NEXT: movq %rdi, %rax
383 ; AVX512F-NEXT: movq %rdi, %rcx
384 ; AVX512F-NEXT: movswl %di, %esi
385 ; AVX512F-NEXT: # kill: %EDI %EDI %RDI
386 ; AVX512F-NEXT: shrl $16, %edi
387 ; AVX512F-NEXT: shrq $32, %rax
388 ; AVX512F-NEXT: shrq $48, %rcx
389 ; AVX512F-NEXT: movswl %cx, %ecx
390 ; AVX512F-NEXT: vmovd %ecx, %xmm0
391 ; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
392 ; AVX512F-NEXT: cwtl
393 ; AVX512F-NEXT: vmovd %eax, %xmm1
394 ; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
395 ; AVX512F-NEXT: movswl %di, %eax
396 ; AVX512F-NEXT: vmovd %eax, %xmm2
397 ; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
398 ; AVX512F-NEXT: vmovd %esi, %xmm3
399 ; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
400 ; AVX512F-NEXT: movswl %r9w, %eax
401 ; AVX512F-NEXT: vmovd %eax, %xmm4
402 ; AVX512F-NEXT: vcvtph2ps %ymm4, %zmm4
403 ; AVX512F-NEXT: movswl %r8w, %eax
404 ; AVX512F-NEXT: vmovd %eax, %xmm5
405 ; AVX512F-NEXT: vcvtph2ps %ymm5, %zmm5
406 ; AVX512F-NEXT: movswl %dx, %eax
407 ; AVX512F-NEXT: vmovd %eax, %xmm6
408 ; AVX512F-NEXT: vcvtph2ps %ymm6, %zmm6
409 ; AVX512F-NEXT: vmovd %r10d, %xmm7
410 ; AVX512F-NEXT: vcvtph2ps %ymm7, %zmm7
411 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
412 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
413 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
414 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
415 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
416 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
417 ; AVX512F-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
418 ; AVX512F-NEXT: retq
419 ;
420 ; AVX512VL-LABEL: cvt_8i16_to_8f32:
421 ; AVX512VL: # BB#0:
422 ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rdx
423 ; AVX512VL-NEXT: movq %rdx, %r8
424 ; AVX512VL-NEXT: movq %rdx, %r10
425 ; AVX512VL-NEXT: movswl %dx, %r9d
426 ; AVX512VL-NEXT: # kill: %EDX %EDX %RDX
427 ; AVX512VL-NEXT: shrl $16, %edx
428 ; AVX512VL-NEXT: shrq $32, %r8
429 ; AVX512VL-NEXT: shrq $48, %r10
430 ; AVX512VL-NEXT: vmovq %xmm0, %rdi
431 ; AVX512VL-NEXT: movq %rdi, %rax
432 ; AVX512VL-NEXT: movq %rdi, %rsi
433 ; AVX512VL-NEXT: movswl %di, %ecx
434 ; AVX512VL-NEXT: # kill: %EDI %EDI %RDI
435 ; AVX512VL-NEXT: shrl $16, %edi
436 ; AVX512VL-NEXT: shrq $32, %rax
437 ; AVX512VL-NEXT: shrq $48, %rsi
438 ; AVX512VL-NEXT: movswl %si, %esi
439 ; AVX512VL-NEXT: vmovd %esi, %xmm0
440 ; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
441 ; AVX512VL-NEXT: cwtl
442 ; AVX512VL-NEXT: vmovd %eax, %xmm1
443 ; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
444 ; AVX512VL-NEXT: movswl %di, %eax
445 ; AVX512VL-NEXT: vmovd %eax, %xmm2
446 ; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
447 ; AVX512VL-NEXT: vmovd %ecx, %xmm3
448 ; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
449 ; AVX512VL-NEXT: movswl %r10w, %eax
450 ; AVX512VL-NEXT: vmovd %eax, %xmm4
451 ; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4
452 ; AVX512VL-NEXT: movswl %r8w, %eax
453 ; AVX512VL-NEXT: vmovd %eax, %xmm5
454 ; AVX512VL-NEXT: vcvtph2ps %xmm5, %xmm5
455 ; AVX512VL-NEXT: movswl %dx, %eax
456 ; AVX512VL-NEXT: vmovd %eax, %xmm6
457 ; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm6
458 ; AVX512VL-NEXT: vmovd %r9d, %xmm7
459 ; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm7
460 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
461 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
462 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
463 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
464 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
465 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
466 ; AVX512VL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
467 ; AVX512VL-NEXT: retq
248 ; ALL-LABEL: cvt_8i16_to_8f32:
249 ; ALL: # BB#0:
250 ; ALL-NEXT: vpextrq $1, %xmm0, %rdx
251 ; ALL-NEXT: movq %rdx, %r8
252 ; ALL-NEXT: movq %rdx, %r10
253 ; ALL-NEXT: movswl %dx, %r9d
254 ; ALL-NEXT: # kill: %EDX %EDX %RDX
255 ; ALL-NEXT: shrl $16, %edx
256 ; ALL-NEXT: shrq $32, %r8
257 ; ALL-NEXT: shrq $48, %r10
258 ; ALL-NEXT: vmovq %xmm0, %rdi
259 ; ALL-NEXT: movq %rdi, %rax
260 ; ALL-NEXT: movq %rdi, %rsi
261 ; ALL-NEXT: movswl %di, %ecx
262 ; ALL-NEXT: # kill: %EDI %EDI %RDI
263 ; ALL-NEXT: shrl $16, %edi
264 ; ALL-NEXT: shrq $32, %rax
265 ; ALL-NEXT: shrq $48, %rsi
266 ; ALL-NEXT: movswl %si, %esi
267 ; ALL-NEXT: vmovd %esi, %xmm0
268 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
269 ; ALL-NEXT: cwtl
270 ; ALL-NEXT: vmovd %eax, %xmm1
271 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
272 ; ALL-NEXT: movswl %di, %eax
273 ; ALL-NEXT: vmovd %eax, %xmm2
274 ; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
275 ; ALL-NEXT: vmovd %ecx, %xmm3
276 ; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
277 ; ALL-NEXT: movswl %r10w, %eax
278 ; ALL-NEXT: vmovd %eax, %xmm4
279 ; ALL-NEXT: vcvtph2ps %xmm4, %xmm4
280 ; ALL-NEXT: movswl %r8w, %eax
281 ; ALL-NEXT: vmovd %eax, %xmm5
282 ; ALL-NEXT: vcvtph2ps %xmm5, %xmm5
283 ; ALL-NEXT: movswl %dx, %eax
284 ; ALL-NEXT: vmovd %eax, %xmm6
285 ; ALL-NEXT: vcvtph2ps %xmm6, %xmm6
286 ; ALL-NEXT: vmovd %r9d, %xmm7
287 ; ALL-NEXT: vcvtph2ps %xmm7, %xmm7
288 ; ALL-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
289 ; ALL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
290 ; ALL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
291 ; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
292 ; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
293 ; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
294 ; ALL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
295 ; ALL-NEXT: retq
468296 %1 = bitcast <8 x i16> %a0 to <8 x half>
469297 %2 = fpext <8 x half> %1 to <8 x float>
470298 ret <8 x float> %2
663491 ;
664492 ; AVX512F-LABEL: cvt_16i16_to_16f32:
665493 ; AVX512F: # BB#0:
666 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
494 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm10
667495 ; AVX512F-NEXT: vmovq %xmm0, %rax
668496 ; AVX512F-NEXT: movq %rax, %rcx
669497 ; AVX512F-NEXT: shrq $48, %rcx
670498 ; AVX512F-NEXT: movswl %cx, %ecx
671 ; AVX512F-NEXT: vmovd %ecx, %xmm2
499 ; AVX512F-NEXT: vmovd %ecx, %xmm8
672500 ; AVX512F-NEXT: movq %rax, %rcx
673501 ; AVX512F-NEXT: shrq $32, %rcx
674502 ; AVX512F-NEXT: movswl %cx, %ecx
503 ; AVX512F-NEXT: vmovd %ecx, %xmm9
504 ; AVX512F-NEXT: movswl %ax, %ecx
505 ; AVX512F-NEXT: # kill: %EAX %EAX %RAX
506 ; AVX512F-NEXT: shrl $16, %eax
507 ; AVX512F-NEXT: cwtl
508 ; AVX512F-NEXT: vmovd %eax, %xmm11
509 ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
510 ; AVX512F-NEXT: vmovd %ecx, %xmm12
511 ; AVX512F-NEXT: movq %rax, %rcx
512 ; AVX512F-NEXT: shrq $48, %rcx
513 ; AVX512F-NEXT: movswl %cx, %ecx
514 ; AVX512F-NEXT: vmovd %ecx, %xmm13
515 ; AVX512F-NEXT: movq %rax, %rcx
516 ; AVX512F-NEXT: shrq $32, %rcx
517 ; AVX512F-NEXT: movswl %cx, %ecx
518 ; AVX512F-NEXT: vmovd %ecx, %xmm14
519 ; AVX512F-NEXT: movswl %ax, %ecx
520 ; AVX512F-NEXT: # kill: %EAX %EAX %RAX
521 ; AVX512F-NEXT: shrl $16, %eax
522 ; AVX512F-NEXT: cwtl
523 ; AVX512F-NEXT: vmovd %eax, %xmm15
524 ; AVX512F-NEXT: vmovq %xmm10, %rax
525 ; AVX512F-NEXT: vmovd %ecx, %xmm2
526 ; AVX512F-NEXT: movq %rax, %rcx
527 ; AVX512F-NEXT: shrq $48, %rcx
528 ; AVX512F-NEXT: movswl %cx, %ecx
675529 ; AVX512F-NEXT: vmovd %ecx, %xmm3
530 ; AVX512F-NEXT: movq %rax, %rcx
531 ; AVX512F-NEXT: shrq $32, %rcx
532 ; AVX512F-NEXT: movswl %cx, %ecx
533 ; AVX512F-NEXT: vmovd %ecx, %xmm1
676534 ; AVX512F-NEXT: movswl %ax, %ecx
677535 ; AVX512F-NEXT: # kill: %EAX %EAX %RAX
678536 ; AVX512F-NEXT: shrl $16, %eax
679537 ; AVX512F-NEXT: cwtl
680538 ; AVX512F-NEXT: vmovd %eax, %xmm4
681 ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
682 ; AVX512F-NEXT: vmovd %ecx, %xmm0
539 ; AVX512F-NEXT: vpextrq $1, %xmm10, %rax
540 ; AVX512F-NEXT: vmovd %ecx, %xmm10
683541 ; AVX512F-NEXT: movq %rax, %rcx
684542 ; AVX512F-NEXT: shrq $48, %rcx
685543 ; AVX512F-NEXT: movswl %cx, %ecx
688546 ; AVX512F-NEXT: shrq $32, %rcx
689547 ; AVX512F-NEXT: movswl %cx, %ecx
690548 ; AVX512F-NEXT: vmovd %ecx, %xmm6
691 ; AVX512F-NEXT: movswl %ax, %ecx
692 ; AVX512F-NEXT: # kill: %EAX %EAX %RAX
693 ; AVX512F-NEXT: shrl $16, %eax
694 ; AVX512F-NEXT: cwtl
695 ; AVX512F-NEXT: vmovd %eax, %xmm7
696 ; AVX512F-NEXT: vmovq %xmm1, %rax
697 ; AVX512F-NEXT: vmovd %ecx, %xmm8
698 ; AVX512F-NEXT: movq %rax, %rcx
699 ; AVX512F-NEXT: shrq $48, %rcx
700 ; AVX512F-NEXT: movswl %cx, %ecx
701 ; AVX512F-NEXT: vmovd %ecx, %xmm9
702 ; AVX512F-NEXT: movq %rax, %rcx
703 ; AVX512F-NEXT: shrq $32, %rcx
704 ; AVX512F-NEXT: movswl %cx, %ecx
705 ; AVX512F-NEXT: vmovd %ecx, %xmm10
706 ; AVX512F-NEXT: movswl %ax, %ecx
707 ; AVX512F-NEXT: # kill: %EAX %EAX %RAX
708 ; AVX512F-NEXT: shrl $16, %eax
709 ; AVX512F-NEXT: cwtl
710 ; AVX512F-NEXT: vmovd %eax, %xmm11
711 ; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
712 ; AVX512F-NEXT: vmovd %ecx, %xmm1
713 ; AVX512F-NEXT: movq %rax, %rcx
714 ; AVX512F-NEXT: shrq $48, %rcx
715 ; AVX512F-NEXT: movswl %cx, %ecx
716 ; AVX512F-NEXT: vmovd %ecx, %xmm12
717 ; AVX512F-NEXT: movq %rax, %rcx
718 ; AVX512F-NEXT: shrq $32, %rcx
719 ; AVX512F-NEXT: movswl %cx, %ecx
720 ; AVX512F-NEXT: vmovd %ecx, %xmm13
721549 ; AVX512F-NEXT: movl %eax, %ecx
722550 ; AVX512F-NEXT: shrl $16, %ecx
723551 ; AVX512F-NEXT: movswl %cx, %ecx
724 ; AVX512F-NEXT: vmovd %ecx, %xmm14
552 ; AVX512F-NEXT: vmovd %ecx, %xmm7
725553 ; AVX512F-NEXT: cwtl
726 ; AVX512F-NEXT: vmovd %eax, %xmm15
727 ; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm16
728 ; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
729 ; AVX512F-NEXT: vcvtph2ps %ymm4, %zmm4
730 ; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
731 ; AVX512F-NEXT: vcvtph2ps %ymm5, %zmm5
732 ; AVX512F-NEXT: vcvtph2ps %ymm6, %zmm6
733 ; AVX512F-NEXT: vcvtph2ps %ymm7, %zmm7
734 ; AVX512F-NEXT: vcvtph2ps %ymm8, %zmm8
735 ; AVX512F-NEXT: vcvtph2ps %ymm9, %zmm9
736 ; AVX512F-NEXT: vcvtph2ps %ymm10, %zmm10
737 ; AVX512F-NEXT: vcvtph2ps %ymm11, %zmm11
738 ; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
739 ; AVX512F-NEXT: vcvtph2ps %ymm12, %zmm12
740 ; AVX512F-NEXT: vcvtph2ps %ymm13, %zmm13
741 ; AVX512F-NEXT: vcvtph2ps %ymm14, %zmm14
742 ; AVX512F-NEXT: vcvtph2ps %ymm15, %zmm15
743 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm15[0],xmm14[0],xmm15[2,3]
744 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm13[0],xmm2[3]
745 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm12[0]
746 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[2,3]
747 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm10[0],xmm1[3]
748 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm9[0]
749 ; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
750 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[2,3]
751 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm6[0],xmm2[3]
752 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[0]
753 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
754 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
755 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm16[0]
756 ; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
757 ; AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
554 ; AVX512F-NEXT: vmovd %eax, %xmm0
555 ; AVX512F-NEXT: vcvtph2ps %xmm8, %xmm8
556 ; AVX512F-NEXT: vcvtph2ps %xmm9, %xmm9
557 ; AVX512F-NEXT: vcvtph2ps %xmm11, %xmm11
558 ; AVX512F-NEXT: vcvtph2ps %xmm12, %xmm12
559 ; AVX512F-NEXT: vcvtph2ps %xmm13, %xmm13
560 ; AVX512F-NEXT: vcvtph2ps %xmm14, %xmm14
561 ; AVX512F-NEXT: vcvtph2ps %xmm15, %xmm15
562 ; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
563 ; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
564 ; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
565 ; AVX512F-NEXT: vcvtph2ps %xmm4, %xmm4
566 ; AVX512F-NEXT: vcvtph2ps %xmm10, %xmm10
567 ; AVX512F-NEXT: vcvtph2ps %xmm5, %xmm5
568 ; AVX512F-NEXT: vcvtph2ps %xmm6, %xmm6
569 ; AVX512F-NEXT: vcvtph2ps %xmm7, %xmm7
570 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
571 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[2,3]
572 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0],xmm0[3]
573 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[0]
574 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[2,3]
575 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0,1],xmm1[0],xmm4[3]
576 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0]
577 ; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
578 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[2,3]
579 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm14[0],xmm1[3]
580 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[0]
581 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[2,3]
582 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
583 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
584 ; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
585 ; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
758586 ; AVX512F-NEXT: retq
759587 ;
760588 ; AVX512VL-LABEL: cvt_16i16_to_16f32:
862690 ;
863691
864692 define float @load_cvt_i16_to_f32(i16* %a0) nounwind {
865 ; AVX1-LABEL: load_cvt_i16_to_f32:
866 ; AVX1: # BB#0:
867 ; AVX1-NEXT: movswl (%rdi), %eax
868 ; AVX1-NEXT: vmovd %eax, %xmm0
869 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
870 ; AVX1-NEXT: retq
871 ;
872 ; AVX2-LABEL: load_cvt_i16_to_f32:
873 ; AVX2: # BB#0:
874 ; AVX2-NEXT: movswl (%rdi), %eax
875 ; AVX2-NEXT: vmovd %eax, %xmm0
876 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
877 ; AVX2-NEXT: retq
878 ;
879 ; AVX512F-LABEL: load_cvt_i16_to_f32:
880 ; AVX512F: # BB#0:
881 ; AVX512F-NEXT: movswl (%rdi), %eax
882 ; AVX512F-NEXT: vmovd %eax, %xmm0
883 ; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
884 ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %ZMM0
885 ; AVX512F-NEXT: vzeroupper
886 ; AVX512F-NEXT: retq
887 ;
888 ; AVX512VL-LABEL: load_cvt_i16_to_f32:
889 ; AVX512VL: # BB#0:
890 ; AVX512VL-NEXT: movswl (%rdi), %eax
891 ; AVX512VL-NEXT: vmovd %eax, %xmm0
892 ; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
893 ; AVX512VL-NEXT: retq
693 ; ALL-LABEL: load_cvt_i16_to_f32:
694 ; ALL: # BB#0:
695 ; ALL-NEXT: movswl (%rdi), %eax
696 ; ALL-NEXT: vmovd %eax, %xmm0
697 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
698 ; ALL-NEXT: retq
894699 %1 = load i16, i16* %a0
895700 %2 = bitcast i16 %1 to half
896701 %3 = fpext half %2 to float
898703 }
899704
900705 define <4 x float> @load_cvt_4i16_to_4f32(<4 x i16>* %a0) nounwind {
901 ; AVX1-LABEL: load_cvt_4i16_to_4f32:
902 ; AVX1: # BB#0:
903 ; AVX1-NEXT: movswl 6(%rdi), %eax
904 ; AVX1-NEXT: vmovd %eax, %xmm0
905 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
906 ; AVX1-NEXT: movswl 4(%rdi), %eax
907 ; AVX1-NEXT: vmovd %eax, %xmm1
908 ; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
909 ; AVX1-NEXT: movswl (%rdi), %eax
910 ; AVX1-NEXT: vmovd %eax, %xmm2
911 ; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
912 ; AVX1-NEXT: movswl 2(%rdi), %eax
913 ; AVX1-NEXT: vmovd %eax, %xmm3
914 ; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
915 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
916 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
917 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
918 ; AVX1-NEXT: retq
919 ;
920 ; AVX2-LABEL: load_cvt_4i16_to_4f32:
921 ; AVX2: # BB#0:
922 ; AVX2-NEXT: movswl 6(%rdi), %eax
923 ; AVX2-NEXT: vmovd %eax, %xmm0
924 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
925 ; AVX2-NEXT: movswl 4(%rdi), %eax
926 ; AVX2-NEXT: vmovd %eax, %xmm1
927 ; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
928 ; AVX2-NEXT: movswl (%rdi), %eax
929 ; AVX2-NEXT: vmovd %eax, %xmm2
930 ; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
931 ; AVX2-NEXT: movswl 2(%rdi), %eax
932 ; AVX2-NEXT: vmovd %eax, %xmm3
933 ; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
934 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
935 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
936 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
937 ; AVX2-NEXT: retq
938 ;
939 ; AVX512F-LABEL: load_cvt_4i16_to_4f32:
940 ; AVX512F: # BB#0:
941 ; AVX512F-NEXT: movswl 6(%rdi), %eax
942 ; AVX512F-NEXT: vmovd %eax, %xmm0
943 ; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
944 ; AVX512F-NEXT: movswl 4(%rdi), %eax
945 ; AVX512F-NEXT: vmovd %eax, %xmm1
946 ; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
947 ; AVX512F-NEXT: movswl (%rdi), %eax
948 ; AVX512F-NEXT: vmovd %eax, %xmm2
949 ; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
950 ; AVX512F-NEXT: movswl 2(%rdi), %eax
951 ; AVX512F-NEXT: vmovd %eax, %xmm3
952 ; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
953 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
954 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
955 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
956 ; AVX512F-NEXT: vzeroupper
957 ; AVX512F-NEXT: retq
958 ;
959 ; AVX512VL-LABEL: load_cvt_4i16_to_4f32:
960 ; AVX512VL: # BB#0:
961 ; AVX512VL-NEXT: movswl 6(%rdi), %eax
962 ; AVX512VL-NEXT: vmovd %eax, %xmm0
963 ; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
964 ; AVX512VL-NEXT: movswl 4(%rdi), %eax
965 ; AVX512VL-NEXT: vmovd %eax, %xmm1
966 ; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
967 ; AVX512VL-NEXT: movswl (%rdi), %eax
968 ; AVX512VL-NEXT: vmovd %eax, %xmm2
969 ; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
970 ; AVX512VL-NEXT: movswl 2(%rdi), %eax
971 ; AVX512VL-NEXT: vmovd %eax, %xmm3
972 ; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
973 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
974 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
975 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
976 ; AVX512VL-NEXT: retq
706 ; ALL-LABEL: load_cvt_4i16_to_4f32:
707 ; ALL: # BB#0:
708 ; ALL-NEXT: movswl 6(%rdi), %eax
709 ; ALL-NEXT: vmovd %eax, %xmm0
710 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
711 ; ALL-NEXT: movswl 4(%rdi), %eax
712 ; ALL-NEXT: vmovd %eax, %xmm1
713 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
714 ; ALL-NEXT: movswl (%rdi), %eax
715 ; ALL-NEXT: vmovd %eax, %xmm2
716 ; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
717 ; ALL-NEXT: movswl 2(%rdi), %eax
718 ; ALL-NEXT: vmovd %eax, %xmm3
719 ; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
720 ; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
721 ; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
722 ; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
723 ; ALL-NEXT: retq
977724 %1 = load <4 x i16>, <4 x i16>* %a0
978725 %2 = bitcast <4 x i16> %1 to <4 x half>
979726 %3 = fpext <4 x half> %2 to <4 x float>
1045792 ; AVX512F-NEXT: shrq $48, %rdx
1046793 ; AVX512F-NEXT: movswl %dx, %edx
1047794 ; AVX512F-NEXT: vmovd %edx, %xmm0
1048 ; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
795 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
1049796 ; AVX512F-NEXT: movswl %cx, %ecx
1050797 ; AVX512F-NEXT: vmovd %ecx, %xmm1
1051 ; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
798 ; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
1052799 ; AVX512F-NEXT: cwtl
1053800 ; AVX512F-NEXT: vmovd %eax, %xmm2
1054 ; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
801 ; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
1055802 ; AVX512F-NEXT: vmovd %esi, %xmm3
1056 ; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
803 ; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
1057804 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
1058805 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
1059806 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1060 ; AVX512F-NEXT: vzeroupper
1061807 ; AVX512F-NEXT: retq
1062808 ;
1063809 ; AVX512VL-LABEL: load_cvt_8i16_to_4f32:
1095841 }
1096842
1097843 define <8 x float> @load_cvt_8i16_to_8f32(<8 x i16>* %a0) nounwind {
1098 ; AVX1-LABEL: load_cvt_8i16_to_8f32:
1099 ; AVX1: # BB#0:
1100 ; AVX1-NEXT: movswl 6(%rdi), %eax
1101 ; AVX1-NEXT: vmovd %eax, %xmm0
1102 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
1103 ; AVX1-NEXT: movswl 4(%rdi), %eax
1104 ; AVX1-NEXT: vmovd %eax, %xmm1
1105 ; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
1106 ; AVX1-NEXT: movswl (%rdi), %eax
1107 ; AVX1-NEXT: vmovd %eax, %xmm2
1108 ; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
1109 ; AVX1-NEXT: movswl 2(%rdi), %eax
1110 ; AVX1-NEXT: vmovd %eax, %xmm3
1111 ; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
1112 ; AVX1-NEXT: movswl 14(%rdi), %eax
1113 ; AVX1-NEXT: vmovd %eax, %xmm4
1114 ; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4
1115 ; AVX1-NEXT: movswl 12(%rdi), %eax
1116 ; AVX1-NEXT: vmovd %eax, %xmm5
1117 ; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
1118 ; AVX1-NEXT: movswl 8(%rdi), %eax
1119 ; AVX1-NEXT: vmovd %eax, %xmm6
1120 ; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
1121 ; AVX1-NEXT: movswl 10(%rdi), %eax
1122 ; AVX1-NEXT: vmovd %eax, %xmm7
1123 ; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
1124 ; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
1125 ; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
1126 ; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
1127 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
1128 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
1129 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1130 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
1131 ; AVX1-NEXT: retq
1132 ;
1133 ; AVX2-LABEL: load_cvt_8i16_to_8f32:
1134 ; AVX2: # BB#0:
1135 ; AVX2-NEXT: movswl 6(%rdi), %eax
1136 ; AVX2-NEXT: vmovd %eax, %xmm0
1137 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
1138 ; AVX2-NEXT: movswl 4(%rdi), %eax
1139 ; AVX2-NEXT: vmovd %eax, %xmm1
1140 ; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
1141 ; AVX2-NEXT: movswl (%rdi), %eax
1142 ; AVX2-NEXT: vmovd %eax, %xmm2
1143 ; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
1144 ; AVX2-NEXT: movswl 2(%rdi), %eax
1145 ; AVX2-NEXT: vmovd %eax, %xmm3
1146 ; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
1147 ; AVX2-NEXT: movswl 14(%rdi), %eax
1148 ; AVX2-NEXT: vmovd %eax, %xmm4
1149 ; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4
1150 ; AVX2-NEXT: movswl 12(%rdi), %eax
1151 ; AVX2-NEXT: vmovd %eax, %xmm5
1152 ; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
1153 ; AVX2-NEXT: movswl 8(%rdi), %eax
1154 ; AVX2-NEXT: vmovd %eax, %xmm6
1155 ; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
1156 ; AVX2-NEXT: movswl 10(%rdi), %eax
1157 ; AVX2-NEXT: vmovd %eax, %xmm7
1158 ; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
1159 ; AVX2-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
1160 ; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
1161 ; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
1162 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
1163 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
1164 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1165 ; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
1166 ; AVX2-NEXT: retq
1167 ;
1168 ; AVX512F-LABEL: load_cvt_8i16_to_8f32:
1169 ; AVX512F: # BB#0:
1170 ; AVX512F-NEXT: movswl 6(%rdi), %eax
1171 ; AVX512F-NEXT: vmovd %eax, %xmm0
1172 ; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
1173 ; AVX512F-NEXT: movswl 4(%rdi), %eax
1174 ; AVX512F-NEXT: vmovd %eax, %xmm1
1175 ; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
1176 ; AVX512F-NEXT: movswl (%rdi), %eax
1177 ; AVX512F-NEXT: vmovd %eax, %xmm2
1178 ; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
1179 ; AVX512F-NEXT: movswl 2(%rdi), %eax
1180 ; AVX512F-NEXT: vmovd %eax, %xmm3
1181 ; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
1182 ; AVX512F-NEXT: movswl 14(%rdi), %eax
1183 ; AVX512F-NEXT: vmovd %eax, %xmm4
1184 ; AVX512F-NEXT: vcvtph2ps %ymm4, %zmm4
1185 ; AVX512F-NEXT: movswl 12(%rdi), %eax
1186 ; AVX512F-NEXT: vmovd %eax, %xmm5
1187 ; AVX512F-NEXT: vcvtph2ps %ymm5, %zmm5
1188 ; AVX512F-NEXT: movswl 8(%rdi), %eax
1189 ; AVX512F-NEXT: vmovd %eax, %xmm6
1190 ; AVX512F-NEXT: vcvtph2ps %ymm6, %zmm6
1191 ; AVX512F-NEXT: movswl 10(%rdi), %eax
1192 ; AVX512F-NEXT: vmovd %eax, %xmm7
1193 ; AVX512F-NEXT: vcvtph2ps %ymm7, %zmm7
1194 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
1195 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
1196 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
1197 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
1198 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
1199 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1200 ; AVX512F-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
1201 ; AVX512F-NEXT: retq
1202 ;
1203 ; AVX512VL-LABEL: load_cvt_8i16_to_8f32:
1204 ; AVX512VL: # BB#0:
1205 ; AVX512VL-NEXT: movswl 6(%rdi), %eax
1206 ; AVX512VL-NEXT: vmovd %eax, %xmm0
1207 ; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
1208 ; AVX512VL-NEXT: movswl 4(%rdi), %eax
1209 ; AVX512VL-NEXT: vmovd %eax, %xmm1
1210 ; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
1211 ; AVX512VL-NEXT: movswl (%rdi), %eax
1212 ; AVX512VL-NEXT: vmovd %eax, %xmm2
1213 ; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
1214 ; AVX512VL-NEXT: movswl 2(%rdi), %eax
1215 ; AVX512VL-NEXT: vmovd %eax, %xmm3
1216 ; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
1217 ; AVX512VL-NEXT: movswl 14(%rdi), %eax
1218 ; AVX512VL-NEXT: vmovd %eax, %xmm4
1219 ; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4
1220 ; AVX512VL-NEXT: movswl 12(%rdi), %eax
1221 ; AVX512VL-NEXT: vmovd %eax, %xmm5
1222 ; AVX512VL-NEXT: vcvtph2ps %xmm5, %xmm5
1223 ; AVX512VL-NEXT: movswl 8(%rdi), %eax
1224 ; AVX512VL-NEXT: vmovd %eax, %xmm6
1225 ; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm6
1226 ; AVX512VL-NEXT: movswl 10(%rdi), %eax
1227 ; AVX512VL-NEXT: vmovd %eax, %xmm7
1228 ; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm7
1229 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
1230 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
1231 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
1232 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
1233 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
1234 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1235 ; AVX512VL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
1236 ; AVX512VL-NEXT: retq
844 ; ALL-LABEL: load_cvt_8i16_to_8f32:
845 ; ALL: # BB#0:
846 ; ALL-NEXT: movswl 6(%rdi), %eax
847 ; ALL-NEXT: vmovd %eax, %xmm0
848 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
849 ; ALL-NEXT: movswl 4(%rdi), %eax
850 ; ALL-NEXT: vmovd %eax, %xmm1
851 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
852 ; ALL-NEXT: movswl (%rdi), %eax
853 ; ALL-NEXT: vmovd %eax, %xmm2
854 ; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
855 ; ALL-NEXT: movswl 2(%rdi), %eax
856 ; ALL-NEXT: vmovd %eax, %xmm3
857 ; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
858 ; ALL-NEXT: movswl 14(%rdi), %eax
859 ; ALL-NEXT: vmovd %eax, %xmm4
860 ; ALL-NEXT: vcvtph2ps %xmm4, %xmm4
861 ; ALL-NEXT: movswl 12(%rdi), %eax
862 ; ALL-NEXT: vmovd %eax, %xmm5
863 ; ALL-NEXT: vcvtph2ps %xmm5, %xmm5
864 ; ALL-NEXT: movswl 8(%rdi), %eax
865 ; ALL-NEXT: vmovd %eax, %xmm6
866 ; ALL-NEXT: vcvtph2ps %xmm6, %xmm6
867 ; ALL-NEXT: movswl 10(%rdi), %eax
868 ; ALL-NEXT: vmovd %eax, %xmm7
869 ; ALL-NEXT: vcvtph2ps %xmm7, %xmm7
870 ; ALL-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
871 ; ALL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
872 ; ALL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
873 ; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
874 ; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
875 ; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
876 ; ALL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
877 ; ALL-NEXT: retq
1237878 %1 = load <8 x i16>, <8 x i16>* %a0
1238879 %2 = bitcast <8 x i16> %1 to <8 x half>
1239880 %3 = fpext <8 x half> %2 to <8 x float>
13771018 ; AVX512F: # BB#0:
13781019 ; AVX512F-NEXT: movswl 6(%rdi), %eax
13791020 ; AVX512F-NEXT: vmovd %eax, %xmm0
1380 ; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm16
1021 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm8
13811022 ; AVX512F-NEXT: movswl 4(%rdi), %eax
1023 ; AVX512F-NEXT: vmovd %eax, %xmm0
1024 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm9
1025 ; AVX512F-NEXT: movswl (%rdi), %eax
1026 ; AVX512F-NEXT: vmovd %eax, %xmm0
1027 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm10
1028 ; AVX512F-NEXT: movswl 2(%rdi), %eax
1029 ; AVX512F-NEXT: vmovd %eax, %xmm0
1030 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm11
1031 ; AVX512F-NEXT: movswl 14(%rdi), %eax
1032 ; AVX512F-NEXT: vmovd %eax, %xmm0
1033 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm12
1034 ; AVX512F-NEXT: movswl 12(%rdi), %eax
1035 ; AVX512F-NEXT: vmovd %eax, %xmm0
1036 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm13
1037 ; AVX512F-NEXT: movswl 8(%rdi), %eax
1038 ; AVX512F-NEXT: vmovd %eax, %xmm0
1039 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm14
1040 ; AVX512F-NEXT: movswl 10(%rdi), %eax
1041 ; AVX512F-NEXT: vmovd %eax, %xmm0
1042 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm15
1043 ; AVX512F-NEXT: movswl 22(%rdi), %eax
1044 ; AVX512F-NEXT: vmovd %eax, %xmm0
1045 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
1046 ; AVX512F-NEXT: movswl 20(%rdi), %eax
13821047 ; AVX512F-NEXT: vmovd %eax, %xmm1
1383 ; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm17
1384 ; AVX512F-NEXT: movswl (%rdi), %eax
1048 ; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
1049 ; AVX512F-NEXT: movswl 16(%rdi), %eax
13851050 ; AVX512F-NEXT: vmovd %eax, %xmm2
1386 ; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
1387 ; AVX512F-NEXT: movswl 2(%rdi), %eax
1051 ; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
1052 ; AVX512F-NEXT: movswl 18(%rdi), %eax
13881053 ; AVX512F-NEXT: vmovd %eax, %xmm3
1389 ; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
1390 ; AVX512F-NEXT: movswl 14(%rdi), %eax
1054 ; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
1055 ; AVX512F-NEXT: movswl 30(%rdi), %eax
13911056 ; AVX512F-NEXT: vmovd %eax, %xmm4
1392 ; AVX512F-NEXT: vcvtph2ps %ymm4, %zmm4
1393 ; AVX512F-NEXT: movswl 12(%rdi), %eax
1057 ; AVX512F-NEXT: vcvtph2ps %xmm4, %xmm4
1058 ; AVX512F-NEXT: movswl 28(%rdi), %eax
13941059 ; AVX512F-NEXT: vmovd %eax, %xmm5
1395 ; AVX512F-NEXT: vcvtph2ps %ymm5, %zmm5
1396 ; AVX512F-NEXT: movswl 8(%rdi), %eax
1060 ; AVX512F-NEXT: vcvtph2ps %xmm5, %xmm5
1061 ; AVX512F-NEXT: movswl 24(%rdi), %eax
13971062 ; AVX512F-NEXT: vmovd %eax, %xmm6
1398 ; AVX512F-NEXT: vcvtph2ps %ymm6, %zmm6
1399 ; AVX512F-NEXT: movswl 10(%rdi), %eax
1063 ; AVX512F-NEXT: vcvtph2ps %xmm6, %xmm6
1064 ; AVX512F-NEXT: movswl 26(%rdi), %eax
14001065 ; AVX512F-NEXT: vmovd %eax, %xmm7
1401 ; AVX512F-NEXT: vcvtph2ps %ymm7, %zmm7
1402 ; AVX512F-NEXT: movswl 22(%rdi), %eax
1403 ; AVX512F-NEXT: vmovd %eax, %xmm8
1404 ; AVX512F-NEXT: vcvtph2ps %ymm8, %zmm8
1405 ; AVX512F-NEXT: movswl 20(%rdi), %eax
1406 ; AVX512F-NEXT: vmovd %eax, %xmm9
1407 ; AVX512F-NEXT: vcvtph2ps %ymm9, %zmm9
1408 ; AVX512F-NEXT: movswl 16(%rdi), %eax
1409 ; AVX512F-NEXT: vmovd %eax, %xmm10
1410 ; AVX512F-NEXT: vcvtph2ps %ymm10, %zmm10
1411 ; AVX512F-NEXT: movswl 18(%rdi), %eax
1412 ; AVX512F-NEXT: vmovd %eax, %xmm11
1413 ; AVX512F-NEXT: vcvtph2ps %ymm11, %zmm11
1414 ; AVX512F-NEXT: movswl 30(%rdi), %eax
1415 ; AVX512F-NEXT: vmovd %eax, %xmm12
1416 ; AVX512F-NEXT: vcvtph2ps %ymm12, %zmm12
1417 ; AVX512F-NEXT: movswl 28(%rdi), %eax
1418 ; AVX512F-NEXT: vmovd %eax, %xmm13
1419 ; AVX512F-NEXT: vcvtph2ps %ymm13, %zmm13
1420 ; AVX512F-NEXT: movswl 24(%rdi), %eax
1421 ; AVX512F-NEXT: vmovd %eax, %xmm14
1422 ; AVX512F-NEXT: vcvtph2ps %ymm14, %zmm14
1423 ; AVX512F-NEXT: movswl 26(%rdi), %eax
1424 ; AVX512F-NEXT: vmovd %eax, %xmm15
1425 ; AVX512F-NEXT: vcvtph2ps %ymm15, %zmm15
1426 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm14[0],xmm15[0],xmm14[2,3]
1427 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm13[0],xmm0[3]
1428 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm12[0]
1429 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm10[0],xmm11[0],xmm10[2,3]
1430 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm9[0],xmm1[3]
1431 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[0]
1432 ; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1433 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[2,3]
1434 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0],xmm1[3]
1435 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
1066 ; AVX512F-NEXT: vcvtph2ps %xmm7, %xmm7
1067 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
1068 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
1069 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
14361070 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
1437 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm17[0],xmm2[3]
1438 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm16[0]
1071 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
1072 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1073 ; AVX512F-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
1074 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
1075 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
1076 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
1077 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
1078 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
1079 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
14391080 ; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
14401081 ; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
14411082 ; AVX512F-NEXT: retq
15171158 ;
15181159
15191160 define double @cvt_i16_to_f64(i16 %a0) nounwind {
1520 ; AVX1-LABEL: cvt_i16_to_f64:
1521 ; AVX1: # BB#0:
1522 ; AVX1-NEXT: movswl %di, %eax
1523 ; AVX1-NEXT: vmovd %eax, %xmm0
1524 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
1525 ; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
1526 ; AVX1-NEXT: retq
1527 ;
1528 ; AVX2-LABEL: cvt_i16_to_f64:
1529 ; AVX2: # BB#0:
1530 ; AVX2-NEXT: movswl %di, %eax
1531 ; AVX2-NEXT: vmovd %eax, %xmm0
1532 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
1533 ; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
1534 ; AVX2-NEXT: retq
1535 ;
1536 ; AVX512F-LABEL: cvt_i16_to_f64:
1537 ; AVX512F: # BB#0:
1538 ; AVX512F-NEXT: movswl %di, %eax
1539 ; AVX512F-NEXT: vmovd %eax, %xmm0
1540 ; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
1541 ; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
1542 ; AVX512F-NEXT: vzeroupper
1543 ; AVX512F-NEXT: retq
1544 ;
1545 ; AVX512VL-LABEL: cvt_i16_to_f64:
1546 ; AVX512VL: # BB#0:
1547 ; AVX512VL-NEXT: movswl %di, %eax
1548 ; AVX512VL-NEXT: vmovd %eax, %xmm0
1549 ; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
1550 ; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
1551 ; AVX512VL-NEXT: retq
1161 ; ALL-LABEL: cvt_i16_to_f64:
1162 ; ALL: # BB#0:
1163 ; ALL-NEXT: movswl %di, %eax
1164 ; ALL-NEXT: vmovd %eax, %xmm0
1165 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
1166 ; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
1167 ; ALL-NEXT: retq
15521168 %1 = bitcast i16 %a0 to half
15531169 %2 = fpext half %1 to double
15541170 ret double %2
15981214 ; AVX512F-NEXT: shrl $16, %eax
15991215 ; AVX512F-NEXT: cwtl
16001216 ; AVX512F-NEXT: vmovd %eax, %xmm0
1601 ; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
1217 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
16021218 ; AVX512F-NEXT: vmovd %ecx, %xmm1
1603 ; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
1219 ; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
16041220 ; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
16051221 ; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
16061222 ; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1607 ; AVX512F-NEXT: vzeroupper
16081223 ; AVX512F-NEXT: retq
16091224 ;
16101225 ; AVX512VL-LABEL: cvt_2i16_to_2f64:
17001315 ; AVX512F-NEXT: shrl $16, %edx
17011316 ; AVX512F-NEXT: movswl %dx, %edx
17021317 ; AVX512F-NEXT: vmovd %edx, %xmm0
1703 ; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
1318 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
17041319 ; AVX512F-NEXT: vmovd %esi, %xmm1
1705 ; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
1320 ; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
17061321 ; AVX512F-NEXT: movswl %cx, %ecx
17071322 ; AVX512F-NEXT: vmovd %ecx, %xmm2
1708 ; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
1323 ; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
17091324 ; AVX512F-NEXT: cwtl
17101325 ; AVX512F-NEXT: vmovd %eax, %xmm3
1711 ; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
1326 ; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
17121327 ; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
17131328 ; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
17141329 ; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
17901405 ; AVX512F-NEXT: shrl $16, %eax
17911406 ; AVX512F-NEXT: cwtl
17921407 ; AVX512F-NEXT: vmovd %eax, %xmm0
1793 ; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
1408 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
17941409 ; AVX512F-NEXT: vmovd %ecx, %xmm1
1795 ; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
1410 ; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
17961411 ; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
17971412 ; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
17981413 ; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1799 ; AVX512F-NEXT: vzeroupper
18001414 ; AVX512F-NEXT: retq
18011415 ;
18021416 ; AVX512VL-LABEL: cvt_8i16_to_2f64:
18911505 ; AVX512F-NEXT: shrl $16, %edx
18921506 ; AVX512F-NEXT: movswl %dx, %edx
18931507 ; AVX512F-NEXT: vmovd %edx, %xmm0
1894 ; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
1508 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
18951509 ; AVX512F-NEXT: vmovd %esi, %xmm1
1896 ; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
1510 ; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
18971511 ; AVX512F-NEXT: movswl %cx, %ecx
18981512 ; AVX512F-NEXT: vmovd %ecx, %xmm2
1899 ; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
1513 ; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
19001514 ; AVX512F-NEXT: cwtl
19011515 ; AVX512F-NEXT: vmovd %eax, %xmm3
1902 ; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
1516 ; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
19031517 ; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
19041518 ; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
19051519 ; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
20541668 ; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
20551669 ; AVX2-NEXT: retq
20561670 ;
2057 ; AVX512F-LABEL: cvt_8i16_to_8f64:
2058 ; AVX512F: # BB#0:
2059 ; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx
2060 ; AVX512F-NEXT: movq %rdx, %r8
2061 ; AVX512F-NEXT: movl %edx, %r9d
2062 ; AVX512F-NEXT: movswl %dx, %r10d
2063 ; AVX512F-NEXT: shrq $48, %rdx
2064 ; AVX512F-NEXT: shrq $32, %r8
2065 ; AVX512F-NEXT: shrl $16, %r9d
2066 ; AVX512F-NEXT: vmovq %xmm0, %rdi
2067 ; AVX512F-NEXT: movq %rdi, %rax
2068 ; AVX512F-NEXT: movl %edi, %ecx
2069 ; AVX512F-NEXT: movswl %di, %esi
2070 ; AVX512F-NEXT: shrq $48, %rdi
2071 ; AVX512F-NEXT: shrq $32, %rax
2072 ; AVX512F-NEXT: shrl $16, %ecx
2073 ; AVX512F-NEXT: movswl %cx, %ecx
2074 ; AVX512F-NEXT: vmovd %ecx, %xmm0
2075 ; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
2076 ; AVX512F-NEXT: vmovd %esi, %xmm1
2077 ; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
2078 ; AVX512F-NEXT: cwtl
2079 ; AVX512F-NEXT: vmovd %eax, %xmm2
2080 ; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
2081 ; AVX512F-NEXT: movswl %di, %eax
2082 ; AVX512F-NEXT: vmovd %eax, %xmm3
2083 ; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
2084 ; AVX512F-NEXT: movswl %r9w, %eax
2085 ; AVX512F-NEXT: vmovd %eax, %xmm4
2086 ; AVX512F-NEXT: vcvtph2ps %ymm4, %zmm4
2087 ; AVX512F-NEXT: vmovd %r10d, %xmm5
2088 ; AVX512F-NEXT: vcvtph2ps %ymm5, %zmm5
2089 ; AVX512F-NEXT: movswl %r8w, %eax
2090 ; AVX512F-NEXT: vmovd %eax, %xmm6
2091 ; AVX512F-NEXT: vcvtph2ps %ymm6, %zmm6
2092 ; AVX512F-NEXT: movswl %dx, %eax
2093 ; AVX512F-NEXT: vmovd %eax, %xmm7
2094 ; AVX512F-NEXT: vcvtph2ps %ymm7, %zmm7
2095 ; AVX512F-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
2096 ; AVX512F-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
2097 ; AVX512F-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
2098 ; AVX512F-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
2099 ; AVX512F-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
2100 ; AVX512F-NEXT: vmovlhps {{.*#+}} xmm4 = xmm5[0],xmm4[0]
2101 ; AVX512F-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
2102 ; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
2103 ; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
2104 ; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
2105 ; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
2106 ; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
2107 ; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2108 ; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2109 ; AVX512F-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
2110 ; AVX512F-NEXT: retq
2111 ;
2112 ; AVX512VL-LABEL: cvt_8i16_to_8f64:
2113 ; AVX512VL: # BB#0:
2114 ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rdx
2115 ; AVX512VL-NEXT: movq %rdx, %r8
2116 ; AVX512VL-NEXT: movl %edx, %r10d
2117 ; AVX512VL-NEXT: movswl %dx, %r9d
2118 ; AVX512VL-NEXT: shrq $48, %rdx
2119 ; AVX512VL-NEXT: shrq $32, %r8
2120 ; AVX512VL-NEXT: shrl $16, %r10d
2121 ; AVX512VL-NEXT: vmovq %xmm0, %rdi
2122 ; AVX512VL-NEXT: movq %rdi, %rax
2123 ; AVX512VL-NEXT: movl %edi, %esi
2124 ; AVX512VL-NEXT: movswl %di, %ecx
2125 ; AVX512VL-NEXT: shrq $48, %rdi
2126 ; AVX512VL-NEXT: shrq $32, %rax
2127 ; AVX512VL-NEXT: shrl $16, %esi
2128 ; AVX512VL-NEXT: movswl %si, %esi
2129 ; AVX512VL-NEXT: vmovd %esi, %xmm0
2130 ; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
2131 ; AVX512VL-NEXT: vmovd %ecx, %xmm1
2132 ; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
2133 ; AVX512VL-NEXT: cwtl
2134 ; AVX512VL-NEXT: vmovd %eax, %xmm2
2135 ; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
2136 ; AVX512VL-NEXT: movswl %di, %eax
2137 ; AVX512VL-NEXT: vmovd %eax, %xmm3
2138 ; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
2139 ; AVX512VL-NEXT: movswl %r10w, %eax
2140 ; AVX512VL-NEXT: vmovd %eax, %xmm4
2141 ; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4
2142 ; AVX512VL-NEXT: vmovd %r9d, %xmm5
2143 ; AVX512VL-NEXT: vcvtph2ps %xmm5, %xmm5
2144 ; AVX512VL-NEXT: movswl %r8w, %eax
2145 ; AVX512VL-NEXT: vmovd %eax, %xmm6
2146 ; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm6
2147 ; AVX512VL-NEXT: movswl %dx, %eax
2148 ; AVX512VL-NEXT: vmovd %eax, %xmm7
2149 ; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm7
2150 ; AVX512VL-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
2151 ; AVX512VL-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
2152 ; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
2153 ; AVX512VL-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
2154 ; AVX512VL-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
2155 ; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm4 = xmm5[0],xmm4[0]
2156 ; AVX512VL-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
2157 ; AVX512VL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
2158 ; AVX512VL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
2159 ; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
2160 ; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
2161 ; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
2162 ; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2163 ; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2164 ; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
2165 ; AVX512VL-NEXT: retq
1671 ; AVX512-LABEL: cvt_8i16_to_8f64:
1672 ; AVX512: # BB#0:
1673 ; AVX512-NEXT: vpextrq $1, %xmm0, %rdx
1674 ; AVX512-NEXT: movq %rdx, %r8
1675 ; AVX512-NEXT: movl %edx, %r10d
1676 ; AVX512-NEXT: movswl %dx, %r9d
1677 ; AVX512-NEXT: shrq $48, %rdx
1678 ; AVX512-NEXT: shrq $32, %r8
1679 ; AVX512-NEXT: shrl $16, %r10d
1680 ; AVX512-NEXT: vmovq %xmm0, %rdi
1681 ; AVX512-NEXT: movq %rdi, %rax
1682 ; AVX512-NEXT: movl %edi, %esi
1683 ; AVX512-NEXT: movswl %di, %ecx
1684 ; AVX512-NEXT: shrq $48, %rdi
1685 ; AVX512-NEXT: shrq $32, %rax
1686 ; AVX512-NEXT: shrl $16, %esi
1687 ; AVX512-NEXT: movswl %si, %esi
1688 ; AVX512-NEXT: vmovd %esi, %xmm0
1689 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
1690 ; AVX512-NEXT: vmovd %ecx, %xmm1
1691 ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
1692 ; AVX512-NEXT: cwtl
1693 ; AVX512-NEXT: vmovd %eax, %xmm2
1694 ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
1695 ; AVX512-NEXT: movswl %di, %eax
1696 ; AVX512-NEXT: vmovd %eax, %xmm3
1697 ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
1698 ; AVX512-NEXT: movswl %r10w, %eax
1699 ; AVX512-NEXT: vmovd %eax, %xmm4
1700 ; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
1701 ; AVX512-NEXT: vmovd %r9d, %xmm5
1702 ; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
1703 ; AVX512-NEXT: movswl %r8w, %eax
1704 ; AVX512-NEXT: vmovd %eax, %xmm6
1705 ; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
1706 ; AVX512-NEXT: movswl %dx, %eax
1707 ; AVX512-NEXT: vmovd %eax, %xmm7
1708 ; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
1709 ; AVX512-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
1710 ; AVX512-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
1711 ; AVX512-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
1712 ; AVX512-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
1713 ; AVX512-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
1714 ; AVX512-NEXT: vmovlhps {{.*#+}} xmm4 = xmm5[0],xmm4[0]
1715 ; AVX512-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
1716 ; AVX512-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
1717 ; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
1718 ; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1719 ; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
1720 ; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
1721 ; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1722 ; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1723 ; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
1724 ; AVX512-NEXT: retq
21661725 %1 = bitcast <8 x i16> %a0 to <8 x half>
21671726 %2 = fpext <8 x half> %1 to <8 x double>
21681727 ret <8 x double> %2
21691728 }
21731732 ;
21741733
21751734 define double @load_cvt_i16_to_f64(i16* %a0) nounwind {
2176 ; AVX1-LABEL: load_cvt_i16_to_f64:
2177 ; AVX1: # BB#0:
2178 ; AVX1-NEXT: movswl (%rdi), %eax
2179 ; AVX1-NEXT: vmovd %eax, %xmm0
2180 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
2181 ; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
2182 ; AVX1-NEXT: retq
2183 ;
2184 ; AVX2-LABEL: load_cvt_i16_to_f64:
2185 ; AVX2: # BB#0:
2186 ; AVX2-NEXT: movswl (%rdi), %eax
2187 ; AVX2-NEXT: vmovd %eax, %xmm0
2188 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
2189 ; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
2190 ; AVX2-NEXT: retq
2191 ;
2192 ; AVX512F-LABEL: load_cvt_i16_to_f64:
2193 ; AVX512F: # BB#0:
2194 ; AVX512F-NEXT: movswl (%rdi), %eax
2195 ; AVX512F-NEXT: vmovd %eax, %xmm0
2196 ; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
2197 ; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
2198 ; AVX512F-NEXT: vzeroupper
2199 ; AVX512F-NEXT: retq
2200 ;
2201 ; AVX512VL-LABEL: load_cvt_i16_to_f64:
2202 ; AVX512VL: # BB#0:
2203 ; AVX512VL-NEXT: movswl (%rdi), %eax
2204 ; AVX512VL-NEXT: vmovd %eax, %xmm0
2205 ; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
2206 ; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
2207 ; AVX512VL-NEXT: retq
1735 ; ALL-LABEL: load_cvt_i16_to_f64:
1736 ; ALL: # BB#0:
1737 ; ALL-NEXT: movswl (%rdi), %eax
1738 ; ALL-NEXT: vmovd %eax, %xmm0
1739 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
1740 ; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
1741 ; ALL-NEXT: retq
22081742 %1 = load i16, i16* %a0
22091743 %2 = bitcast i16 %1 to half
22101744 %3 = fpext half %2 to double
22111745 ret double %3
22121746 }
22131747
22141748 define <2 x double> @load_cvt_2i16_to_2f64(<2 x i16>* %a0) nounwind {
2215 ; AVX1-LABEL: load_cvt_2i16_to_2f64:
2216 ; AVX1: # BB#0:
2217 ; AVX1-NEXT: movswl (%rdi), %eax
2218 ; AVX1-NEXT: vmovd %eax, %xmm0
2219 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
2220 ; AVX1-NEXT: movswl 2(%rdi), %eax
2221 ; AVX1-NEXT: vmovd %eax, %xmm1
2222 ; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
2223 ; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
2224 ; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
2225 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2226 ; AVX1-NEXT: retq
2227 ;
2228 ; AVX2-LABEL: load_cvt_2i16_to_2f64:
2229 ; AVX2: # BB#0:
2230 ; AVX2-NEXT: movswl (%rdi), %eax
2231 ; AVX2-NEXT: vmovd %eax, %xmm0
2232 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
2233 ; AVX2-NEXT: movswl 2(%rdi), %eax
2234 ; AVX2-NEXT: vmovd %eax, %xmm1
2235 ; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
2236 ; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
2237 ; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
2238 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2239 ; AVX2-NEXT: retq
2240 ;
2241 ; AVX512F-LABEL: load_cvt_2i16_to_2f64:
2242 ; AVX512F: # BB#0:
2243 ; AVX512F-NEXT: movswl (%rdi), %eax
2244 ; AVX512F-NEXT: vmovd %eax, %xmm0
2245 ; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
2246 ; AVX512F-NEXT: movswl 2(%rdi), %eax
2247 ; AVX512F-NEXT: vmovd %eax, %xmm1
2248 ; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
2249 ; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
2250 ; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
2251 ; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2252 ; AVX512F-NEXT: vzeroupper
2253 ; AVX512F-NEXT: retq
2254 ;
2255 ; AVX512VL-LABEL: load_cvt_2i16_to_2f64:
2256 ; AVX512VL: # BB#0:
2257 ; AVX512VL-NEXT: movswl (%rdi), %eax
2258 ; AVX512VL-NEXT: vmovd %eax, %xmm0
2259 ; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
2260 ; AVX512VL-NEXT: movswl 2(%rdi), %eax
2261 ; AVX512VL-NEXT: vmovd %eax, %xmm1
2262 ; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
2263 ; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
2264 ; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
2265 ; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2266 ; AVX512VL-NEXT: retq
1749 ; ALL-LABEL: load_cvt_2i16_to_2f64:
1750 ; ALL: # BB#0:
1751 ; ALL-NEXT: movswl (%rdi), %eax
1752 ; ALL-NEXT: vmovd %eax, %xmm0
1753 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
1754 ; ALL-NEXT: movswl 2(%rdi), %eax
1755 ; ALL-NEXT: vmovd %eax, %xmm1
1756 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
1757 ; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
1758 ; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
1759 ; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1760 ; ALL-NEXT: retq
22671761 %1 = load <2 x i16>, <2 x i16>* %a0
22681762 %2 = bitcast <2 x i16> %1 to <2 x half>
22691763 %3 = fpext <2 x half> %2 to <2 x double>
22701764 ret <2 x double> %3
22711765 }
22721766
22731767 define <4 x double> @load_cvt_4i16_to_4f64(<4 x i16>* %a0) nounwind {
2274 ; AVX1-LABEL: load_cvt_4i16_to_4f64:
2275 ; AVX1: # BB#0:
2276 ; AVX1-NEXT: movswl (%rdi), %eax
2277 ; AVX1-NEXT: vmovd %eax, %xmm0
2278 ; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
2279 ; AVX1-NEXT: movswl 2(%rdi), %eax
2280 ; AVX1-NEXT: vmovd %eax, %xmm1
2281 ; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
2282 ; AVX1-NEXT: movswl 4(%rdi), %eax
2283 ; AVX1-NEXT: vmovd %eax, %xmm2
2284 ; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
2285 ; AVX1-NEXT: movswl 6(%rdi), %eax
2286 ; AVX1-NEXT: vmovd %eax, %xmm3
2287 ; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
2288 ; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
2289 ; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
2290 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
2291 ; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
2292 ; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
2293 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2294 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2295 ; AVX1-NEXT: retq
2296 ;
2297 ; AVX2-LABEL: load_cvt_4i16_to_4f64:
2298 ; AVX2: # BB#0:
2299 ; AVX2-NEXT: movswl (%rdi), %eax
2300 ; AVX2-NEXT: vmovd %eax, %xmm0
2301 ; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
2302 ; AVX2-NEXT: movswl 2(%rdi), %eax
2303 ; AVX2-NEXT: vmovd %eax, %xmm1
2304 ; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
2305 ; AVX2-NEXT: movswl 4(%rdi), %eax
2306 ; AVX2-NEXT: vmovd %eax, %xmm2
2307 ; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
2308 ; AVX2-NEXT: movswl 6(%rdi), %eax
2309 ; AVX2-NEXT: vmovd %eax, %xmm3
2310 ; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
2311 ; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
2312 ; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
2313 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
2314 ; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
2315 ; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
2316 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2317 ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2318 ; AVX2-NEXT: retq
2319 ;
2320 ; AVX512F-LABEL: load_cvt_4i16_to_4f64:
2321 ; AVX512F: # BB#0:
2322 ; AVX512F-NEXT: movswl (%rdi), %eax
2323 ; AVX512F-NEXT: vmovd %eax, %xmm0
2324 ; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
2325 ; AVX512F-NEXT: movswl 2(%rdi), %eax
2326 ; AVX512F-NEXT: vmovd %eax, %xmm1
2327 ; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
2328 ; AVX512F-NEXT: movswl 4(%rdi), %eax
2329 ; AVX512F-NEXT: vmovd %eax, %xmm2
2330 ; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
2331 ; AVX512F-NEXT: movswl 6(%rdi), %eax
2332 ; AVX512F-NEXT: vmovd %eax, %xmm3
2333 ; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
2334 ; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
2335 ; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
2336 ; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
2337 ; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
2338 ; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
2339 ; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2340 ; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2341 ; AVX512F-NEXT: retq
2342 ;
2343 ; AVX512VL-LABEL: load_cvt_4i16_to_4f64:
2344 ; AVX512VL: # BB#0:
2345 ; AVX512VL-NEXT: movswl (%rdi), %eax
2346 ; AVX512VL-NEXT: vmovd %eax, %xmm0
2347 ; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
2348 ; AVX512VL-NEXT: movswl 2(%rdi), %eax
2349 ; AVX512VL-NEXT: vmovd %eax, %xmm1
2350 ; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
2351 ; AVX512VL-NEXT: movswl 4(%rdi), %eax
2352 ; AVX512VL-NEXT: vmovd %eax, %xmm2
2353 ; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
2354 ; AVX512VL-NEXT: movswl 6(%rdi), %eax
2355 ; AVX512VL-NEXT: vmovd %eax, %xmm3
2356 ; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
2357 ; AVX512VL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
2358 ; AVX512VL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
2359 ; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
2360 ; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
2361 ; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
2362 ; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2363 ; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2364 ; AVX512VL-NEXT: retq
1768 ; ALL-LABEL: load_cvt_4i16_to_4f64:
1769 ; ALL: # BB#0:
1770 ; ALL-NEXT: movswl (%rdi), %eax
1771 ; ALL-NEXT: vmovd %eax, %xmm0
1772 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
1773 ; ALL-NEXT: movswl 2(%rdi), %eax
1774 ; ALL-NEXT: vmovd %eax, %xmm1
1775 ; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
1776 ; ALL-NEXT: movswl 4(%rdi), %eax
1777 ; ALL-NEXT: vmovd %eax, %xmm2
1778 ; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
1779 ; ALL-NEXT: movswl 6(%rdi), %eax
1780 ; ALL-NEXT: vmovd %eax, %xmm3
1781 ; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
1782 ; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
1783 ; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
1784 ; ALL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
1785 ; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
1786 ; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
1787 ; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1788 ; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1789 ; ALL-NEXT: retq
23651790 %1 = load <4 x i16>, <4 x i16>* %a0
23661791 %2 = bitcast <4 x i16> %1 to <4 x half>
23671792 %3 = fpext <4 x half> %2 to <4 x double>
23681793 ret <4 x double> %3
23691794 }
24381863 ; AVX512F-NEXT: shrl $16, %edx
24391864 ; AVX512F-NEXT: movswl %dx, %edx
24401865 ; AVX512F-NEXT: vmovd %edx, %xmm0
2441 ; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
1866 ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
24421867 ; AVX512F-NEXT: vmovd %esi, %xmm1
2443 ; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
1868 ; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
24441869 ; AVX512F-NEXT: movswl %cx, %ecx
24451870 ; AVX512F-NEXT: vmovd %ecx, %xmm2
2446 ; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
1871 ; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
24471872 ; AVX512F-NEXT: cwtl
24481873 ; AVX512F-NEXT: vmovd %eax, %xmm3
2449 ; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
1874 ; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
24501875 ; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
24511876 ; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
24521877 ; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
25782003 ; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
25792004 ; AVX2-NEXT: retq
25802005 ;
2581 ; AVX512F-LABEL: load_cvt_8i16_to_8f64:
2582 ; AVX512F: # BB#0:
2583 ; AVX512F-NEXT: movswl (%rdi), %eax
2584 ; AVX512F-NEXT: vmovd %eax, %xmm0
2585 ; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
2586 ; AVX512F-NEXT: movswl 2(%rdi), %eax
2587 ; AVX512F-NEXT: vmovd %eax, %xmm1
2588 ; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
2589 ; AVX512F-NEXT: movswl 4(%rdi), %eax
2590 ; AVX512F-NEXT: vmovd %eax, %xmm2
2591 ; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
2592 ; AVX512F-NEXT: movswl 6(%rdi), %eax
2593 ; AVX512F-NEXT: vmovd %eax, %xmm3
2594 ; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
2595 ; AVX512F-NEXT: movswl 8(%rdi), %eax
2596 ; AVX512F-NEXT: vmovd %eax, %xmm4
2597 ; AVX512F-NEXT: vcvtph2ps %ymm4, %zmm4
2598 ; AVX512F-NEXT: movswl 10(%rdi), %eax
2599 ; AVX512F-NEXT: vmovd %eax, %xmm5
2600 ; AVX512F-NEXT: vcvtph2ps %ymm5, %zmm5
2601 ; AVX512F-NEXT: movswl 12(%rdi), %eax
2602 ; AVX512F-NEXT: vmovd %eax, %xmm6
2603 ; AVX512F-NEXT: vcvtph2ps %ymm6, %zmm6
2604 ; AVX512F-NEXT: movswl 14(%rdi), %eax
2605 ; AVX512F-NEXT: vmovd %eax, %xmm7
2606 ; AVX512F-NEXT: vcvtph2ps %ymm7, %zmm7
2607 ; AVX512F-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
2608 ; AVX512F-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
2609 ; AVX512F-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
2610 ; AVX512F-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
2611 ; AVX512F-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
2612 ; AVX512F-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
2613 ; AVX512F-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
2614 ; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
2615 ; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
2616 ; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
2617 ; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
2618 ; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
2619 ; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2620 ; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2621 ; AVX512F-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
2622 ; AVX512F-NEXT: retq
2623 ;
2624 ; AVX512VL-LABEL: load_cvt_8i16_to_8f64:
2625 ; AVX512VL: # BB#0:
2626 ; AVX512VL-NEXT: movswl (%rdi), %eax
2627 ; AVX512VL-NEXT: vmovd %eax, %xmm0
2628 ; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
2629 ; AVX512VL-NEXT: movswl 2(%rdi), %eax
2630 ; AVX512VL-NEXT: vmovd %eax, %xmm1
2631 ; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
2632 ; AVX512VL-NEXT: movswl 4(%rdi), %eax
2633 ; AVX512VL-NEXT: vmovd %eax, %xmm2
2634 ; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
2635 ; AVX512VL-NEXT: movswl 6(%rdi), %eax
2636 ; AVX512VL-NEXT: vmovd %eax, %xmm3
2637 ; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
2638 ; AVX512VL-NEXT: movswl 8(%rdi), %eax
2639 ; AVX512VL-NEXT: vmovd %eax, %xmm4
2640 ; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4
2641 ; AVX512VL-NEXT: movswl 10(%rdi), %eax
2642 ; AVX512VL-NEXT: vmovd %eax, %xmm5
2643 ; AVX512VL-NEXT: vcvtph2ps %xmm5, %xmm5
2644 ; AVX512VL-NEXT: movswl 12(%rdi), %eax
2645 ; AVX512VL-NEXT: vmovd %eax, %xmm6
2646 ; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm6
2647 ; AVX512VL-NEXT: movswl 14(%rdi), %eax
2648 ; AVX512VL-NEXT: vmovd %eax, %xmm7
2649 ; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm7
2650 ; AVX512VL-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
2651 ; AVX512VL-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
2652 ; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
2653 ; AVX512VL-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
2654 ; AVX512VL-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
2655 ; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
2656 ; AVX512VL-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
2657 ; AVX512VL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
2658 ; AVX512VL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
2659 ; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
2660 ; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
2661 ; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
2662 ; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2663 ; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2664 ; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
2665 ; AVX512VL-NEXT: retq
2006 ; AVX512-LABEL: load_cvt_8i16_to_8f64:
2007 ; AVX512: # BB#0:
2008 ; AVX512-NEXT: movswl (%rdi), %eax
2009 ; AVX512-NEXT: vmovd %eax, %xmm0
2010 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
2011 ; AVX512-NEXT: movswl 2(%rdi), %eax
2012 ; AVX512-NEXT: vmovd %eax, %xmm1
2013 ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
2014 ; AVX512-NEXT: movswl 4(%rdi), %eax
2015 ; AVX512-NEXT: vmovd %eax, %xmm2
2016 ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
2017 ; AVX512-NEXT: movswl 6(%rdi), %eax
2018 ; AVX512-NEXT: vmovd %eax, %xmm3
2019 ; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
2020 ; AVX512-NEXT: movswl 8(%rdi), %eax
2021 ; AVX512-NEXT: vmovd %eax, %xmm4
2022 ; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
2023 ; AVX512-NEXT: movswl 10(%rdi), %eax
2024 ; AVX512-NEXT: vmovd %eax, %xmm5
2025 ; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
2026 ; AVX512-NEXT: movswl 12(%rdi), %eax
2027 ; AVX512-NEXT: vmovd %eax, %xmm6
2028 ; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
2029 ; AVX512-NEXT: movswl 14(%rdi), %eax
2030 ; AVX512-NEXT: vmovd %eax, %xmm7
2031 ; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
2032 ; AVX512-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
2033 ; AVX512-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
2034 ; AVX512-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
2035 ; AVX512-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
2036 ; AVX512-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
2037 ; AVX512-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
2038 ; AVX512-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
2039 ; AVX512-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
2040 ; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
2041 ; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
2042 ; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
2043 ; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
2044 ; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2045 ; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2046 ; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
2047 ; AVX512-NEXT: retq
26662048 %1 = load <8 x i16>, <8 x i16>* %a0
26672049 %2 = bitcast <8 x i16> %1 to <8 x half>
26682050 %3 = fpext <8 x half> %2 to <8 x double>
26692051 ret <8 x double> %3
26702052 }
26742056 ;
26752057
26762058 define i16 @cvt_f32_to_i16(float %a0) nounwind {
2677 ; AVX1-LABEL: cvt_f32_to_i16:
2678 ; AVX1: # BB#0:
2679 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2680 ; AVX1-NEXT: vmovd %xmm0, %eax
2681 ; AVX1-NEXT: # kill: %AX %AX %EAX
2682 ; AVX1-NEXT: retq
2683 ;
2684 ; AVX2-LABEL: cvt_f32_to_i16:
2685 ; AVX2: # BB#0:
2686 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2687 ; AVX2-NEXT: vmovd %xmm0, %eax
2688 ; AVX2-NEXT: # kill: %AX %AX %EAX
2689 ; AVX2-NEXT: retq
2690 ;
2691 ; AVX512F-LABEL: cvt_f32_to_i16:
2692 ; AVX512F: # BB#0:
2693 ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %ZMM0
2694 ; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
2695 ; AVX512F-NEXT: vmovd %xmm0, %eax
2696 ; AVX512F-NEXT: # kill: %AX %AX %EAX
2697 ; AVX512F-NEXT: vzeroupper
2698 ; AVX512F-NEXT: retq
2699 ;
2700 ; AVX512VL-LABEL: cvt_f32_to_i16:
2701 ; AVX512VL: # BB#0:
2702 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2703 ; AVX512VL-NEXT: vmovd %xmm0, %eax
2704 ; AVX512VL-NEXT: # kill: %AX %AX %EAX
2705 ; AVX512VL-NEXT: retq
2059 ; ALL-LABEL: cvt_f32_to_i16:
2060 ; ALL: # BB#0:
2061 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2062 ; ALL-NEXT: vmovd %xmm0, %eax
2063 ; ALL-NEXT: # kill: %AX %AX %EAX
2064 ; ALL-NEXT: retq
27062065 %1 = fptrunc float %a0 to half
27072066 %2 = bitcast half %1 to i16
27082067 ret i16 %2
27092068 }
27102069
27112070 define <4 x i16> @cvt_4f32_to_4i16(<4 x float> %a0) nounwind {
2712 ; AVX1-LABEL: cvt_4f32_to_4i16:
2713 ; AVX1: # BB#0:
2714 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2715 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2716 ; AVX1-NEXT: vmovd %xmm1, %eax
2717 ; AVX1-NEXT: shll $16, %eax
2718 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1
2719 ; AVX1-NEXT: vmovd %xmm1, %ecx
2720 ; AVX1-NEXT: movzwl %cx, %ecx
2721 ; AVX1-NEXT: orl %eax, %ecx
2722 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2723 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2724 ; AVX1-NEXT: vmovd %xmm1, %eax
2725 ; AVX1-NEXT: shll $16, %eax
2726 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2727 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2728 ; AVX1-NEXT: vmovd %xmm0, %edx
2729 ; AVX1-NEXT: movzwl %dx, %edx
2730 ; AVX1-NEXT: orl %eax, %edx
2731 ; AVX1-NEXT: shlq $32, %rdx
2732 ; AVX1-NEXT: orq %rcx, %rdx
2733 ; AVX1-NEXT: vmovq %rdx, %xmm0
2734 ; AVX1-NEXT: retq
2735 ;
2736 ; AVX2-LABEL: cvt_4f32_to_4i16:
2737 ; AVX2: # BB#0:
2738 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2739 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2740 ; AVX2-NEXT: vmovd %xmm1, %eax
2741 ; AVX2-NEXT: shll $16, %eax
2742 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1
2743 ; AVX2-NEXT: vmovd %xmm1, %ecx
2744 ; AVX2-NEXT: movzwl %cx, %ecx
2745 ; AVX2-NEXT: orl %eax, %ecx
2746 ; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2747 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2748 ; AVX2-NEXT: vmovd %xmm1, %eax
2749 ; AVX2-NEXT: shll $16, %eax
2750 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2751 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2752 ; AVX2-NEXT: vmovd %xmm0, %edx
2753 ; AVX2-NEXT: movzwl %dx, %edx
2754 ; AVX2-NEXT: orl %eax, %edx
2755 ; AVX2-NEXT: shlq $32, %rdx
2756 ; AVX2-NEXT: orq %rcx, %rdx
2757 ; AVX2-NEXT: vmovq %rdx, %xmm0
2758 ; AVX2-NEXT: retq
2759 ;
2760 ; AVX512F-LABEL: cvt_4f32_to_4i16:
2761 ; AVX512F: # BB#0:
2762 ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %ZMM0
2763 ; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
2764 ; AVX512F-NEXT: vmovd %xmm1, %eax
2765 ; AVX512F-NEXT: movzwl %ax, %eax
2766 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2767 ; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
2768 ; AVX512F-NEXT: vmovd %xmm1, %ecx
2769 ; AVX512F-NEXT: shll $16, %ecx
2770 ; AVX512F-NEXT: orl %eax, %ecx
2771 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
2772 ; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
2773 ; AVX512F-NEXT: vmovd %xmm1, %eax
2774 ; AVX512F-NEXT: movzwl %ax, %eax
2775 ; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
2776 ; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
2777 ; AVX512F-NEXT: vmovd %xmm0, %edx
2778 ; AVX512F-NEXT: shll $16, %edx
2779 ; AVX512F-NEXT: orl %eax, %edx
2780 ; AVX512F-NEXT: shlq $32, %rdx
2781 ; AVX512F-NEXT: orq %rcx, %rdx
2782 ; AVX512F-NEXT: vmovq %rdx, %xmm0
2783 ; AVX512F-NEXT: vzeroupper
2784 ; AVX512F-NEXT: retq
2785 ;
2786 ; AVX512VL-LABEL: cvt_4f32_to_4i16:
2787 ; AVX512VL: # BB#0:
2788 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2789 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2790 ; AVX512VL-NEXT: vmovd %xmm1, %eax
2791 ; AVX512VL-NEXT: shll $16, %eax
2792 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
2793 ; AVX512VL-NEXT: vmovd %xmm1, %ecx
2794 ; AVX512VL-NEXT: movzwl %cx, %ecx
2795 ; AVX512VL-NEXT: orl %eax, %ecx
2796 ; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2797 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2798 ; AVX512VL-NEXT: vmovd %xmm1, %eax
2799 ; AVX512VL-NEXT: shll $16, %eax
2800 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2801 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2802 ; AVX512VL-NEXT: vmovd %xmm0, %edx
2803 ; AVX512VL-NEXT: movzwl %dx, %edx
2804 ; AVX512VL-NEXT: orl %eax, %edx
2805 ; AVX512VL-NEXT: shlq $32, %rdx
2806 ; AVX512VL-NEXT: orq %rcx, %rdx
2807 ; AVX512VL-NEXT: vmovq %rdx, %xmm0
2808 ; AVX512VL-NEXT: retq
2071 ; ALL-LABEL: cvt_4f32_to_4i16:
2072 ; ALL: # BB#0:
2073 ; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2074 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2075 ; ALL-NEXT: vmovd %xmm1, %eax
2076 ; ALL-NEXT: shll $16, %eax
2077 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
2078 ; ALL-NEXT: vmovd %xmm1, %ecx
2079 ; ALL-NEXT: movzwl %cx, %ecx
2080 ; ALL-NEXT: orl %eax, %ecx
2081 ; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2082 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2083 ; ALL-NEXT: vmovd %xmm1, %eax
2084 ; ALL-NEXT: shll $16, %eax
2085 ; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2086 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2087 ; ALL-NEXT: vmovd %xmm0, %edx
2088 ; ALL-NEXT: movzwl %dx, %edx
2089 ; ALL-NEXT: orl %eax, %edx
2090 ; ALL-NEXT: shlq $32, %rdx
2091 ; ALL-NEXT: orq %rcx, %rdx
2092 ; ALL-NEXT: vmovq %rdx, %xmm0
2093 ; ALL-NEXT: retq
28092094 %1 = fptrunc <4 x float> %a0 to <4 x half>
28102095 %2 = bitcast <4 x half> %1 to <4 x i16>
28112096 ret <4 x i16> %2
28122097 }
28642149 ;
28652150 ; AVX512F-LABEL: cvt_4f32_to_8i16_undef:
28662151 ; AVX512F: # BB#0:
2867 ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %ZMM0
2868 ; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
2152 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2153 ; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
28692154 ; AVX512F-NEXT: vmovd %xmm1, %eax
2870 ; AVX512F-NEXT: movzwl %ax, %eax
2871 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2872 ; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
2155 ; AVX512F-NEXT: shll $16, %eax
2156 ; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm1
28732157 ; AVX512F-NEXT: vmovd %xmm1, %ecx
2874 ; AVX512F-NEXT: shll $16, %ecx
2158 ; AVX512F-NEXT: movzwl %cx, %ecx
28752159 ; AVX512F-NEXT: orl %eax, %ecx
2876 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
2877 ; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
2160 ; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2161 ; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
28782162 ; AVX512F-NEXT: vmovd %xmm1, %eax
2879 ; AVX512F-NEXT: movzwl %ax, %eax
2880 ; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
2881 ; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
2163 ; AVX512F-NEXT: shll $16, %eax
2164 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2165 ; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
28822166 ; AVX512F-NEXT: vmovd %xmm0, %edx
2883 ; AVX512F-NEXT: shll $16, %edx
2167 ; AVX512F-NEXT: movzwl %dx, %edx
28842168 ; AVX512F-NEXT: orl %eax, %edx
28852169 ; AVX512F-NEXT: shlq $32, %rdx
28862170 ; AVX512F-NEXT: orq %rcx, %rdx
28872171 ; AVX512F-NEXT: vmovq %rdx, %xmm0
28882172 ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
2889 ; AVX512F-NEXT: vzeroupper
28902173 ; AVX512F-NEXT: retq
28912174 ;
28922175 ; AVX512VL-LABEL: cvt_4f32_to_8i16_undef:
29732256 ;
29742257 ; AVX512F-LABEL: cvt_4f32_to_8i16_zero:
29752258 ; AVX512F: # BB#0:
2976 ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %ZMM0
2977 ; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
2259 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2260 ; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
29782261 ; AVX512F-NEXT: vmovd %xmm1, %eax
2979 ; AVX512F-NEXT: movzwl %ax, %eax
2980 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2981 ; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
2262 ; AVX512F-NEXT: shll $16, %eax
2263 ; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm1
29822264 ; AVX512F-NEXT: vmovd %xmm1, %ecx
2983 ; AVX512F-NEXT: shll $16, %ecx
2265 ; AVX512F-NEXT: movzwl %cx, %ecx
29842266 ; AVX512F-NEXT: orl %eax, %ecx
2985 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
2986 ; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
2267 ; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2268 ; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
29872269 ; AVX512F-NEXT: vmovd %xmm1, %eax
2988 ; AVX512F-NEXT: movzwl %ax, %eax
2989 ; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
2990 ; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
2270 ; AVX512F-NEXT: shll $16, %eax
2271 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2272 ; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
29912273 ; AVX512F-NEXT: vmovd %xmm0, %edx
2992 ; AVX512F-NEXT: shll $16, %edx
2274 ; AVX512F-NEXT: movzwl %dx, %edx
29932275 ; AVX512F-NEXT: orl %eax, %edx
29942276 ; AVX512F-NEXT: shlq $32, %rdx
29952277 ; AVX512F-NEXT: orq %rcx, %rdx
29962278 ; AVX512F-NEXT: vmovq %rdx, %xmm0
29972279 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
2998 ; AVX512F-NEXT: vzeroupper
29992280 ; AVX512F-NEXT: retq
30002281 ;
30012282 ; AVX512VL-LABEL: cvt_4f32_to_8i16_zero:
30322313 }
30332314
30342315 define <8 x i16> @cvt_8f32_to_8i16(<8 x float> %a0) nounwind {
3035 ; AVX1-LABEL: cvt_8f32_to_8i16:
3036 ; AVX1: # BB#0:
3037 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3038 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
3039 ; AVX1-NEXT: vmovd %xmm1, %eax
3040 ; AVX1-NEXT: shll $16, %eax
3041 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1
3042 ; AVX1-NEXT: vmovd %xmm1, %ecx
3043 ; AVX1-NEXT: movzwl %cx, %ecx
3044 ; AVX1-NEXT: orl %eax, %ecx
3045 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
3046 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
3047 ; AVX1-NEXT: vmovd %xmm1, %edx
3048 ; AVX1-NEXT: shll $16, %edx
3049 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
3050 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
3051 ; AVX1-NEXT: vmovd %xmm1, %eax
3052 ; AVX1-NEXT: movzwl %ax, %eax
3053 ; AVX1-NEXT: orl %edx, %eax
3054 ; AVX1-NEXT: shlq $32, %rax
3055 ; AVX1-NEXT: orq %rcx, %rax
3056 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
3057 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3058 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
3059 ; AVX1-NEXT: vmovd %xmm1, %ecx
3060 ; AVX1-NEXT: shll $16, %ecx
3061 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1
3062 ; AVX1-NEXT: vmovd %xmm1, %edx
3063 ; AVX1-NEXT: movzwl %dx, %edx
3064 ; AVX1-NEXT: orl %ecx, %edx
3065 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
3066 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
3067 ; AVX1-NEXT: vmovd %xmm1, %ecx
3068 ; AVX1-NEXT: shll $16, %ecx
3069 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3070 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
3071 ; AVX1-NEXT: vmovd %xmm0, %esi
3072 ; AVX1-NEXT: movzwl %si, %esi
3073 ; AVX1-NEXT: orl %ecx, %esi
3074 ; AVX1-NEXT: shlq $32, %rsi
3075 ; AVX1-NEXT: orq %rdx, %rsi
3076 ; AVX1-NEXT: vmovq %rsi, %xmm0
3077 ; AVX1-NEXT: vmovq %rax, %xmm1
3078 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
3079 ; AVX1-NEXT: vzeroupper
3080 ; AVX1-NEXT: retq
3081 ;
3082 ; AVX2-LABEL: cvt_8f32_to_8i16:
3083 ; AVX2: # BB#0:
3084 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3085 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
3086 ; AVX2-NEXT: vmovd %xmm1, %eax
3087 ; AVX2-NEXT: shll $16, %eax
3088 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1
3089 ; AVX2-NEXT: vmovd %xmm1, %ecx
3090 ; AVX2-NEXT: movzwl %cx, %ecx
3091 ; AVX2-NEXT: orl %eax, %ecx
3092 ; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
3093 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
3094 ; AVX2-NEXT: vmovd %xmm1, %edx
3095 ; AVX2-NEXT: shll $16, %edx
3096 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
3097 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
3098 ; AVX2-NEXT: vmovd %xmm1, %eax
3099 ; AVX2-NEXT: movzwl %ax, %eax
3100 ; AVX2-NEXT: orl %edx, %eax
3101 ; AVX2-NEXT: shlq $32, %rax
3102 ; AVX2-NEXT: orq %rcx, %rax
3103 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
3104 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3105 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
3106 ; AVX2-NEXT: vmovd %xmm1, %ecx
3107 ; AVX2-NEXT: shll $16, %ecx
3108 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1
3109 ; AVX2-NEXT: vmovd %xmm1, %edx
3110 ; AVX2-NEXT: movzwl %dx, %edx
3111 ; AVX2-NEXT: orl %ecx, %edx
3112 ; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
3113 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
3114 ; AVX2-NEXT: vmovd %xmm1, %ecx
3115 ; AVX2-NEXT: shll $16, %ecx
3116 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3117 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
3118 ; AVX2-NEXT: vmovd %xmm0, %esi
3119 ; AVX2-NEXT: movzwl %si, %esi
3120 ; AVX2-NEXT: orl %ecx, %esi
3121 ; AVX2-NEXT: shlq $32, %rsi
3122 ; AVX2-NEXT: orq %rdx, %rsi
3123 ; AVX2-NEXT: vmovq %rsi, %xmm0
3124 ; AVX2-NEXT: vmovq %rax, %xmm1
3125 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
3126 ; AVX2-NEXT: vzeroupper
3127 ; AVX2-NEXT: retq
3128 ;
3129 ; AVX512F-LABEL: cvt_8f32_to_8i16:
3130 ; AVX512F: # BB#0:
3131 ; AVX512F-NEXT: # kill: %YMM0 %YMM0 %ZMM0
3132 ; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
3133 ; AVX512F-NEXT: vmovd %xmm1, %eax
3134 ; AVX512F-NEXT: movzwl %ax, %eax
3135 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3136 ; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
3137 ; AVX512F-NEXT: vmovd %xmm1, %ecx
3138 ; AVX512F-NEXT: shll $16, %ecx
3139 ; AVX512F-NEXT: orl %eax, %ecx
3140 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
3141 ; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
3142 ; AVX512F-NEXT: vmovd %xmm1, %eax
3143 ; AVX512F-NEXT: movzwl %ax, %edx
3144 ; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
3145 ; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
3146 ; AVX512F-NEXT: vmovd %xmm1, %eax
3147 ; AVX512F-NEXT: shll $16, %eax
3148 ; AVX512F-NEXT: orl %edx, %eax
3149 ; AVX512F-NEXT: shlq $32, %rax
3150 ; AVX512F-NEXT: orq %rcx, %rax
3151 ; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
3152 ; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
3153 ; AVX512F-NEXT: vmovd %xmm1, %ecx
3154 ; AVX512F-NEXT: movzwl %cx, %ecx
3155 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3156 ; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
3157 ; AVX512F-NEXT: vmovd %xmm1, %edx
3158 ; AVX512F-NEXT: shll $16, %edx
3159 ; AVX512F-NEXT: orl %ecx, %edx
3160 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
3161 ; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
3162 ; AVX512F-NEXT: vmovd %xmm1, %ecx
3163 ; AVX512F-NEXT: movzwl %cx, %ecx
3164 ; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
3165 ; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
3166 ; AVX512F-NEXT: vmovd %xmm0, %esi
3167 ; AVX512F-NEXT: shll $16, %esi
3168 ; AVX512F-NEXT: orl %ecx, %esi
3169 ; AVX512F-NEXT: shlq $32, %rsi
3170 ; AVX512F-NEXT: orq %rdx, %rsi
3171 ; AVX512F-NEXT: vmovq %rsi, %xmm0
3172 ; AVX512F-NEXT: vmovq %rax, %xmm1
3173 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
3174 ; AVX512F-NEXT: vzeroupper
3175 ; AVX512F-NEXT: retq
3176 ;
3177 ; AVX512VL-LABEL: cvt_8f32_to_8i16:
3178 ; AVX512VL: # BB#0:
3179 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3180 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
3181 ; AVX512VL-NEXT: vmovd %xmm1, %eax
3182 ; AVX512VL-NEXT: shll $16, %eax
3183 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
3184 ; AVX512VL-NEXT: vmovd %xmm1, %ecx
3185 ; AVX512VL-NEXT: movzwl %cx, %ecx
3186 ; AVX512VL-NEXT: orl %eax, %ecx
3187 ; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
3188 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
3189 ; AVX512VL-NEXT: vmovd %xmm1, %edx
3190 ; AVX512VL-NEXT: shll $16, %edx
3191 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
3192 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
3193 ; AVX512VL-NEXT: vmovd %xmm1, %eax
3194 ; AVX512VL-NEXT: movzwl %ax, %eax
3195 ; AVX512VL-NEXT: orl %edx, %eax
3196 ; AVX512VL-NEXT: shlq $32, %rax
3197 ; AVX512VL-NEXT: orq %rcx, %rax
3198 ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
3199 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3200 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
3201 ; AVX512VL-NEXT: vmovd %xmm1, %ecx
3202 ; AVX512VL-NEXT: shll $16, %ecx
3203 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
3204 ; AVX512VL-NEXT: vmovd %xmm1, %edx
3205 ; AVX512VL-NEXT: movzwl %dx, %edx
3206 ; AVX512VL-NEXT: orl %ecx, %edx
3207 ; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
3208 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
3209 ; AVX512VL-NEXT: vmovd %xmm1, %ecx
3210 ; AVX512VL-NEXT: shll $16, %ecx
3211 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3212 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
3213 ; AVX512VL-NEXT: vmovd %xmm0, %esi
3214 ; AVX512VL-NEXT: movzwl %si, %esi
3215 ; AVX512VL-NEXT: orl %ecx, %esi
3216 ; AVX512VL-NEXT: shlq $32, %rsi
3217 ; AVX512VL-NEXT: orq %rdx, %rsi
3218 ; AVX512VL-NEXT: vmovq %rsi, %xmm0
3219 ; AVX512VL-NEXT: vmovq %rax, %xmm1
3220 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
3221 ; AVX512VL-NEXT: vzeroupper
3222 ; AVX512VL-NEXT: retq
2316 ; ALL-LABEL: cvt_8f32_to_8i16:
2317 ; ALL: # BB#0:
2318 ; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2319 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2320 ; ALL-NEXT: vmovd %xmm1, %eax
2321 ; ALL-NEXT: shll $16, %eax
2322 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
2323 ; ALL-NEXT: vmovd %xmm1, %ecx
2324 ; ALL-NEXT: movzwl %cx, %ecx
2325 ; ALL-NEXT: orl %eax, %ecx
2326 ; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2327 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2328 ; ALL-NEXT: vmovd %xmm1, %edx
2329 ; ALL-NEXT: shll $16, %edx
2330 ; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
2331 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2332 ; ALL-NEXT: vmovd %xmm1, %eax
2333 ; ALL-NEXT: movzwl %ax, %eax
2334 ; ALL-NEXT: orl %edx, %eax
2335 ; ALL-NEXT: shlq $32, %rax
2336 ; ALL-NEXT: orq %rcx, %rax
2337 ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
2338 ; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2339 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2340 ; ALL-NEXT: vmovd %xmm1, %ecx
2341 ; ALL-NEXT: shll $16, %ecx
2342 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
2343 ; ALL-NEXT: vmovd %xmm1, %edx
2344 ; ALL-NEXT: movzwl %dx, %edx
2345 ; ALL-NEXT: orl %ecx, %edx
2346 ; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2347 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2348 ; ALL-NEXT: vmovd %xmm1, %ecx
2349 ; ALL-NEXT: shll $16, %ecx
2350 ; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2351 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2352 ; ALL-NEXT: vmovd %xmm0, %esi
2353 ; ALL-NEXT: movzwl %si, %esi
2354 ; ALL-NEXT: orl %ecx, %esi
2355 ; ALL-NEXT: shlq $32, %rsi
2356 ; ALL-NEXT: orq %rdx, %rsi
2357 ; ALL-NEXT: vmovq %rsi, %xmm0
2358 ; ALL-NEXT: vmovq %rax, %xmm1
2359 ; ALL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2360 ; ALL-NEXT: vzeroupper
2361 ; ALL-NEXT: retq
32232362 %1 = fptrunc <8 x float> %a0 to <8 x half>
32242363 %2 = bitcast <8 x half> %1 to <8 x i16>
32252364 ret <8 x i16> %2
32262365 }
33602499 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
33612500 ; AVX2-NEXT: retq
33622501 ;
3363 ; AVX512F-LABEL: cvt_16f32_to_16i16:
3364 ; AVX512F: # BB#0:
3365 ; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm1
3366 ; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm2
3367 ; AVX512F-NEXT: vmovd %xmm2, %eax
3368 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
3369 ; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
3370 ; AVX512F-NEXT: vmovd %eax, %xmm3
3371 ; AVX512F-NEXT: vmovd %xmm2, %eax
3372 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
3373 ; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
3374 ; AVX512F-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
3375 ; AVX512F-NEXT: vmovd %xmm2, %eax
3376 ; AVX512F-NEXT: vextractf128 $1, %ymm1, %xmm2
3377 ; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
3378 ; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
3379 ; AVX512F-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
3380 ; AVX512F-NEXT: vmovd %xmm1, %eax
3381 ; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm1
3382 ; AVX512F-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
3383 ; AVX512F-NEXT: vmovd %xmm1, %eax
3384 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
3385 ; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
3386 ; AVX512F-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
3387 ; AVX512F-NEXT: vmovd %xmm1, %eax
3388 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
3389 ; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
3390 ; AVX512F-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
3391 ; AVX512F-NEXT: vmovd %xmm1, %eax
3392 ; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
3393 ; AVX512F-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
3394 ; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
3395 ; AVX512F-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
3396 ; AVX512F-NEXT: vmovd %xmm2, %eax
3397 ; AVX512F-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2
3398 ; AVX512F-NEXT: vmovd %xmm1, %eax
3399 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3400 ; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
3401 ; AVX512F-NEXT: vmovd %eax, %xmm3
3402 ; AVX512F-NEXT: vmovd %xmm1, %eax
3403 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
3404 ; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
3405 ; AVX512F-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
3406 ; AVX512F-NEXT: vmovd %xmm1, %eax
3407 ; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1
3408 ; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
3409 ; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
3410 ; AVX512F-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
3411 ; AVX512F-NEXT: vmovd %xmm0, %eax
3412 ; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm0
3413 ; AVX512F-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
3414 ; AVX512F-NEXT: vmovd %xmm0, %eax
3415 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
3416 ; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
3417 ; AVX512F-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
3418 ; AVX512F-NEXT: vmovd %xmm0, %eax
3419 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
3420 ; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
3421 ; AVX512F-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
3422 ; AVX512F-NEXT: vmovd %xmm0, %eax
3423 ; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
3424 ; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
3425 ; AVX512F-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1
3426 ; AVX512F-NEXT: vmovd %xmm0, %eax
3427 ; AVX512F-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
3428 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
3429 ; AVX512F-NEXT: retq
3430 ;
3431 ; AVX512VL-LABEL: cvt_16f32_to_16i16:
3432 ; AVX512VL: # BB#0:
3433 ; AVX512VL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
3434 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm2
3435 ; AVX512VL-NEXT: vmovd %xmm2, %eax
3436 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
3437 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
3438 ; AVX512VL-NEXT: vmovd %eax, %xmm3
3439 ; AVX512VL-NEXT: vmovd %xmm2, %eax
3440 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
3441 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
3442 ; AVX512VL-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
3443 ; AVX512VL-NEXT: vmovd %xmm2, %eax
3444 ; AVX512VL-NEXT: vextractf128 $1, %ymm1, %xmm2
3445 ; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
3446 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
3447 ; AVX512VL-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
3448 ; AVX512VL-NEXT: vmovd %xmm1, %eax
3449 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm1
3450 ; AVX512VL-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
3451 ; AVX512VL-NEXT: vmovd %xmm1, %eax
3452 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
3453 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
3454 ; AVX512VL-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
3455 ; AVX512VL-NEXT: vmovd %xmm1, %eax
3456 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
3457 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
3458 ; AVX512VL-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
3459 ; AVX512VL-NEXT: vmovd %xmm1, %eax
3460 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
3461 ; AVX512VL-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
3462 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
3463 ; AVX512VL-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
3464 ; AVX512VL-NEXT: vmovd %xmm2, %eax
3465 ; AVX512VL-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2
3466 ; AVX512VL-NEXT: vmovd %xmm1, %eax
3467 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3468 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
3469 ; AVX512VL-NEXT: vmovd %eax, %xmm3
3470 ; AVX512VL-NEXT: vmovd %xmm1, %eax
3471 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
3472 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
3473 ; AVX512VL-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
3474 ; AVX512VL-NEXT: vmovd %xmm1, %eax
3475 ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
3476 ; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
3477 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
3478 ; AVX512VL-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
3479 ; AVX512VL-NEXT: vmovd %xmm0, %eax
3480 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm0
3481 ; AVX512VL-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
3482 ; AVX512VL-NEXT: vmovd %xmm0, %eax
3483 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
3484 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
3485 ; AVX512VL-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
3486 ; AVX512VL-NEXT: vmovd %xmm0, %eax
3487 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
3488 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
3489 ; AVX512VL-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
3490 ; AVX512VL-NEXT: vmovd %xmm0, %eax
3491 ; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
3492 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
3493 ; AVX512VL-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1
3494 ; AVX512VL-NEXT: vmovd %xmm0, %eax
3495 ; AVX512VL-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
3496 ; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
3497 ; AVX512VL-NEXT: retq
2502 ; AVX512-LABEL: cvt_16f32_to_16i16:
2503 ; AVX512: # BB#0:
2504 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
2505 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm2
2506 ; AVX512-NEXT: vmovd %xmm2, %eax
2507 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
2508 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
2509 ; AVX512-NEXT: vmovd %eax, %xmm3
2510 ; AVX512-NEXT: vmovd %xmm2, %eax
2511 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
2512 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
2513 ; AVX512-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
2514 ; AVX512-NEXT: vmovd %xmm2, %eax
2515 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
2516 ; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
2517 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2518 ; AVX512-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
2519 ; AVX512-NEXT: vmovd %xmm1, %eax
2520 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm1
2521 ; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
2522 ; AVX512-NEXT: vmovd %xmm1, %eax
2523 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
2524 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2525 ; AVX512-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
2526 ; AVX512-NEXT: vmovd %xmm1, %eax
2527 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
2528 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2529 ; AVX512-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
2530 ; AVX512-NEXT: vmovd %xmm1, %eax
2531 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm1
2532 ; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
2533 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
2534 ; AVX512-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
2535 ; AVX512-NEXT: vmovd %xmm2, %eax
2536 ; AVX512-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2
2537 ; AVX512-NEXT: vmovd %xmm1, %eax
2538 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2539 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2540 ; AVX512-NEXT: vmovd %eax, %xmm3
2541 ; AVX512-NEXT: vmovd %xmm1, %eax
2542 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
2543 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2544 ; AVX512-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
2545 ; AVX512-NEXT: vmovd %xmm1, %eax
2546 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
2547 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
2548 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2549 ; AVX512-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
2550 ; AVX512-NEXT: vmovd %xmm0, %eax
2551 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm0
2552 ; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
2553 ; AVX512-NEXT: vmovd %xmm0, %eax
2554 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
2555 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2556 ; AVX512-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
2557 ; AVX512-NEXT: vmovd %xmm0, %eax
2558 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
2559 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2560 ; AVX512-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
2561 ; AVX512-NEXT: vmovd %xmm0, %eax
2562 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
2563 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2564 ; AVX512-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1
2565 ; AVX512-NEXT: vmovd %xmm0, %eax
2566 ; AVX512-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
2567 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
2568 ; AVX512-NEXT: retq
34982569 %1 = fptrunc <16 x float> %a0 to <16 x half>
34992570 %2 = bitcast <16 x half> %1 to <16 x i16>
35002571 ret <16 x i16> %2
35012572 }
35052576 ;
35062577
35072578 define void @store_cvt_f32_to_i16(float %a0, i16* %a1) nounwind {
3508 ; AVX1-LABEL: store_cvt_f32_to_i16:
3509 ; AVX1: # BB#0:
3510 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
3511 ; AVX1-NEXT: vmovd %xmm0, %eax
3512 ; AVX1-NEXT: movw %ax, (%rdi)
3513 ; AVX1-NEXT: retq
3514 ;
3515 ; AVX2-LABEL: store_cvt_f32_to_i16:
3516 ; AVX2: # BB#0:
3517 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
3518 ; AVX2-NEXT: vmovd %xmm0, %eax
3519 ; AVX2-NEXT: movw %ax, (%rdi)
3520 ; AVX2-NEXT: retq
3521 ;
3522 ; AVX512F-LABEL: store_cvt_f32_to_i16:
3523 ; AVX512F: # BB#0:
3524 ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %ZMM0
3525 ; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
3526 ; AVX512F-NEXT: vmovd %xmm0, %eax
3527 ; AVX512F-NEXT: movw %ax, (%rdi)
3528 ; AVX512F-NEXT: vzeroupper
3529 ; AVX512F-NEXT: retq
3530 ;
3531 ; AVX512VL-LABEL: store_cvt_f32_to_i16:
3532 ; AVX512VL: # BB#0:
3533 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
3534 ; AVX512VL-NEXT: vmovd %xmm0, %eax
3535 ; AVX512VL-NEXT: movw %ax, (%rdi)
3536 ; AVX512VL-NEXT: retq
2579 ; ALL-LABEL: store_cvt_f32_to_i16:
2580 ; ALL: # BB#0:
2581 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2582 ; ALL-NEXT: vmovd %xmm0, %eax
2583 ; ALL-NEXT: movw %ax, (%rdi)
2584 ; ALL-NEXT: retq
35372585 %1 = fptrunc float %a0 to half
35382586 %2 = bitcast half %1 to i16
35392587 store i16 %2, i16* %a1
35402588 ret void
35412589 }
35422590
35432591 define void @store_cvt_4f32_to_4i16(<4 x float> %a0, <4 x i16>* %a1) nounwind {
3544 ; AVX1-LABEL: store_cvt_4f32_to_4i16:
3545 ; AVX1: # BB#0:
3546 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3547 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
3548 ; AVX1-NEXT: vmovd %xmm1, %eax
3549 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
3550 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
3551 ; AVX1-NEXT: vmovd %xmm1, %ecx
3552 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
3553 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
3554 ; AVX1-NEXT: vmovd %xmm1, %edx
3555 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
3556 ; AVX1-NEXT: vmovd %xmm0, %esi
3557 ; AVX1-NEXT: movw %si, (%rdi)
3558 ; AVX1-NEXT: movw %dx, 6(%rdi)
3559 ; AVX1-NEXT: movw %cx, 4(%rdi)
3560 ; AVX1-NEXT: movw %ax, 2(%rdi)
3561 ; AVX1-NEXT: retq
3562 ;
3563 ; AVX2-LABEL: store_cvt_4f32_to_4i16:
3564 ; AVX2: # BB#0:
3565 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3566 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
3567 ; AVX2-NEXT: vmovd %xmm1, %eax
3568 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
3569 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
3570 ; AVX2-NEXT: vmovd %xmm1, %ecx
3571 ; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
3572 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
3573 ; AVX2-NEXT: vmovd %xmm1, %edx
3574 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
3575 ; AVX2-NEXT: vmovd %xmm0, %esi
3576 ; AVX2-NEXT: movw %si, (%rdi)
3577 ; AVX2-NEXT: movw %dx, 6(%rdi)
3578 ; AVX2-NEXT: movw %cx, 4(%rdi)
3579 ; AVX2-NEXT: movw %ax, 2(%rdi)
3580 ; AVX2-NEXT: retq
3581 ;
3582 ; AVX512F-LABEL: store_cvt_4f32_to_4i16:
3583 ; AVX512F: # BB#0:
3584 ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %ZMM0
3585 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3586 ; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
3587 ; AVX512F-NEXT: vmovd %xmm1, %eax
3588 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
3589 ; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
3590 ; AVX512F-NEXT: vmovd %xmm1, %ecx
3591 ; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
3592 ; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
3593 ; AVX512F-NEXT: vmovd %xmm1, %edx
3594 ; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
3595 ; AVX512F-NEXT: vmovd %xmm0, %esi
3596 ; AVX512F-NEXT: movw %si, (%rdi)
3597 ; AVX512F-NEXT: movw %dx, 6(%rdi)
3598 ; AVX512F-NEXT: movw %cx, 4(%rdi)
3599 ; AVX512F-NEXT: movw %ax, 2(%rdi)
3600 ; AVX512F-NEXT: vzeroupper
3601 ; AVX512F-NEXT: retq
3602 ;
3603 ; AVX512VL-LABEL: store_cvt_4f32_to_4i16:
3604 ; AVX512VL: # BB#0:
3605 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3606 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
3607 ; AVX512VL-NEXT: vmovd %xmm1, %eax
3608 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
3609 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
3610 ; AVX512VL-NEXT: vmovd %xmm1, %ecx
3611 ; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
3612 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
3613 ; AVX512VL-NEXT: vmovd %xmm1, %edx
3614 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
3615 ; AVX512VL-NEXT: vmovd %xmm0, %esi
3616 ; AVX512VL-NEXT: movw %si, (%rdi)
3617 ; AVX512VL-NEXT: movw %dx, 6(%rdi)
3618 ; AVX512VL-NEXT: movw %cx, 4(%rdi)
3619 ; AVX512VL-NEXT: movw %ax, 2(%rdi)
3620 ; AVX512VL-NEXT: retq
2592 ; ALL-LABEL: store_cvt_4f32_to_4i16:
2593 ; ALL: # BB#0:
2594 ; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2595 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2596 ; ALL-NEXT: vmovd %xmm1, %eax
2597 ; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
2598 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2599 ; ALL-NEXT: vmovd %xmm1, %ecx
2600 ; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2601 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2602 ; ALL-NEXT: vmovd %xmm1, %edx
2603 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2604 ; ALL-NEXT: vmovd %xmm0, %esi
2605 ; ALL-NEXT: movw %si, (%rdi)
2606 ; ALL-NEXT: movw %dx, 6(%rdi)
2607 ; ALL-NEXT: movw %cx, 4(%rdi)
2608 ; ALL-NEXT: movw %ax, 2(%rdi)
2609 ; ALL-NEXT: retq
36212610 %1 = fptrunc <4 x float> %a0 to <4 x half>
36222611 %2 = bitcast <4 x half> %1 to <4 x i16>
36232612 store <4 x i16> %2, <4 x i16>* %a1
36242613 ret void
36252614 }
36792668 ;
36802669 ; AVX512F-LABEL: store_cvt_4f32_to_8i16_undef:
36812670 ; AVX512F: # BB#0:
3682 ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %ZMM0
3683 ; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
2671 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2672 ; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
36842673 ; AVX512F-NEXT: vmovd %xmm1, %eax
3685 ; AVX512F-NEXT: movzwl %ax, %eax
3686 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3687 ; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
2674 ; AVX512F-NEXT: shll $16, %eax
2675 ; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm1
36882676 ; AVX512F-NEXT: vmovd %xmm1, %ecx
3689 ; AVX512F-NEXT: shll $16, %ecx
2677 ; AVX512F-NEXT: movzwl %cx, %ecx
36902678 ; AVX512F-NEXT: orl %eax, %ecx
3691 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
3692 ; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
2679 ; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2680 ; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
36932681 ; AVX512F-NEXT: vmovd %xmm1, %eax
3694 ; AVX512F-NEXT: movzwl %ax, %eax
3695 ; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
3696 ; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
2682 ; AVX512F-NEXT: shll $16, %eax
2683 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2684 ; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
36972685 ; AVX512F-NEXT: vmovd %xmm0, %edx
3698 ; AVX512F-NEXT: shll $16, %edx
2686 ; AVX512F-NEXT: movzwl %dx, %edx
36992687 ; AVX512F-NEXT: orl %eax, %edx
37002688 ; AVX512F-NEXT: shlq $32, %rdx
37012689 ; AVX512F-NEXT: orq %rcx, %rdx
37022690 ; AVX512F-NEXT: vmovq %rdx, %xmm0
37032691 ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
37042692 ; AVX512F-NEXT: vmovdqa %xmm0, (%rdi)
3705 ; AVX512F-NEXT: vzeroupper
37062693 ; AVX512F-NEXT: retq
37072694 ;
37082695 ; AVX512VL-LABEL: store_cvt_4f32_to_8i16_undef:
37932780 ;
37942781 ; AVX512F-LABEL: store_cvt_4f32_to_8i16_zero:
37952782 ; AVX512F: # BB#0:
3796 ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %ZMM0
3797 ; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
2783 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2784 ; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
37982785 ; AVX512F-NEXT: vmovd %xmm1, %eax
3799 ; AVX512F-NEXT: movzwl %ax, %eax
3800 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3801 ; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
2786 ; AVX512F-NEXT: shll $16, %eax
2787 ; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm1
38022788 ; AVX512F-NEXT: vmovd %xmm1, %ecx
3803 ; AVX512F-NEXT: shll $16, %ecx
2789 ; AVX512F-NEXT: movzwl %cx, %ecx
38042790 ; AVX512F-NEXT: orl %eax, %ecx
3805 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
3806 ; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
2791 ; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2792 ; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
38072793 ; AVX512F-NEXT: vmovd %xmm1, %eax
3808 ; AVX512F-NEXT: movzwl %ax, %eax
3809 ; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
3810 ; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
2794 ; AVX512F-NEXT: shll $16, %eax
2795 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2796 ; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
38112797 ; AVX512F-NEXT: vmovd %xmm0, %edx
3812 ; AVX512F-NEXT: shll $16, %edx
2798 ; AVX512F-NEXT: movzwl %dx, %edx
38132799 ; AVX512F-NEXT: orl %eax, %edx
38142800 ; AVX512F-NEXT: shlq $32, %rdx
38152801 ; AVX512F-NEXT: orq %rcx, %rdx
38162802 ; AVX512F-NEXT: vmovq %rdx, %xmm0
38172803 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
38182804 ; AVX512F-NEXT: vmovdqa %xmm0, (%rdi)
3819 ; AVX512F-NEXT: vzeroupper
38202805 ; AVX512F-NEXT: retq
38212806 ;
38222807 ; AVX512VL-LABEL: store_cvt_4f32_to_8i16_zero:
38552840 }
38562841
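For context on what these checks exercise: with F16C now implied by AVX512F, plain -mattr=+avx512f selects the 128-bit VEX vcvtps2ph/vcvtph2ps forms instead of the old 512-bit EVEX workaround, which is why the AVX512F prefixes above lose their zmm/ymm conversions. Below is a minimal standalone sketch of the same lowering, assuming the test file's x86_64 setup; the RUN line and function name are illustrative, not copied from the file.

; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx512f < %s
; Scalar float -> half truncation. With this patch it should emit a
; VEX-encoded vcvtps2ph on an xmm register instead of a libcall or
; the 512-bit zmm form (sketch; names are hypothetical).
define i16 @cvt_f32_to_i16_sketch(float %a0) nounwind {
  %1 = fptrunc float %a0 to half
  %2 = bitcast half %1 to i16
  ret i16 %2
}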
38572842 define void @store_cvt_8f32_to_8i16(<8 x float> %a0, <8 x i16>* %a1) nounwind {
3858 ; AVX1-LABEL: store_cvt_8f32_to_8i16:
3859 ; AVX1: # BB#0:
3860 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3861 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
3862 ; AVX1-NEXT: vmovd %xmm1, %r8d
3863 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
3864 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
3865 ; AVX1-NEXT: vmovd %xmm1, %r9d
3866 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
3867 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
3868 ; AVX1-NEXT: vmovd %xmm1, %r10d
3869 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3870 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
3871 ; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
3872 ; AVX1-NEXT: vmovd %xmm2, %r11d
3873 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
3874 ; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
3875 ; AVX1-NEXT: vmovd %xmm2, %eax
3876 ; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
3877 ; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
3878 ; AVX1-NEXT: vmovd %xmm2, %ecx
3879 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
3880 ; AVX1-NEXT: vmovd %xmm0, %edx
3881 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm0
3882 ; AVX1-NEXT: vmovd %xmm0, %esi
3883 ; AVX1-NEXT: movw %si, 8(%rdi)
3884 ; AVX1-NEXT: movw %dx, (%rdi)
3885 ; AVX1-NEXT: movw %cx, 14(%rdi)
3886 ; AVX1-NEXT: movw %ax, 12(%rdi)
3887 ; AVX1-NEXT: movw %r11w, 10(%rdi)
3888 ; AVX1-NEXT: movw %r10w, 6(%rdi)
3889 ; AVX1-NEXT: movw %r9w, 4(%rdi)
3890 ; AVX1-NEXT: movw %r8w, 2(%rdi)
3891 ; AVX1-NEXT: vzeroupper
3892 ; AVX1-NEXT: retq
3893 ;
3894 ; AVX2-LABEL: store_cvt_8f32_to_8i16:
3895 ; AVX2: # BB#0:
3896 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3897 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
3898 ; AVX2-NEXT: vmovd %xmm1, %r8d
3899 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
3900 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
3901 ; AVX2-NEXT: vmovd %xmm1, %r9d
3902 ; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
3903 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
3904 ; AVX2-NEXT: vmovd %xmm1, %r10d
3905 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
3906 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
3907 ; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
3908 ; AVX2-NEXT: vmovd %xmm2, %r11d
3909 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
3910 ; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
3911 ; AVX2-NEXT: vmovd %xmm2, %eax
3912 ; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
3913 ; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
3914 ; AVX2-NEXT: vmovd %xmm2, %ecx
3915 ; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
3916 ; AVX2-NEXT: vmovd %xmm0, %edx
3917 ; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm0
3918 ; AVX2-NEXT: vmovd %xmm0, %esi
3919 ; AVX2-NEXT: movw %si, 8(%rdi)
3920 ; AVX2-NEXT: movw %dx, (%rdi)
3921 ; AVX2-NEXT: movw %cx, 14(%rdi)
3922 ; AVX2-NEXT: movw %ax, 12(%rdi)
3923 ; AVX2-NEXT: movw %r11w, 10(%rdi)
3924 ; AVX2-NEXT: movw %r10w, 6(%rdi)
3925 ; AVX2-NEXT: movw %r9w, 4(%rdi)
3926 ; AVX2-NEXT: movw %r8w, 2(%rdi)
3927 ; AVX2-NEXT: vzeroupper
3928 ; AVX2-NEXT: retq
3929 ;
3930 ; AVX512F-LABEL: store_cvt_8f32_to_8i16:
3931 ; AVX512F: # BB#0:
3932 ; AVX512F-NEXT: # kill: %YMM0 %YMM0 %ZMM0
3933 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3934 ; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
3935 ; AVX512F-NEXT: vmovd %xmm1, %r8d
3936 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
3937 ; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
3938 ; AVX512F-NEXT: vmovd %xmm1, %r9d
3939 ; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
3940 ; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
3941 ; AVX512F-NEXT: vmovd %xmm1, %r10d
3942 ; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1
3943 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
3944 ; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
3945 ; AVX512F-NEXT: vmovd %xmm2, %r11d
3946 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
3947 ; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
3948 ; AVX512F-NEXT: vmovd %xmm2, %eax
3949 ; AVX512F-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
3950 ; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
3951 ; AVX512F-NEXT: vmovd %xmm2, %ecx
3952 ; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
3953 ; AVX512F-NEXT: vmovd %xmm0, %edx
3954 ; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm0
3955 ; AVX512F-NEXT: vmovd %xmm0, %esi
3956 ; AVX512F-NEXT: movw %si, 8(%rdi)
3957 ; AVX512F-NEXT: movw %dx, (%rdi)
3958 ; AVX512F-NEXT: movw %cx, 14(%rdi)
3959 ; AVX512F-NEXT: movw %ax, 12(%rdi)
3960 ; AVX512F-NEXT: movw %r11w, 10(%rdi)
3961 ; AVX512F-NEXT: movw %r10w, 6(%rdi)
3962 ; AVX512F-NEXT: movw %r9w, 4(%rdi)
3963 ; AVX512F-NEXT: movw %r8w, 2(%rdi)
3964 ; AVX512F-NEXT: vzeroupper
3965 ; AVX512F-NEXT: retq
3966 ;
3967 ; AVX512VL-LABEL: store_cvt_8f32_to_8i16:
3968 ; AVX512VL: # BB#0:
3969 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
3970 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
3971 ; AVX512VL-NEXT: vmovd %xmm1, %r8d
3972 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
3973 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
3974 ; AVX512VL-NEXT: vmovd %xmm1, %r9d
3975 ; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
3976 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
3977 ; AVX512VL-NEXT: vmovd %xmm1, %r10d
3978 ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
3979 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
3980 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
3981 ; AVX512VL-NEXT: vmovd %xmm2, %r11d
3982 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
3983 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
3984 ; AVX512VL-NEXT: vmovd %xmm2, %eax
3985 ; AVX512VL-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
3986 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
3987 ; AVX512VL-NEXT: vmovd %xmm2, %ecx
3988 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
3989 ; AVX512VL-NEXT: vmovd %xmm0, %edx
3990 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm0
3991 ; AVX512VL-NEXT: vmovd %xmm0, %esi
3992 ; AVX512VL-NEXT: movw %si, 8(%rdi)
3993 ; AVX512VL-NEXT: movw %dx, (%rdi)
3994 ; AVX512VL-NEXT: movw %cx, 14(%rdi)
3995 ; AVX512VL-NEXT: movw %ax, 12(%rdi)
3996 ; AVX512VL-NEXT: movw %r11w, 10(%rdi)
3997 ; AVX512VL-NEXT: movw %r10w, 6(%rdi)
3998 ; AVX512VL-NEXT: movw %r9w, 4(%rdi)
3999 ; AVX512VL-NEXT: movw %r8w, 2(%rdi)
4000 ; AVX512VL-NEXT: vzeroupper
4001 ; AVX512VL-NEXT: retq
2843 ; ALL-LABEL: store_cvt_8f32_to_8i16:
2844 ; ALL: # BB#0:
2845 ; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2846 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2847 ; ALL-NEXT: vmovd %xmm1, %r8d
2848 ; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
2849 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2850 ; ALL-NEXT: vmovd %xmm1, %r9d
2851 ; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2852 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2853 ; ALL-NEXT: vmovd %xmm1, %r10d
2854 ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1
2855 ; ALL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
2856 ; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
2857 ; ALL-NEXT: vmovd %xmm2, %r11d
2858 ; ALL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
2859 ; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
2860 ; ALL-NEXT: vmovd %xmm2, %eax
2861 ; ALL-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
2862 ; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
2863 ; ALL-NEXT: vmovd %xmm2, %ecx
2864 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2865 ; ALL-NEXT: vmovd %xmm0, %edx
2866 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm0
2867 ; ALL-NEXT: vmovd %xmm0, %esi
2868 ; ALL-NEXT: movw %si, 8(%rdi)
2869 ; ALL-NEXT: movw %dx, (%rdi)
2870 ; ALL-NEXT: movw %cx, 14(%rdi)
2871 ; ALL-NEXT: movw %ax, 12(%rdi)
2872 ; ALL-NEXT: movw %r11w, 10(%rdi)
2873 ; ALL-NEXT: movw %r10w, 6(%rdi)
2874 ; ALL-NEXT: movw %r9w, 4(%rdi)
2875 ; ALL-NEXT: movw %r8w, 2(%rdi)
2876 ; ALL-NEXT: vzeroupper
2877 ; ALL-NEXT: retq
40022878 %1 = fptrunc <8 x float> %a0 to <8 x half>
40032879 %2 = bitcast <8 x half> %1 to <8 x i16>
40042880 store <8 x i16> %2, <8 x i16>* %a1
41403016 ; AVX2-NEXT: vzeroupper
41413017 ; AVX2-NEXT: retq
41423018 ;
4143 ; AVX512F-LABEL: store_cvt_16f32_to_16i16:
4144 ; AVX512F: # BB#0:
4145 ; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1
4146 ; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm2
4147 ; AVX512F-NEXT: vextractf128 $1, %ymm2, %xmm3
4148 ; AVX512F-NEXT: vcvtps2ph $4, %zmm3, %ymm4
4149 ; AVX512F-NEXT: vmovd %xmm4, %eax
4150 ; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm4
4151 ; AVX512F-NEXT: movw %ax, 24(%rdi)
4152 ; AVX512F-NEXT: vmovd %xmm4, %eax
4153 ; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm4
4154 ; AVX512F-NEXT: movw %ax, 16(%rdi)
4155 ; AVX512F-NEXT: vmovd %xmm4, %eax
4156 ; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm4
4157 ; AVX512F-NEXT: movw %ax, 8(%rdi)
4158 ; AVX512F-NEXT: vmovd %xmm4, %eax
4159 ; AVX512F-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
4160 ; AVX512F-NEXT: vcvtps2ph $4, %zmm4, %ymm4
4161 ; AVX512F-NEXT: movw %ax, (%rdi)
4162 ; AVX512F-NEXT: vmovd %xmm4, %eax
4163 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
4164 ; AVX512F-NEXT: vcvtps2ph $4, %zmm4, %ymm4
4165 ; AVX512F-NEXT: movw %ax, 30(%rdi)
4166 ; AVX512F-NEXT: vmovd %xmm4, %eax
4167 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
4168 ; AVX512F-NEXT: vcvtps2ph $4, %zmm4, %ymm4
4169 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
4170 ; AVX512F-NEXT: vcvtps2ph $4, %zmm3, %ymm3
4171 ; AVX512F-NEXT: movw %ax, 28(%rdi)
4172 ; AVX512F-NEXT: vmovd %xmm3, %eax
4173 ; AVX512F-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[3,1,2,3]
4174 ; AVX512F-NEXT: vcvtps2ph $4, %zmm3, %ymm3
4175 ; AVX512F-NEXT: movw %ax, 26(%rdi)
4176 ; AVX512F-NEXT: vmovd %xmm3, %eax
4177 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
4178 ; AVX512F-NEXT: vcvtps2ph $4, %zmm3, %ymm3
4179 ; AVX512F-NEXT: movw %ax, 22(%rdi)
4180 ; AVX512F-NEXT: vmovd %xmm3, %eax
4181 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
4182 ; AVX512F-NEXT: vcvtps2ph $4, %zmm3, %ymm3
4183 ; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
4184 ; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
4185 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
4186 ; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
4187 ; AVX512F-NEXT: movw %ax, 20(%rdi)
4188 ; AVX512F-NEXT: vmovd %xmm2, %eax
4189 ; AVX512F-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
4190 ; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
4191 ; AVX512F-NEXT: movw %ax, 18(%rdi)
4192 ; AVX512F-NEXT: vmovd %xmm2, %eax
4193 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
4194 ; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
4195 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
4196 ; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
4197 ; AVX512F-NEXT: movw %ax, 14(%rdi)
4198 ; AVX512F-NEXT: vmovd %xmm1, %eax
4199 ; AVX512F-NEXT: movw %ax, 12(%rdi)
4200 ; AVX512F-NEXT: vmovd %xmm2, %eax
4201 ; AVX512F-NEXT: movw %ax, 10(%rdi)
4202 ; AVX512F-NEXT: vmovd %xmm0, %eax
4203 ; AVX512F-NEXT: movw %ax, 6(%rdi)
4204 ; AVX512F-NEXT: vmovd %xmm3, %eax
4205 ; AVX512F-NEXT: movw %ax, 4(%rdi)
4206 ; AVX512F-NEXT: vmovd %xmm4, %eax
4207 ; AVX512F-NEXT: movw %ax, 2(%rdi)
4208 ; AVX512F-NEXT: vzeroupper
4209 ; AVX512F-NEXT: retq
4210 ;
4211 ; AVX512VL-LABEL: store_cvt_16f32_to_16i16:
4212 ; AVX512VL: # BB#0:
4213 ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
4214 ; AVX512VL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
4215 ; AVX512VL-NEXT: vextractf128 $1, %ymm2, %xmm3
4216 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm3, %xmm4
4217 ; AVX512VL-NEXT: vmovd %xmm4, %eax
4218 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm4
4219 ; AVX512VL-NEXT: movw %ax, 24(%rdi)
4220 ; AVX512VL-NEXT: vmovd %xmm4, %eax
4221 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm4
4222 ; AVX512VL-NEXT: movw %ax, 16(%rdi)
4223 ; AVX512VL-NEXT: vmovd %xmm4, %eax
4224 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm4
4225 ; AVX512VL-NEXT: movw %ax, 8(%rdi)
4226 ; AVX512VL-NEXT: vmovd %xmm4, %eax
4227 ; AVX512VL-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
4228 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm4, %xmm4
4229 ; AVX512VL-NEXT: movw %ax, (%rdi)
4230 ; AVX512VL-NEXT: vmovd %xmm4, %eax
4231 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
4232 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm4, %xmm4
4233 ; AVX512VL-NEXT: movw %ax, 30(%rdi)
4234 ; AVX512VL-NEXT: vmovd %xmm4, %eax
4235 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
4236 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm4, %xmm4
4237 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
4238 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm3, %xmm3
4239 ; AVX512VL-NEXT: movw %ax, 28(%rdi)
4240 ; AVX512VL-NEXT: vmovd %xmm3, %eax
4241 ; AVX512VL-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[3,1,2,3]
4242 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm3, %xmm3
4243 ; AVX512VL-NEXT: movw %ax, 26(%rdi)
4244 ; AVX512VL-NEXT: vmovd %xmm3, %eax
4245 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
4246 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm3, %xmm3
4247 ; AVX512VL-NEXT: movw %ax, 22(%rdi)
4248 ; AVX512VL-NEXT: vmovd %xmm3, %eax
4249 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
4250 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm3, %xmm3
4251 ; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
4252 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
4253 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
4254 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
4255 ; AVX512VL-NEXT: movw %ax, 20(%rdi)
4256 ; AVX512VL-NEXT: vmovd %xmm2, %eax
4257 ; AVX512VL-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
4258 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
4259 ; AVX512VL-NEXT: movw %ax, 18(%rdi)
4260 ; AVX512VL-NEXT: vmovd %xmm2, %eax
4261 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
4262 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
4263 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
4264 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
4265 ; AVX512VL-NEXT: movw %ax, 14(%rdi)
4266 ; AVX512VL-NEXT: vmovd %xmm1, %eax
4267 ; AVX512VL-NEXT: movw %ax, 12(%rdi)
4268 ; AVX512VL-NEXT: vmovd %xmm2, %eax
4269 ; AVX512VL-NEXT: movw %ax, 10(%rdi)
4270 ; AVX512VL-NEXT: vmovd %xmm0, %eax
4271 ; AVX512VL-NEXT: movw %ax, 6(%rdi)
4272 ; AVX512VL-NEXT: vmovd %xmm3, %eax
4273 ; AVX512VL-NEXT: movw %ax, 4(%rdi)
4274 ; AVX512VL-NEXT: vmovd %xmm4, %eax
4275 ; AVX512VL-NEXT: movw %ax, 2(%rdi)
4276 ; AVX512VL-NEXT: vzeroupper
4277 ; AVX512VL-NEXT: retq
3019 ; AVX512-LABEL: store_cvt_16f32_to_16i16:
3020 ; AVX512: # BB#0:
3021 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
3022 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm2
3023 ; AVX512-NEXT: vextractf128 $1, %ymm2, %xmm3
3024 ; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm4
3025 ; AVX512-NEXT: vmovd %xmm4, %eax
3026 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm4
3027 ; AVX512-NEXT: movw %ax, 24(%rdi)
3028 ; AVX512-NEXT: vmovd %xmm4, %eax
3029 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm4
3030 ; AVX512-NEXT: movw %ax, 16(%rdi)
3031 ; AVX512-NEXT: vmovd %xmm4, %eax
3032 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm4
3033 ; AVX512-NEXT: movw %ax, 8(%rdi)
3034 ; AVX512-NEXT: vmovd %xmm4, %eax
3035 ; AVX512-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
3036 ; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4
3037 ; AVX512-NEXT: movw %ax, (%rdi)
3038 ; AVX512-NEXT: vmovd %xmm4, %eax
3039 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
3040 ; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4
3041 ; AVX512-NEXT: movw %ax, 30(%rdi)
3042 ; AVX512-NEXT: vmovd %xmm4, %eax
3043 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
3044 ; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4
3045 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
3046 ; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
3047 ; AVX512-NEXT: movw %ax, 28(%rdi)
3048 ; AVX512-NEXT: vmovd %xmm3, %eax
3049 ; AVX512-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[3,1,2,3]
3050 ; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
3051 ; AVX512-NEXT: movw %ax, 26(%rdi)
3052 ; AVX512-NEXT: vmovd %xmm3, %eax
3053 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
3054 ; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
3055 ; AVX512-NEXT: movw %ax, 22(%rdi)
3056 ; AVX512-NEXT: vmovd %xmm3, %eax
3057 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
3058 ; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
3059 ; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
3060 ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
3061 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
3062 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
3063 ; AVX512-NEXT: movw %ax, 20(%rdi)
3064 ; AVX512-NEXT: vmovd %xmm2, %eax
3065 ; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
3066 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
3067 ; AVX512-NEXT: movw %ax, 18(%rdi)
3068 ; AVX512-NEXT: vmovd %xmm2, %eax
3069 ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
3070 ; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
3071 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
3072 ; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
3073 ; AVX512-NEXT: movw %ax, 14(%rdi)
3074 ; AVX512-NEXT: vmovd %xmm1, %eax
3075 ; AVX512-NEXT: movw %ax, 12(%rdi)
3076 ; AVX512-NEXT: vmovd %xmm2, %eax
3077 ; AVX512-NEXT: movw %ax, 10(%rdi)
3078 ; AVX512-NEXT: vmovd %xmm0, %eax
3079 ; AVX512-NEXT: movw %ax, 6(%rdi)
3080 ; AVX512-NEXT: vmovd %xmm3, %eax
3081 ; AVX512-NEXT: movw %ax, 4(%rdi)
3082 ; AVX512-NEXT: vmovd %xmm4, %eax
3083 ; AVX512-NEXT: movw %ax, 2(%rdi)
3084 ; AVX512-NEXT: vzeroupper
3085 ; AVX512-NEXT: retq
42783086 %1 = fptrunc <16 x float> %a0 to <16 x half>
42793087 %2 = bitcast <16 x half> %1 to <16 x i16>
42803088 store <16 x i16> %2, <16 x i16>* %a1
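The opposite direction follows the same pattern. As a hedged sketch under the same assumed RUN setup, an i16-to-float extension that should now select the xmm-width vcvtph2ps with plain +avx512f:

; Scalar half -> float extension. Expected to lower to a single
; VEX vcvtph2ps once AVX512F implies F16C (sketch; names are
; hypothetical, not taken from the test file).
define float @cvt_i16_to_f32_sketch(i16 %a0) nounwind {
  %1 = bitcast i16 %a0 to half
  %2 = fpext half %1 to float
  ret float %2
}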