llvm.org GIT mirror llvm / 1b23a75
[X86] Don't use RCP14 and RSQRT14 for reciprocal estimations or for legacy SSE rcp/rsqrt intrinsics when AVX512 features are enabled. Summary: AVX512 added RCP14 and RSQRT14 instructions, which improve accuracy over the legacy RCP and RSQRT instructions, but not by enough to remove the need for a Newton-Raphson refinement. Currently we use these new instructions for the legacy packed SSE intrinsics, but not for the scalar intrinsics. We also use them for the fast-math optimization of division and reciprocal sqrt. I think switching the legacy intrinsics may be surprising to the user, since it changes the answer based on which processor you're using regardless of any fast-math settings. It's also weird that we did something different between scalar and packed. As for the reciprocal estimation, I think it creates unnecessary deltas in our output behavior (and prevents EVEX->VEX). A little playing around with gcc and icc on godbolt suggests they don't change which instructions they use here. This patch adds new X86ISD nodes for RCP14/RSQRT14 and uses those for the new intrinsics, leaving the old intrinsics to use the old instructions. Going forward I think our focus should be on: -Supporting 512-bit vectors, which will have to use RCP14/RSQRT14. -Using RSQRT28/RCP28 to remove the Newton-Raphson step on processors with AVX512ER. -Supporting double precision. Reviewers: zvi, DavidKreitzer, RKSimon Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D39583 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317413 91177308-0d34-0410-b5e6-96231b3b80d8 Craig Topper 2 years ago
12 changed file(s) with 71 addition(s) and 85 deletion(s). Raw diff Collapse all Expand all
2480724807 case X86ISD::FMAXC: return "X86ISD::FMAXC";
2480824808 case X86ISD::FMINC: return "X86ISD::FMINC";
2480924809 case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
24810 case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS";
2481124810 case X86ISD::FRCP: return "X86ISD::FRCP";
24812 case X86ISD::FRCPS: return "X86ISD::FRCPS";
2481324811 case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
2481424812 case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
2481524813 case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
2498724985 case X86ISD::SELECT: return "X86ISD::SELECT";
2498824986 case X86ISD::SELECTS: return "X86ISD::SELECTS";
2498924987 case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
24988 case X86ISD::RCP14: return "X86ISD::RCP14";
24989 case X86ISD::RCP14S: return "X86ISD::RCP14S";
2499024990 case X86ISD::RCP28: return "X86ISD::RCP28";
2499124991 case X86ISD::RCP28S: return "X86ISD::RCP28S";
2499224992 case X86ISD::EXP2: return "X86ISD::EXP2";
24993 case X86ISD::RSQRT14: return "X86ISD::RSQRT14";
24994 case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S";
2499324995 case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
2499424996 case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
2499524997 case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
253253 /// Note that these typically require refinement
254254 /// in order to obtain suitable precision.
255255 FRSQRT, FRCP,
256 FRSQRTS, FRCPS,
256
257 // AVX-512 reciprocal approximations with a little more precision.
258 RSQRT14, RSQRT14S, RCP14, RCP14S,
257259
258260 // Thread Local Storage.
259261 TLSADDR,
73617361 }
73627362 }
73637363
7364 defm VRCP14SS : avx512_fp14_s<0x4D, "vrcp14ss", X86frcp14s, f32x_info>,
7364 defm VRCP14SS : avx512_fp14_s<0x4D, "vrcp14ss", X86rcp14s, f32x_info>,
73657365 EVEX_CD8<32, CD8VT1>, T8PD, NotMemoryFoldable;
7366 defm VRCP14SD : avx512_fp14_s<0x4D, "vrcp14sd", X86frcp14s, f64x_info>,
7366 defm VRCP14SD : avx512_fp14_s<0x4D, "vrcp14sd", X86rcp14s, f64x_info>,
73677367 VEX_W, EVEX_CD8<64, CD8VT1>, T8PD, NotMemoryFoldable;
7368 defm VRSQRT14SS : avx512_fp14_s<0x4F, "vrsqrt14ss", X86frsqrt14s, f32x_info>,
7368 defm VRSQRT14SS : avx512_fp14_s<0x4F, "vrsqrt14ss", X86rsqrt14s, f32x_info>,
73697369 EVEX_CD8<32, CD8VT1>, T8PD, NotMemoryFoldable;
7370 defm VRSQRT14SD : avx512_fp14_s<0x4F, "vrsqrt14sd", X86frsqrt14s, f64x_info>,
7370 defm VRSQRT14SD : avx512_fp14_s<0x4F, "vrsqrt14sd", X86rsqrt14s, f64x_info>,
73717371 VEX_W, EVEX_CD8<64, CD8VT1>, T8PD, NotMemoryFoldable;
73727372
73737373 /// avx512_fp14_p rcp14ps, rcp14pd, rsqrt14ps, rsqrt14pd
74137413 }
74147414 }
74157415
7416 defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86frsqrt>;
7417 defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86frcp>;
7416 defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86rsqrt14>;
7417 defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86rcp14>;
74187418
74197419 /// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd
74207420 multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
5555 def X86fandn : SDNode<"X86ISD::FANDN", SDTFPBinOp>;
5656 def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>;
5757 def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>;
58 def X86frsqrt14s: SDNode<"X86ISD::FRSQRTS", SDTFPBinOp>;
59 def X86frcp14s : SDNode<"X86ISD::FRCPS", SDTFPBinOp>;
6058 def X86fhadd : SDNode<"X86ISD::FHADD", SDTFPBinOp>;
6159 def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>;
6260 def X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>;
497495 def x86vpmadd52l : SDNode<"X86ISD::VPMADD52L", SDTIFma, [SDNPCommutative]>;
498496 def x86vpmadd52h : SDNode<"X86ISD::VPMADD52H", SDTIFma, [SDNPCommutative]>;
499497
498 def X86rsqrt14 : SDNode<"X86ISD::RSQRT14", SDTFPUnaryOp>;
499 def X86rcp14 : SDNode<"X86ISD::RCP14", SDTFPUnaryOp>;
500500 def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", SDTFPUnaryOpRound>;
501501 def X86rcp28 : SDNode<"X86ISD::RCP28", SDTFPUnaryOpRound>;
502502 def X86exp2 : SDNode<"X86ISD::EXP2", SDTFPUnaryOpRound>;
503503
504 def X86rsqrt14s : SDNode<"X86ISD::RSQRT14S", SDTFPBinOp>;
505 def X86rcp14s : SDNode<"X86ISD::RCP14S", SDTFPBinOp>;
504506 def X86rsqrt28s : SDNode<"X86ISD::RSQRT28S", SDTFPBinOpRound>;
505507 def X86rcp28s : SDNode<"X86ISD::RCP28S", SDTFPBinOpRound>;
506508 def X86RndScales : SDNode<"X86ISD::VRNDSCALES", SDTFPBinOpImmRound>;
32513251 // Reciprocal approximations. Note that these typically require refinement
32523252 // in order to obtain suitable precision.
32533253 defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>,
3254 sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS, [HasAVX, NoVLX] >;
3254 sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS, [HasAVX]>;
32553255 defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS>,
3256 sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP, [HasAVX, NoVLX]>;
3256 sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP, [HasAVX]>;
32573257
32583258 // There is no f64 version of the reciprocal approximation instructions.
32593259
14271427 X86_INTRINSIC_DATA(avx512_ptestnm_w_128, CMP_MASK, X86ISD::TESTNM, 0),
14281428 X86_INTRINSIC_DATA(avx512_ptestnm_w_256, CMP_MASK, X86ISD::TESTNM, 0),
14291429 X86_INTRINSIC_DATA(avx512_ptestnm_w_512, CMP_MASK, X86ISD::TESTNM, 0),
1430 X86_INTRINSIC_DATA(avx512_rcp14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
1431 X86_INTRINSIC_DATA(avx512_rcp14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
1432 X86_INTRINSIC_DATA(avx512_rcp14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
1433 X86_INTRINSIC_DATA(avx512_rcp14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
1434 X86_INTRINSIC_DATA(avx512_rcp14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
1435 X86_INTRINSIC_DATA(avx512_rcp14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
1436 X86_INTRINSIC_DATA(avx512_rcp14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRCPS, 0),
1437 X86_INTRINSIC_DATA(avx512_rcp14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRCPS, 0),
1430 X86_INTRINSIC_DATA(avx512_rcp14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
1431 X86_INTRINSIC_DATA(avx512_rcp14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
1432 X86_INTRINSIC_DATA(avx512_rcp14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
1433 X86_INTRINSIC_DATA(avx512_rcp14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
1434 X86_INTRINSIC_DATA(avx512_rcp14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
1435 X86_INTRINSIC_DATA(avx512_rcp14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
1436 X86_INTRINSIC_DATA(avx512_rcp14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::RCP14S, 0),
1437 X86_INTRINSIC_DATA(avx512_rcp14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::RCP14S, 0),
14381438 X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0),
14391439 X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0),
14401440 X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28S, 0),
14411441 X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28S, 0),
1442 X86_INTRINSIC_DATA(avx512_rsqrt14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
1443 X86_INTRINSIC_DATA(avx512_rsqrt14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
1444 X86_INTRINSIC_DATA(avx512_rsqrt14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
1445 X86_INTRINSIC_DATA(avx512_rsqrt14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
1446 X86_INTRINSIC_DATA(avx512_rsqrt14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
1447 X86_INTRINSIC_DATA(avx512_rsqrt14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
1448 X86_INTRINSIC_DATA(avx512_rsqrt14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRTS, 0),
1449 X86_INTRINSIC_DATA(avx512_rsqrt14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRTS, 0),
1442 X86_INTRINSIC_DATA(avx512_rsqrt14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
1443 X86_INTRINSIC_DATA(avx512_rsqrt14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
1444 X86_INTRINSIC_DATA(avx512_rsqrt14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
1445 X86_INTRINSIC_DATA(avx512_rsqrt14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
1446 X86_INTRINSIC_DATA(avx512_rsqrt14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
1447 X86_INTRINSIC_DATA(avx512_rsqrt14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
1448 X86_INTRINSIC_DATA(avx512_rsqrt14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::RSQRT14S, 0),
1449 X86_INTRINSIC_DATA(avx512_rsqrt14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::RSQRT14S, 0),
14501450 X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
14511451 X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
14521452 X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28S, 0),
580580
581581
582582 define <8 x float> @test_x86_avx_rcp_ps_256(<8 x float> %a0) {
583 ; AVX-LABEL: test_x86_avx_rcp_ps_256:
584 ; AVX: # BB#0:
585 ; AVX-NEXT: vrcpps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x53,0xc0]
586 ; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
587 ;
588 ; AVX512VL-LABEL: test_x86_avx_rcp_ps_256:
589 ; AVX512VL: # BB#0:
590 ; AVX512VL-NEXT: vrcp14ps %ymm0, %ymm0 # encoding: [0x62,0xf2,0x7d,0x28,0x4c,0xc0]
591 ; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
583 ; CHECK-LABEL: test_x86_avx_rcp_ps_256:
584 ; CHECK: # BB#0:
585 ; CHECK-NEXT: vrcpps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x53,0xc0]
586 ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
592587 %res = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0) ; <<8 x float>> [#uses=1]
593588 ret <8 x float> %res
594589 }
618613
619614
620615 define <8 x float> @test_x86_avx_rsqrt_ps_256(<8 x float> %a0) {
621 ; AVX-LABEL: test_x86_avx_rsqrt_ps_256:
622 ; AVX: # BB#0:
623 ; AVX-NEXT: vrsqrtps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x52,0xc0]
624 ; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
625 ;
626 ; AVX512VL-LABEL: test_x86_avx_rsqrt_ps_256:
627 ; AVX512VL: # BB#0:
628 ; AVX512VL-NEXT: vrsqrt14ps %ymm0, %ymm0 # encoding: [0x62,0xf2,0x7d,0x28,0x4e,0xc0]
629 ; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
616 ; CHECK-LABEL: test_x86_avx_rsqrt_ps_256:
617 ; CHECK: # BB#0:
618 ; CHECK-NEXT: vrsqrtps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x52,0xc0]
619 ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
630620 %res = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0) ; <<8 x float>> [#uses=1]
631621 ret <8 x float> %res
632622 }
39813981 ;
39823982 ; SKX-LABEL: test_rcpps:
39833983 ; SKX: # BB#0:
3984 ; SKX-NEXT: vrcp14ps %ymm0, %ymm0 # sched: [4:1.00]
3985 ; SKX-NEXT: vrcp14ps (%rdi), %ymm1 # sched: [11:1.00]
3984 ; SKX-NEXT: vrcpps %ymm0, %ymm0 # sched: [4:1.00]
3985 ; SKX-NEXT: vrcpps (%rdi), %ymm1 # sched: [11:1.00]
39863986 ; SKX-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
39873987 ; SKX-NEXT: retq # sched: [7:1.00]
39883988 ;
41734173 ;
41744174 ; SKX-LABEL: test_rsqrtps:
41754175 ; SKX: # BB#0:
4176 ; SKX-NEXT: vrsqrt14ps %ymm0, %ymm0 # sched: [4:1.00]
4177 ; SKX-NEXT: vrsqrt14ps (%rdi), %ymm1 # sched: [11:1.00]
4176 ; SKX-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [4:1.00]
4177 ; SKX-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [11:1.00]
41784178 ; SKX-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
41794179 ; SKX-NEXT: retq # sched: [7:1.00]
41804180 ;
415415 ;
416416 ; SKX-LABEL: v4f32_one_step:
417417 ; SKX: # BB#0:
418 ; SKX-NEXT: vrcp14ps %xmm0, %xmm1 # sched: [4:1.00]
418 ; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00]
419419 ; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 # sched: [10:0.50]
420420 ; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
421421 ; SKX-NEXT: retq # sched: [7:1.00]
532532 ;
533533 ; SKX-LABEL: v4f32_two_step:
534534 ; SKX: # BB#0:
535 ; SKX-NEXT: vrcp14ps %xmm0, %xmm1 # sched: [4:1.00]
535 ; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00]
536536 ; SKX-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
537537 ; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
538538 ; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [4:0.33]
690690 ;
691691 ; SKX-LABEL: v8f32_one_step:
692692 ; SKX: # BB#0:
693 ; SKX-NEXT: vrcp14ps %ymm0, %ymm1 # sched: [4:1.00]
693 ; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00]
694694 ; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 # sched: [11:0.50]
695695 ; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [4:0.33]
696696 ; SKX-NEXT: retq # sched: [7:1.00]
820820 ;
821821 ; SKX-LABEL: v8f32_two_step:
822822 ; SKX: # BB#0:
823 ; SKX-NEXT: vrcp14ps %ymm0, %ymm1 # sched: [4:1.00]
823 ; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00]
824824 ; SKX-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
825825 ; SKX-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
826826 ; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [4:0.33]
477477 ;
478478 ; SKX-LABEL: v4f32_one_step2:
479479 ; SKX: # BB#0:
480 ; SKX-NEXT: vrcp14ps %xmm0, %xmm1 # sched: [4:1.00]
480 ; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00]
481481 ; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 # sched: [10:0.50]
482482 ; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
483483 ; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
579579 ;
580580 ; SKX-LABEL: v4f32_one_step_2_divs:
581581 ; SKX: # BB#0:
582 ; SKX-NEXT: vrcp14ps %xmm0, %xmm1 # sched: [4:1.00]
582 ; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00]
583583 ; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 # sched: [10:0.50]
584584 ; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
585585 ; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:0.50]
707707 ;
708708 ; SKX-LABEL: v4f32_two_step2:
709709 ; SKX: # BB#0:
710 ; SKX-NEXT: vrcp14ps %xmm0, %xmm1 # sched: [4:1.00]
710 ; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00]
711711 ; SKX-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
712712 ; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
713713 ; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [4:0.33]
813813 ;
814814 ; SKX-LABEL: v8f32_one_step2:
815815 ; SKX: # BB#0:
816 ; SKX-NEXT: vrcp14ps %ymm0, %ymm1 # sched: [4:1.00]
816 ; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00]
817817 ; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 # sched: [11:0.50]
818818 ; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [4:0.33]
819819 ; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [11:0.50]
924924 ;
925925 ; SKX-LABEL: v8f32_one_step_2_divs:
926926 ; SKX: # BB#0:
927 ; SKX-NEXT: vrcp14ps %ymm0, %ymm1 # sched: [4:1.00]
927 ; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00]
928928 ; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 # sched: [11:0.50]
929929 ; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [4:0.33]
930930 ; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [11:0.50]
10661066 ;
10671067 ; SKX-LABEL: v8f32_two_step2:
10681068 ; SKX: # BB#0:
1069 ; SKX-NEXT: vrcp14ps %ymm0, %ymm1 # sched: [4:1.00]
1069 ; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00]
10701070 ; SKX-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
10711071 ; SKX-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
10721072 ; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [4:0.33]
11231123 ;
11241124 ; SKX-LABEL: v8f32_no_step:
11251125 ; SKX: # BB#0:
1126 ; SKX-NEXT: vrcp14ps %ymm0, %ymm0 # sched: [4:1.00]
1126 ; SKX-NEXT: vrcpps %ymm0, %ymm0 # sched: [4:1.00]
11271127 ; SKX-NEXT: retq # sched: [7:1.00]
11281128 %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
11291129 ret <8 x float> %div
11821182 ;
11831183 ; SKX-LABEL: v8f32_no_step2:
11841184 ; SKX: # BB#0:
1185 ; SKX-NEXT: vrcp14ps %ymm0, %ymm0 # sched: [4:1.00]
1185 ; SKX-NEXT: vrcpps %ymm0, %ymm0 # sched: [4:1.00]
11861186 ; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [11:0.50]
11871187 ; SKX-NEXT: retq # sched: [7:1.00]
11881188 %div = fdiv fast <8 x float> , %x
400400 ; SSE-NEXT: rcpps %xmm0, %xmm0 ## encoding: [0x0f,0x53,0xc0]
401401 ; SSE-NEXT: retl ## encoding: [0xc3]
402402 ;
403 ; AVX2-LABEL: test_x86_sse_rcp_ps:
404 ; AVX2: ## BB#0:
405 ; AVX2-NEXT: vrcpps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x53,0xc0]
406 ; AVX2-NEXT: retl ## encoding: [0xc3]
407 ;
408 ; SKX-LABEL: test_x86_sse_rcp_ps:
409 ; SKX: ## BB#0:
410 ; SKX-NEXT: vrcp14ps %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x4c,0xc0]
411 ; SKX-NEXT: retl ## encoding: [0xc3]
403 ; VCHECK-LABEL: test_x86_sse_rcp_ps:
404 ; VCHECK: ## BB#0:
405 ; VCHECK-NEXT: vrcpps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x53,0xc0]
406 ; VCHECK-NEXT: retl ## encoding: [0xc3]
412407 %res = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
413408 ret <4 x float> %res
414409 }
437432 ; SSE-NEXT: rsqrtps %xmm0, %xmm0 ## encoding: [0x0f,0x52,0xc0]
438433 ; SSE-NEXT: retl ## encoding: [0xc3]
439434 ;
440 ; AVX2-LABEL: test_x86_sse_rsqrt_ps:
441 ; AVX2: ## BB#0:
442 ; AVX2-NEXT: vrsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x52,0xc0]
443 ; AVX2-NEXT: retl ## encoding: [0xc3]
444 ;
445 ; SKX-LABEL: test_x86_sse_rsqrt_ps:
446 ; SKX: ## BB#0:
447 ; SKX-NEXT: vrsqrt14ps %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x4e,0xc0]
448 ; SKX-NEXT: retl ## encoding: [0xc3]
435 ; VCHECK-LABEL: test_x86_sse_rsqrt_ps:
436 ; VCHECK: ## BB#0:
437 ; VCHECK-NEXT: vrsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x52,0xc0]
438 ; VCHECK-NEXT: retl ## encoding: [0xc3]
449439 %res = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
450440 ret <4 x float> %res
451441 }
25462546 ;
25472547 ; SKX-LABEL: test_rcpps:
25482548 ; SKX: # BB#0:
2549 ; SKX-NEXT: vrcp14ps %xmm0, %xmm0 # sched: [4:1.00]
2550 ; SKX-NEXT: vrcp14ps (%rdi), %xmm1 # sched: [10:1.00]
2549 ; SKX-NEXT: vrcpps %xmm0, %xmm0 # sched: [4:1.00]
2550 ; SKX-NEXT: vrcpps (%rdi), %xmm1 # sched: [10:1.00]
25512551 ; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
25522552 ; SKX-NEXT: retq # sched: [7:1.00]
25532553 ;
27182718 ;
27192719 ; SKX-LABEL: test_rsqrtps:
27202720 ; SKX: # BB#0:
2721 ; SKX-NEXT: vrsqrt14ps %xmm0, %xmm0 # sched: [4:1.00]
2722 ; SKX-NEXT: vrsqrt14ps (%rdi), %xmm1 # sched: [10:1.00]
2721 ; SKX-NEXT: vrsqrtps %xmm0, %xmm0 # sched: [4:1.00]
2722 ; SKX-NEXT: vrsqrtps (%rdi), %xmm1 # sched: [10:1.00]
27232723 ; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
27242724 ; SKX-NEXT: retq # sched: [7:1.00]
27252725 ;