commit b34ced6 (llvm git mirror)
Author: Craig Topper

[X86] Add load folding isel patterns to scalar_math_patterns and AVX512_scalar_math_fp_patterns. Also add a FIXME for the peephole pass not being able to handle this.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@363032 91177308-0d34-0410-b5e6-96231b3b80d8

5 changed files with 64 additions and 45 deletions.
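The patch teaches instruction selection to fold a scalar load directly into the intrinsic (*_Int) forms of the SSE/AVX/AVX-512 scalar math instructions, rather than leaving the fold to the later peephole pass; the updated RUN lines below add -disable-peephole so the tests exercise the new patterns. A minimal sketch of the kind of IR this affects (the function and value names here are illustrative, not taken from the commit's tests): one operand of the scalar op comes from memory and the result is moved back into element 0 of the source vector.

  define <4 x float> @add_ss_load(<4 x float> %a, float* %p) {
    ; Load a scalar operand, apply the op to element 0, and insert the
    ; result back into lane 0; this is the shape the new rm_Int/Zrm_Int
    ; patterns match.
    %b = load float, float* %p
    %a0 = extractelement <4 x float> %a, i32 0
    %r = fadd float %a0, %b
    %v = insertelement <4 x float> %a, float %r, i32 0
    ret <4 x float> %v
  }

On x86-64 with SSE2 this should now select addss with a memory operand (something like addss (%rdi), %xmm0) even under -disable-peephole, instead of a separate movss load followed by a register-register addss.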
@@ -11870 +11870 @@
                              _.FRC:$src)))),
               (!cast<Instruction>("V"#OpcPrefix#Zrr_Int) _.VT:$dst,
                (_.VT (COPY_TO_REGCLASS _.FRC:$src, VR128X)))>;
+    def : Pat<(MoveNode
+               (_.VT VR128X:$dst),
+               (_.VT (scalar_to_vector
+                      (Op (_.EltVT (extractelt (_.VT VR128X:$dst), (iPTR 0))),
+                          (_.ScalarLdFrag addr:$src))))),
+              (!cast<Instruction>("V"#OpcPrefix#Zrm_Int) _.VT:$dst, addr:$src)>;
 
     // extracted masked scalar math op with insert via movss
     def : Pat<(MoveNode (_.VT VR128X:$src1),
@@ -11883 +11889 @@
                (_.VT (COPY_TO_REGCLASS _.FRC:$src0, VR128X)),
                VK1WM:$mask, _.VT:$src1,
                (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)))>;
+    def : Pat<(MoveNode (_.VT VR128X:$src1),
+               (scalar_to_vector
+                (X86selects VK1WM:$mask,
+                            (Op (_.EltVT
+                                 (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+                                (_.ScalarLdFrag addr:$src2)),
+                            _.FRC:$src0))),
+              (!cast<Instruction>("V"#OpcPrefix#Zrm_Intk)
+               (_.VT (COPY_TO_REGCLASS _.FRC:$src0, VR128X)),
+               VK1WM:$mask, _.VT:$src1, addr:$src2)>;
 
     // extracted masked scalar math op with insert via movss
     def : Pat<(MoveNode (_.VT VR128X:$src1),
@@ -11894 +11910 @@
               (!cast<Instruction>("V"#OpcPrefix#Zrr_Intkz)
                VK1WM:$mask, _.VT:$src1,
                (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)))>;
+    def : Pat<(MoveNode (_.VT VR128X:$src1),
+               (scalar_to_vector
+                (X86selects VK1WM:$mask,
+                            (Op (_.EltVT
+                                 (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+                                (_.ScalarLdFrag addr:$src2)), (_.EltVT ZeroFP)))),
+              (!cast<Instruction>("V"#OpcPrefix#Zrm_Intkz) VK1WM:$mask, _.VT:$src1, addr:$src2)>;
   }
 }
 
@@ -4684 +4684 @@
                                                    &RI, MF);
     unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
     if (Size < RCSize) {
+      // FIXME: Allow scalar intrinsic instructions like ADDSSrm_Int.
       // Check if it's safe to fold the load. If the size of the object is
       // narrower than the load width, then it's not.
       if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
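The FIXME marks the limitation in the stack-slot folding path that the new isel patterns work around: when the object being folded is narrower than the register class (for example a 4-byte float slot feeding a 128-bit *_Int instruction), the Size < RCSize check above rejects the fold even though the scalar instruction only reads the low element. That is why, in the 32-bit blend_* checks further down, the float argument used to be reloaded with movss and then added with a register-register addss; with the load folded during selection the same code now becomes a single addss from the stack. A reduced sketch of that shape (illustrative names, compiled for a 32-bit x86 target so %b lives in a 4-byte stack slot):

  define <4 x float> @add_ss_stack_arg(<4 x float> %a, float %b) {
    ; %b is a stack argument on 32-bit x86. The peephole pass could not
    ; fold its reload into addss because of the size check above, so llc
    ; used to emit movss + addss; isel now folds the load itself, giving
    ; something like: addss {{[0-9]+}}(%esp), %xmm0
    %a0 = extractelement <4 x float> %a, i32 0
    %r = fadd float %a0, %b
    %v = insertelement <4 x float> %a, float %r, i32 0
    ret <4 x float> %v
  }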
@@ -2691 +2691 @@
 // patterns we have to try to match.
 multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move,
                                 ValueType VT, ValueType EltTy,
-                                RegisterClass RC, Predicate BasePredicate> {
+                                RegisterClass RC, PatFrag ld_frag,
+                                Predicate BasePredicate> {
   let Predicates = [BasePredicate] in {
     // extracted scalar math op with insert via movss/movsd
     def : Pat<(VT (Move (VT VR128:$dst),
@@ -2700 +2701 @@
                                  RC:$src))))),
               (!cast<Instruction>(OpcPrefix#rr_Int) VT:$dst,
                (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
+    def : Pat<(VT (Move (VT VR128:$dst),
+                        (VT (scalar_to_vector
+                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
+                                 (ld_frag addr:$src)))))),
+              (!cast<Instruction>(OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
   }
 
   // Repeat for AVX versions of the instructions.
@@ -2711 +2717 @@
                                  RC:$src))))),
               (!cast<Instruction>("V"#OpcPrefix#rr_Int) VT:$dst,
                (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
+    def : Pat<(VT (Move (VT VR128:$dst),
+                        (VT (scalar_to_vector
+                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
+                                 (ld_frag addr:$src)))))),
+              (!cast<Instruction>("V"#OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
   }
 }
 
-defm : scalar_math_patterns<fadd, "ADDSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
-defm : scalar_math_patterns<fsub, "SUBSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
-defm : scalar_math_patterns<fmul, "MULSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
-defm : scalar_math_patterns<fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
-
-defm : scalar_math_patterns<fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
-defm : scalar_math_patterns<fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
-defm : scalar_math_patterns<fmul, "MULSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
-defm : scalar_math_patterns<fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
+defm : scalar_math_patterns<fadd, "ADDSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
+defm : scalar_math_patterns<fsub, "SUBSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
+defm : scalar_math_patterns<fmul, "MULSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
+defm : scalar_math_patterns<fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
+
+defm : scalar_math_patterns<fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
+defm : scalar_math_patterns<fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
+defm : scalar_math_patterns<fmul, "MULSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
+defm : scalar_math_patterns<fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
 
 /// Unop Arithmetic
 /// In addition, we also have a special variant of the scalar form here to
@@ -0 +0 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse2 < %s | FileCheck %s --check-prefix=SSE
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx512f < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX512
+; RUN: llc -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse2 < %s | FileCheck %s --check-prefix=SSE
+; RUN: llc -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX512
 
 ; Verify that we're folding the load into the math instruction.
 ; This pattern is generated out of the simplest intrinsics usage:
@@ -413 +413 @@
 define <4 x float> @blend_add_ss(<4 x float> %a, float %b) {
 ; X86-SSE-LABEL: blend_add_ss:
 ; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: addss %xmm1, %xmm0
+; X86-SSE-NEXT: addss {{[0-9]+}}(%esp), %xmm0
 ; X86-SSE-NEXT: retl
 ;
 ; X86-AVX-LABEL: blend_add_ss:
 ; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vaddss {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; X86-AVX-NEXT: retl
 ;
 ; X64-SSE-LABEL: blend_add_ss:
@@ -443 +441 @@
 define <4 x float> @blend_sub_ss(<4 x float> %a, float %b) {
 ; X86-SSE-LABEL: blend_sub_ss:
 ; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: subss %xmm1, %xmm0
+; X86-SSE-NEXT: subss {{[0-9]+}}(%esp), %xmm0
 ; X86-SSE-NEXT: retl
 ;
 ; X86-AVX-LABEL: blend_sub_ss:
 ; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vsubss {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; X86-AVX-NEXT: retl
 ;
 ; X64-SSE-LABEL: blend_sub_ss:
@@ -473 +469 @@
 define <4 x float> @blend_mul_ss(<4 x float> %a, float %b) {
 ; X86-SSE-LABEL: blend_mul_ss:
 ; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: mulss %xmm1, %xmm0
+; X86-SSE-NEXT: mulss {{[0-9]+}}(%esp), %xmm0
 ; X86-SSE-NEXT: retl
 ;
 ; X86-AVX-LABEL: blend_mul_ss:
 ; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmulss {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; X86-AVX-NEXT: retl
 ;
 ; X64-SSE-LABEL: blend_mul_ss:
@@ -503 +497 @@
 define <4 x float> @blend_div_ss(<4 x float> %a, float %b) {
 ; X86-SSE-LABEL: blend_div_ss:
 ; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: divss %xmm1, %xmm0
+; X86-SSE-NEXT: divss {{[0-9]+}}(%esp), %xmm0
 ; X86-SSE-NEXT: retl
 ;
 ; X86-AVX-LABEL: blend_div_ss:
 ; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vdivss {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; X86-AVX-NEXT: retl
 ;
 ; X64-SSE-LABEL: blend_div_ss:
@@ -533 +525 @@
 define <2 x double> @blend_add_sd(<2 x double> %a, double %b) {
 ; X86-SSE-LABEL: blend_add_sd:
 ; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; X86-SSE-NEXT: addsd %xmm1, %xmm0
+; X86-SSE-NEXT: addsd {{[0-9]+}}(%esp), %xmm0
 ; X86-SSE-NEXT: retl
 ;
 ; X86-AVX-LABEL: blend_add_sd:
 ; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vaddsd {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; X86-AVX-NEXT: retl
 ;
 ; X64-SSE-LABEL: blend_add_sd:
@@ -563 +553 @@
 define <2 x double> @blend_sub_sd(<2 x double> %a, double %b) {
 ; X86-SSE-LABEL: blend_sub_sd:
 ; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; X86-SSE-NEXT: subsd %xmm1, %xmm0
+; X86-SSE-NEXT: subsd {{[0-9]+}}(%esp), %xmm0
 ; X86-SSE-NEXT: retl
 ;
 ; X86-AVX-LABEL: blend_sub_sd:
 ; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vsubsd {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; X86-AVX-NEXT: retl
 ;
 ; X64-SSE-LABEL: blend_sub_sd:
@@ -593 +581 @@
 define <2 x double> @blend_mul_sd(<2 x double> %a, double %b) {
 ; X86-SSE-LABEL: blend_mul_sd:
 ; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; X86-SSE-NEXT: mulsd %xmm1, %xmm0
+; X86-SSE-NEXT: mulsd {{[0-9]+}}(%esp), %xmm0
 ; X86-SSE-NEXT: retl
 ;
 ; X86-AVX-LABEL: blend_mul_sd:
 ; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmulsd {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; X86-AVX-NEXT: retl
 ;
 ; X64-SSE-LABEL: blend_mul_sd:
@@ -623 +609 @@
 define <2 x double> @blend_div_sd(<2 x double> %a, double %b) {
 ; X86-SSE-LABEL: blend_div_sd:
 ; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; X86-SSE-NEXT: divsd %xmm1, %xmm0
+; X86-SSE-NEXT: divsd {{[0-9]+}}(%esp), %xmm0
 ; X86-SSE-NEXT: retl
 ;
 ; X86-AVX-LABEL: blend_div_sd:
 ; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vdivsd {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; X86-AVX-NEXT: retl
 ;
 ; X64-SSE-LABEL: blend_div_sd: