llvm.org GIT mirror: llvm / 3f74f4a
[X86] Convert f32/f64 FANDN/FAND/FOR/FXOR to vector logic ops and scalar_to_vector/extract_vector_elts to reduce isel patterns.

Previously we did the equivalent operation in isel patterns, using COPY_TO_REGCLASS operations to transition between the scalar and vector register classes. By inserting scalar_to_vectors and extract_vector_elts before isel, we let each piece be selected individually and accomplish the same final result.

Ideally we'd use vector operations even earlier, in lowering/combine, but that looks to be more difficult.

The scalar-fp-to-i64.ll changes occur because we have a pattern that uses movlpd for store+extract_vector_elt, while a plain f64 store uses movsd. The encoding sizes are the same.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@362914 91177308-0d34-0410-b5e6-96231b3b80d8

Craig Topper, 4 months ago
5 changed files with 48 additions and 143 deletions.
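For context, a minimal sketch of the kind of scalar code that produces these nodes (illustrative only, not part of this commit; the function names are made up):

// Scalar FP sign-bit idioms lower to X86ISD::FAND/FANDN/FOR/FXOR nodes with
// constant-pool masks. This change widens those f32/f64 nodes to v4f32/v2f64
// (integer vector logic when SSE2 is available) before instruction selection.
#include <cmath>

float  negate(float x)    { return -x; }            // FXOR with the sign-bit mask
double magnitude(double x) { return std::fabs(x); } // FAND clearing the sign bit
double with_sign(double x, double y) {
  return std::copysign(x, y);                       // FAND/FOR combining sign bits
}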
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -834,6 +834,49 @@
                                     N->getValueType(0),
                                     N->getOperand(0),
                                     CurDAG->getConstant(Imm, dl, MVT::i8));
+      --I;
+      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
+      ++I;
+      CurDAG->DeleteNode(N);
+      continue;
+    }
+    case X86ISD::FANDN:
+    case X86ISD::FAND:
+    case X86ISD::FOR:
+    case X86ISD::FXOR: {
+      // Widen scalar fp logic ops to vector to reduce isel patterns.
+      // FIXME: Can we do this during lowering/combine.
+      MVT VT = N->getSimpleValueType(0);
+      if (VT.isVector() || VT == MVT::f128)
+        break;
+
+      MVT VecVT = VT == MVT::f64 ? MVT::v2f64 : MVT::v4f32;
+      SDLoc dl(N);
+      SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
+                                    N->getOperand(0));
+      SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
+                                    N->getOperand(1));
+
+      SDValue Res;
+      if (Subtarget->hasSSE2()) {
+        EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
+        Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
+        Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
+        unsigned Opc;
+        switch (N->getOpcode()) {
+        default: llvm_unreachable("Unexpected opcode!");
+        case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
+        case X86ISD::FAND:  Opc = ISD::AND;      break;
+        case X86ISD::FOR:   Opc = ISD::OR;       break;
+        case X86ISD::FXOR:  Opc = ISD::XOR;      break;
+        }
+        Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
+        Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
+      } else {
+        Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
+      }
+      Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
+                            CurDAG->getIntPtrConstant(0, dl));
       --I;
       CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
       ++I;
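To make the widening concrete, here is a sketch (node shapes inferred from the code above, names illustrative) of what the preprocessor now produces for an f32 FAND on an SSE2 target:

// Before:  r: f32 = X86ISD::FAND a, b
//
// After:   v0: v4f32 = scalar_to_vector a
//          v1: v4f32 = scalar_to_vector b
//          vi: v4i32 = and (bitcast v0), (bitcast v1)
//          r:  f32   = extract_vector_elt (bitcast vi), 0
//
// Each piece now matches an existing isel pattern on its own: the
// scalar_to_vector/extract_vector_elt nodes become register-class copies and
// the `and` a plain vector AND. That is what makes the dedicated
// COPY_TO_REGCLASS patterns deleted below redundant.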
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -5655,51 +5655,6 @@
                                SchedWriteFLogicSizes, 1>;
 defm VXOR : avx512_fp_binop_p<0x57, "vxor", null_frag, HasDQI,
                               SchedWriteFLogicSizes, 1>;
-
-let Predicates = [HasVLX,HasDQI] in {
-  // Use packed logical operations for scalar ops.
-  def : Pat<(f64 (X86fand FR64X:$src1, FR64X:$src2)),
-            (COPY_TO_REGCLASS
-             (v2f64 (VANDPDZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)),
-                                  (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)))),
-             FR64X)>;
-  def : Pat<(f64 (X86for FR64X:$src1, FR64X:$src2)),
-            (COPY_TO_REGCLASS
-             (v2f64 (VORPDZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)),
-                                 (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)))),
-             FR64X)>;
-  def : Pat<(f64 (X86fxor FR64X:$src1, FR64X:$src2)),
-            (COPY_TO_REGCLASS
-             (v2f64 (VXORPDZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)),
-                                  (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)))),
-             FR64X)>;
-  def : Pat<(f64 (X86fandn FR64X:$src1, FR64X:$src2)),
-            (COPY_TO_REGCLASS
-             (v2f64 (VANDNPDZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)),
-                                   (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)))),
-             FR64X)>;
-
-  def : Pat<(f32 (X86fand FR32X:$src1, FR32X:$src2)),
-            (COPY_TO_REGCLASS
-             (v4f32 (VANDPSZ128rr (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)),
-                                  (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)))),
-             FR32X)>;
-  def : Pat<(f32 (X86for FR32X:$src1, FR32X:$src2)),
-            (COPY_TO_REGCLASS
-             (v4f32 (VORPSZ128rr (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)),
-                                 (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)))),
-             FR32X)>;
-  def : Pat<(f32 (X86fxor FR32X:$src1, FR32X:$src2)),
-            (COPY_TO_REGCLASS
-             (v4f32 (VXORPSZ128rr (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)),
-                                  (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)))),
-             FR32X)>;
-  def : Pat<(f32 (X86fandn FR32X:$src1, FR32X:$src2)),
-            (COPY_TO_REGCLASS
-             (v4f32 (VANDNPSZ128rr (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)),
-                                   (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)))),
-             FR32X)>;
-}
 
 multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
                               X86FoldableSchedWrite sched, X86VectorVTInfo _> {
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -2414,99 +2414,6 @@
             (VANDNPSYrm VR256:$src1, addr:$src2)>;
   def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)),
             (VANDNPSYrm VR256:$src1, addr:$src2)>;
-}
-
-let Predicates = [HasAVX, NoVLX_Or_NoDQI] in {
-  // Use packed logical operations for scalar ops.
-  def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)),
-            (COPY_TO_REGCLASS
-             (v2f64 (VANDPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
-                              (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
-             FR64)>;
-  def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)),
-            (COPY_TO_REGCLASS
-             (v2f64 (VORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
-                             (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
-             FR64)>;
-  def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)),
-            (COPY_TO_REGCLASS
-             (v2f64 (VXORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
-                              (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
-             FR64)>;
-  def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)),
-            (COPY_TO_REGCLASS
-             (v2f64 (VANDNPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
-                               (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
-             FR64)>;
-
-  def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)),
-            (COPY_TO_REGCLASS
-             (v4f32 (VANDPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
-                              (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
-             FR32)>;
-  def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)),
-            (COPY_TO_REGCLASS
-             (v4f32 (VORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
-                             (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
-             FR32)>;
-  def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)),
-            (COPY_TO_REGCLASS
-             (v4f32 (VXORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
-                              (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
-             FR32)>;
-  def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)),
-            (COPY_TO_REGCLASS
-             (v4f32 (VANDNPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
-                               (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
-             FR32)>;
-}
-
-let Predicates = [UseSSE1] in {
-  // Use packed logical operations for scalar ops.
-  def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)),
-            (COPY_TO_REGCLASS
-             (v4f32 (ANDPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
-                             (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
-             FR32)>;
-  def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)),
-            (COPY_TO_REGCLASS
-             (v4f32 (ORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
-                            (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
-             FR32)>;
-  def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)),
-            (COPY_TO_REGCLASS
-             (v4f32 (XORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
-                             (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
-             FR32)>;
-  def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)),
-            (COPY_TO_REGCLASS
-             (v4f32 (ANDNPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
-                              (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
-             FR32)>;
-}
-
-let Predicates = [UseSSE2] in {
-  // Use packed logical operations for scalar ops.
-  def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)),
-            (COPY_TO_REGCLASS
-             (v2f64 (ANDPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
-                             (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
-             FR64)>;
-  def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)),
-            (COPY_TO_REGCLASS
-             (v2f64 (ORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
-                            (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
-             FR64)>;
-  def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)),
-            (COPY_TO_REGCLASS
-             (v2f64 (XORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
-                             (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
-             FR64)>;
-  def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)),
-            (COPY_TO_REGCLASS
-             (v2f64 (ANDNPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
-                              (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
-             FR64)>;
 }
 
 let Predicates = [HasAVX, NoVLX] in {
--- a/test/CodeGen/X86/scalar-fp-to-i64.ll
+++ b/test/CodeGen/X86/scalar-fp-to-i64.ll
@@ -630,7 +630,7 @@
 ; SSE3_32_WIN-NEXT:    subsd %xmm1, %xmm0
 ; SSE3_32_WIN-NEXT:    andnpd %xmm0, %xmm3
 ; SSE3_32_WIN-NEXT:    orpd %xmm3, %xmm2
-; SSE3_32_WIN-NEXT:    movsd %xmm2, (%esp)
+; SSE3_32_WIN-NEXT:    movlpd %xmm2, (%esp)
 ; SSE3_32_WIN-NEXT:    fldl (%esp)
 ; SSE3_32_WIN-NEXT:    fisttpll (%esp)
 ; SSE3_32_WIN-NEXT:    setbe %dl
@@ -655,7 +655,7 @@
 ; SSE3_32_LIN-NEXT:    subsd %xmm1, %xmm0
 ; SSE3_32_LIN-NEXT:    andnpd %xmm0, %xmm3
 ; SSE3_32_LIN-NEXT:    orpd %xmm3, %xmm2
-; SSE3_32_LIN-NEXT:    movsd %xmm2, (%esp)
+; SSE3_32_LIN-NEXT:    movlpd %xmm2, (%esp)
 ; SSE3_32_LIN-NEXT:    fldl (%esp)
 ; SSE3_32_LIN-NEXT:    fisttpll (%esp)
 ; SSE3_32_LIN-NEXT:    setbe %dl
@@ -694,7 +694,7 @@
 ; SSE2_32_WIN-NEXT:    andnpd %xmm2, %xmm4
 ; SSE2_32_WIN-NEXT:    andpd %xmm0, %xmm3
 ; SSE2_32_WIN-NEXT:    orpd %xmm4, %xmm3
-; SSE2_32_WIN-NEXT:    movsd %xmm3, {{[0-9]+}}(%esp)
+; SSE2_32_WIN-NEXT:    movlpd %xmm3, {{[0-9]+}}(%esp)
 ; SSE2_32_WIN-NEXT:    fldl {{[0-9]+}}(%esp)
 ; SSE2_32_WIN-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; SSE2_32_WIN-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
@@ -726,7 +726,7 @@
 ; SSE2_32_LIN-NEXT:    andnpd %xmm2, %xmm4
 ; SSE2_32_LIN-NEXT:    andpd %xmm0, %xmm3
 ; SSE2_32_LIN-NEXT:    orpd %xmm4, %xmm3
-; SSE2_32_LIN-NEXT:    movsd %xmm3, {{[0-9]+}}(%esp)
+; SSE2_32_LIN-NEXT:    movlpd %xmm3, {{[0-9]+}}(%esp)
 ; SSE2_32_LIN-NEXT:    fldl {{[0-9]+}}(%esp)
 ; SSE2_32_LIN-NEXT:    fnstcw {{[0-9]+}}(%esp)
 ; SSE2_32_LIN-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
--- a/test/CodeGen/X86/sqrt-fastmath-mir.ll
+++ b/test/CodeGen/X86/sqrt-fastmath-mir.ll
@@ -18,7 +18,7 @@
 ; CHECK: %12:fr32 = VMULSSrr killed %11, killed %10
 ; CHECK: %14:fr32 = FsFLD0SS
 ; CHECK: %15:fr32 = VCMPSSrr %0, killed %14, 0
-; CHECK: %17:vr128 = VANDNPSrr killed %16, killed %13
+; CHECK: %17:vr128 = VPANDNrr killed %16, killed %13
 ; CHECK: $xmm0 = COPY %18
 ; CHECK: RET 0, $xmm0
   %call = tail call float @llvm.sqrt.f32(float %f) #1