llvm.org GIT mirror llvm / 5e40c8b
[ARM][NFCI] Do not fuse VADD and VMUL, continued (1/2)

This is a follow-up to rL342874, which stopped fusing muls and adds into VMLAs for performance reasons on the Cortex-M4 and Cortex-M33. This is a series of 2 patches that tries to achieve the same for VFMA. The second column in the table below shows what we were generating before rL342874, the third column what changed with rL342874, and the last column what we want to achieve with these 2 patches:

 --------------------------------------------------------
 | Opt    |  < rL342874  |  >= rL342874  |              |
 |------------------------------------------------------|
 | -O3    |     vmla     |     vmul      |     vmul     |
 |        |              |     vadd      |     vadd     |
 |------------------------------------------------------|
 | -Ofast |     vfma     |     vfma      |     vmul     |
 |        |              |               |     vadd     |
 |------------------------------------------------------|
 | -Oz    |     vmla     |     vmla      |     vmla     |
 --------------------------------------------------------

This patch, 1/2, is a cleanup of the spaghetti predicate logic on the different VMLA and VFMA codegen rules, so that we can make the final functional change in patch 2/2. This also fixes a typo in the regression test added in rL342874.

Differential revision: https://reviews.llvm.org/D53314

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344671 91177308-0d34-0410-b5e6-96231b3b80d8

Sjoerd Meijer 1 year, 4 months ago
4 changed file(s) with 45 addition(s) and 45 deletion(s). Raw diff Collapse all Expand all
356356 def DontUseMovt : Predicate<"!Subtarget->useMovt(*MF)">;
357357 def UseMovtInPic : Predicate<"Subtarget->useMovt(*MF) && Subtarget->allowPositionIndependentMovt()">;
358358 def DontUseMovtInPic : Predicate<"!Subtarget->useMovt(*MF) || !Subtarget->allowPositionIndependentMovt()">;
359 def UseFPVMLx : Predicate<"Subtarget->useFPVMLx() || MF->getFunction().optForMinSize()">;
359
360 def UseFPVMLx: Predicate<"((Subtarget->useFPVMLx() &&"
361 " !TM.Options.AllowFPOpFusion == FPOpFusion::Fast) ||"
362 "MF->getFunction().optForMinSize())">;
360363 }
361364 def UseMulOps : Predicate<"Subtarget->useMulOps()">;
362365
367370 " FPOpFusion::Fast && "
368371 " Subtarget->hasVFP4()) && "
369372 "!Subtarget->isTargetDarwin()">;
370 def DontUseFusedMAC : Predicate<"!(TM.Options.AllowFPOpFusion =="
371 " FPOpFusion::Fast &&"
372 " Subtarget->hasVFP4()) || "
373 "Subtarget->isTargetDarwin()">;
374373
375374 def HasFastVGETLNi32 : Predicate<"!Subtarget->hasSlowVGETLNi32()">;
376375 def HasSlowVGETLNi32 : Predicate<"Subtarget->hasSlowVGETLNi32()">;
44014401 IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>;
44024402 def VMLAfd : N3VDMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACD, "vmla", "f32",
44034403 v2f32, fmul_su, fadd_mlx>,
4404 Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>;
4404 Requires<[HasNEON, UseFPVMLx]>;
44054405 def VMLAfq : N3VQMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACQ, "vmla", "f32",
44064406 v4f32, fmul_su, fadd_mlx>,
4407 Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>;
4407 Requires<[HasNEON, UseFPVMLx]>;
44084408 def VMLAhd : N3VDMulOp<0, 0, 0b01, 0b1101, 1, IIC_VMACD, "vmla", "f16",
44094409 v4f16, fmul_su, fadd_mlx>,
4410 Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>;
4410 Requires<[HasNEON, HasFullFP16, UseFPVMLx]>;
44114411 def VMLAhq : N3VQMulOp<0, 0, 0b01, 0b1101, 1, IIC_VMACQ, "vmla", "f16",
44124412 v8f16, fmul_su, fadd_mlx>,
4413 Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>;
4413 Requires<[HasNEON, HasFullFP16, UseFPVMLx]>;
44144414 defm VMLAsl : N3VMulOpSL_HS<0b0000, IIC_VMACi16D, IIC_VMACi32D,
44154415 IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>;
44164416 def VMLAslfd : N3VDMulOpSL<0b10, 0b0001, IIC_VMACD, "vmla", "f32",
46314631 IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>;
46324632 def VMLSfd : N3VDMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACD, "vmls", "f32",
46334633 v2f32, fmul_su, fsub_mlx>,
4634 Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>;
4634 Requires<[HasNEON, UseFPVMLx]>;
46354635 def VMLSfq : N3VQMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACQ, "vmls", "f32",
46364636 v4f32, fmul_su, fsub_mlx>,
4637 Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>;
4637 Requires<[HasNEON, UseFPVMLx]>;
46384638 def VMLShd : N3VDMulOp<0, 0, 0b11, 0b1101, 1, IIC_VMACD, "vmls", "f16",
46394639 v4f16, fmul, fsub>,
4640 Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>;
4640 Requires<[HasNEON, HasFullFP16, UseFPVMLx]>;
46414641 def VMLShq : N3VQMulOp<0, 0, 0b11, 0b1101, 1, IIC_VMACQ, "vmls", "f16",
46424642 v8f16, fmul, fsub>,
4643 Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>;
4643 Requires<[HasNEON, HasFullFP16, UseFPVMLx]>;
46444644 defm VMLSsl : N3VMulOpSL_HS<0b0100, IIC_VMACi16D, IIC_VMACi32D,
46454645 IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>;
46464646 def VMLSslfd : N3VDMulOpSL<0b10, 0b0101, IIC_VMACD, "vmls", "f32",
70837083 def : N3VSPat;
70847084 def : N3VSPat;
70857085 def : N3VSMulOpPat,
7086 Requires<[HasNEON, UseNEONForFP, UseFPVMLx, DontUseFusedMAC]>;
7086 Requires<[HasNEON, UseNEONForFP, UseFPVMLx]>;
70877087 def : N3VSMulOpPat,
7088 Requires<[HasNEON, UseNEONForFP, UseFPVMLx, DontUseFusedMAC]>;
7088 Requires<[HasNEON, UseNEONForFP, UseFPVMLx]>;
70897089 def : N3VSMulOpPat,
70907090 Requires<[HasVFP4, UseNEONForFP, UseFusedMAC]>;
70917091 def : N3VSMulOpPat,
18131813 [(set DPR:$Dd, (fadd_mlx (fmul_su DPR:$Dn, DPR:$Dm),
18141814 (f64 DPR:$Ddin)))]>,
18151815 RegConstraint<"$Ddin = $Dd">,
1816 Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>,
1816 Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>,
18171817 Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
18181818
18191819 def VMLAS : ASbIn<0b11100, 0b00, 0, 0,
18221822 [(set SPR:$Sd, (fadd_mlx (fmul_su SPR:$Sn, SPR:$Sm),
18231823 SPR:$Sdin))]>,
18241824 RegConstraint<"$Sdin = $Sd">,
1825 Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>,
1825 Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>,
18261826 Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> {
18271827 // Some single precision VFP instructions may be executed on both NEON and
18281828 // VFP pipelines on A8.
18351835 [(set HPR:$Sd, (fadd_mlx (fmul_su HPR:$Sn, HPR:$Sm),
18361836 HPR:$Sdin))]>,
18371837 RegConstraint<"$Sdin = $Sd">,
1838 Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
1838 Requires<[HasFullFP16,UseFPVMLx]>;
18391839
18401840 def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
18411841 (VMLAD DPR:$dstin, DPR:$a, DPR:$b)>,
1842 Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
1842 Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>;
18431843 def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
18441844 (VMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
1845 Requires<[HasVFP2,DontUseNEONForFP, UseFPVMLx,DontUseFusedMAC]>;
1845 Requires<[HasVFP2,DontUseNEONForFP, UseFPVMLx]>;
18461846 def : Pat<(fadd_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)),
18471847 (VMLAH HPR:$dstin, HPR:$a, HPR:$b)>,
1848 Requires<[HasFullFP16,DontUseNEONForFP, UseFPVMLx,DontUseFusedMAC]>;
1848 Requires<[HasFullFP16,DontUseNEONForFP, UseFPVMLx]>;
18491849
18501850
18511851 def VMLSD : ADbI<0b11100, 0b00, 1, 0,
18541854 [(set DPR:$Dd, (fadd_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
18551855 (f64 DPR:$Ddin)))]>,
18561856 RegConstraint<"$Ddin = $Dd">,
1857 Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>,
1857 Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>,
18581858 Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
18591859
18601860 def VMLSS : ASbIn<0b11100, 0b00, 1, 0,
18631863 [(set SPR:$Sd, (fadd_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)),
18641864 SPR:$Sdin))]>,
18651865 RegConstraint<"$Sdin = $Sd">,
1866 Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>,
1866 Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>,
18671867 Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> {
18681868 // Some single precision VFP instructions may be executed on both NEON and
18691869 // VFP pipelines on A8.
18761876 [(set HPR:$Sd, (fadd_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)),
18771877 HPR:$Sdin))]>,
18781878 RegConstraint<"$Sdin = $Sd">,
1879 Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
1879 Requires<[HasFullFP16,UseFPVMLx]>;
18801880
18811881 def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
18821882 (VMLSD DPR:$dstin, DPR:$a, DPR:$b)>,
1883 Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
1883 Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>;
18841884 def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
18851885 (VMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
1886 Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
1886 Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
18871887 def : Pat<(fsub_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)),
18881888 (VMLSH HPR:$dstin, HPR:$a, HPR:$b)>,
1889 Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
1889 Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>;
18901890
18911891 def VNMLAD : ADbI<0b11100, 0b01, 1, 0,
18921892 (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
18941894 [(set DPR:$Dd,(fsub_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
18951895 (f64 DPR:$Ddin)))]>,
18961896 RegConstraint<"$Ddin = $Dd">,
1897 Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>,
1897 Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>,
18981898 Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
18991899
19001900 def VNMLAS : ASbI<0b11100, 0b01, 1, 0,
19031903 [(set SPR:$Sd, (fsub_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)),
19041904 SPR:$Sdin))]>,
19051905 RegConstraint<"$Sdin = $Sd">,
1906 Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>,
1906 Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>,
19071907 Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> {
19081908 // Some single precision VFP instructions may be executed on both NEON and
19091909 // VFP pipelines on A8.
19161916 [(set HPR:$Sd, (fsub_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)),
19171917 HPR:$Sdin))]>,
19181918 RegConstraint<"$Sdin = $Sd">,
1919 Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
1919 Requires<[HasFullFP16,UseFPVMLx]>;
19201920
19211921 // (-(a * b) - dst) -> -(dst + (a * b))
19221922 def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin),
19231923 (VNMLAD DPR:$dstin, DPR:$a, DPR:$b)>,
1924 Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
1924 Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>;
19251925 def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin),
19261926 (VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
1927 Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
1927 Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
19281928 def : Pat<(fsub_mlx (fneg (fmul_su HPR:$a, HPR:$b)), HPR:$dstin),
19291929 (VNMLAH HPR:$dstin, HPR:$a, HPR:$b)>,
1930 Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
1930 Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>;
19311931
19321932 // (-dst - (a * b)) -> -(dst + (a * b))
19331933 def : Pat<(fsub_mlx (fneg DPR:$dstin), (fmul_su DPR:$a, (f64 DPR:$b))),
19341934 (VNMLAD DPR:$dstin, DPR:$a, DPR:$b)>,
1935 Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
1935 Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>;
19361936 def : Pat<(fsub_mlx (fneg SPR:$dstin), (fmul_su SPR:$a, SPR:$b)),
19371937 (VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
1938 Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
1938 Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
19391939 def : Pat<(fsub_mlx (fneg HPR:$dstin), (fmul_su HPR:$a, HPR:$b)),
19401940 (VNMLAH HPR:$dstin, HPR:$a, HPR:$b)>,
1941 Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
1941 Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>;
19421942
19431943 def VNMLSD : ADbI<0b11100, 0b01, 0, 0,
19441944 (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
19461946 [(set DPR:$Dd, (fsub_mlx (fmul_su DPR:$Dn, DPR:$Dm),
19471947 (f64 DPR:$Ddin)))]>,
19481948 RegConstraint<"$Ddin = $Dd">,
1949 Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>,
1949 Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>,
19501950 Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
19511951
19521952 def VNMLSS : ASbI<0b11100, 0b01, 0, 0,
19541954 IIC_fpMAC32, "vnmls", ".f32\t$Sd, $Sn, $Sm",
19551955 [(set SPR:$Sd, (fsub_mlx (fmul_su SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>,
19561956 RegConstraint<"$Sdin = $Sd">,
1957 Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>,
1957 Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>,
19581958 Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> {
19591959 // Some single precision VFP instructions may be executed on both NEON and
19601960 // VFP pipelines on A8.
19661966 IIC_fpMAC16, "vnmls", ".f16\t$Sd, $Sn, $Sm",
19671967 [(set HPR:$Sd, (fsub_mlx (fmul_su HPR:$Sn, HPR:$Sm), HPR:$Sdin))]>,
19681968 RegConstraint<"$Sdin = $Sd">,
1969 Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
1969 Requires<[HasFullFP16,UseFPVMLx]>;
19701970
19711971 def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin),
19721972 (VNMLSD DPR:$dstin, DPR:$a, DPR:$b)>,
1973 Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
1973 Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>;
19741974 def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin),
19751975 (VNMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
1976 Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
1976 Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
19771977 def : Pat<(fsub_mlx (fmul_su HPR:$a, HPR:$b), HPR:$dstin),
19781978 (VNMLSH HPR:$dstin, HPR:$a, HPR:$b)>,
1979 Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
1979 Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>;
19801980
19811981 //===----------------------------------------------------------------------===//
19821982 // Fused FP Multiply-Accumulate Operations.
2626 ret float %1
2727 }
2828
29 define float @vlma_minsize(float %acc, float %a, float %b) #0 {
29 define float @vmla_minsize(float %acc, float %a, float %b) #0 {
3030 entry:
31 ; VMLA-LABEL: vlma_minsize:
32 ; VLMA: vmla.f32 s0, s1, s2
31 ; VMLA-LABEL: vmla_minsize:
32 ; VMLA: vmla.f32 s0, s1, s2
33 ; VMLA-NEXT: bx lr
3334
3435 %0 = fmul float %a, %b
3536 %1 = fadd float %acc, %0