llvm.org GIT mirror llvm / 857469d
[ARM] Do not fuse VADD and VMUL on the Cortex-M4 and Cortex-M33 A sequence of VMUL and VADD instructions always gives the same or better performance than a fused VMLA instruction on the Cortex-M4 and Cortex-M33. Executing the VMUL and VADD back-to-back takes the same number of cycles, but having separate instructions allows the scheduler to avoid the hazard between the two instructions. Differential Revision: https://reviews.llvm.org/D52289 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@342874 91177308-0d34-0410-b5e6-96231b3b80d8 Sjoerd Meijer 2 years ago
4 changed file(s) with 27 addition(s) and 7 deletion(s). Raw diff Collapse all Expand all
965965 FeatureVFPOnlySP,
966966 FeatureD16,
967967 FeaturePrefLoopAlign32,
968 FeatureHasSlowFPVMLx,
968969 FeatureHasNoBranchPredictor]>;
969970
970971 def : ProcNoItin<"cortex-m7", [ARMv7em,
980981 FeatureD16,
981982 FeatureVFPOnlySP,
982983 FeaturePrefLoopAlign32,
984 FeatureHasSlowFPVMLx,
983985 FeatureHasNoBranchPredictor]>;
984986
985987 def : ProcNoItin<"cortex-a32", [ARMv8a,
352352 let RecomputePerFunction = 1 in {
353353 def UseMovt : Predicate<"Subtarget->useMovt(*MF)">;
354354 def DontUseMovt : Predicate<"!Subtarget->useMovt(*MF)">;
355 def UseMovtInPic : Predicate<"Subtarget->useMovt(*MF) && Subtarget->allowPositionIndependentMovt()">;
356 def DontUseMovtInPic : Predicate<"!Subtarget->useMovt(*MF) || !Subtarget->allowPositionIndependentMovt()">;
357 }
358 def UseFPVMLx : Predicate<"Subtarget->useFPVMLx()">;
355 def UseMovtInPic : Predicate<"Subtarget->useMovt(*MF) && Subtarget->allowPositionIndependentMovt()">;
356 def DontUseMovtInPic : Predicate<"!Subtarget->useMovt(*MF) || !Subtarget->allowPositionIndependentMovt()">;
357 def UseFPVMLx : Predicate<"Subtarget->useFPVMLx() || MF->getFunction().optForMinSize()">;
358 }
359359 def UseMulOps : Predicate<"Subtarget->useMulOps()">;
360360
361361 // Prefer fused MAC for fp mul + add over fp VMLA / VMLS if they are available.
22 ; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s -check-prefix=A8
33 ; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a9 %s -o - | FileCheck %s -check-prefix=A9
44 ; RUN: llc -mtriple=arm-linux-gnueabi -mcpu=cortex-a9 -float-abi=hard %s -o - | FileCheck %s -check-prefix=HARD
5 ; RUN: llc -mtriple=arm-linux-gnueabi -mcpu=cortex-m4 -float-abi=hard %s -o - | FileCheck %s -check-prefix=VMLA
6 ; RUN: llc -mtriple=arm-linux-gnueabi -mcpu=cortex-m33 -float-abi=hard %s -o - | FileCheck %s -check-prefix=VMLA
57
68 define float @t1(float %acc, float %a, float %b) {
79 entry:
1416 ; A8-LABEL: t1:
1517 ; A8: vmul.f32
1618 ; A8: vadd.f32
19
20 ; VMLA-LABEL: t1:
21 ; VMLA: vmul.f32
22 ; VMLA-NEXT: vadd.f32
23
24 %0 = fmul float %a, %b
25 %1 = fadd float %acc, %0
26 ret float %1
27 }
28
29 define float @vlma_minsize(float %acc, float %a, float %b) #0 {
30 entry:
31 ; VMLA-LABEL: vlma_minsize:
32 ; VMLA: vmla.f32 s0, s1, s2
33
1734 %0 = fmul float %a, %b
1835 %1 = fadd float %acc, %0
1936 ret float %1
101118 %3 = fadd float %1, %2
102119 ret float %3
103120 }
121
122 attributes #0 = { minsize nounwind optsize }
0 ; RUN: llc < %s -mtriple=thumbv7-none-eabi -mcpu=cortex-m3 | FileCheck %s -check-prefix=CHECK -check-prefix=SOFT -check-prefix=NONE
1 ; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m4 | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=SP -check-prefix=VMLA
1 ; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m4 | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=SP -check-prefix=NO-VMLA
2 ; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m33 | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=SP -check-prefix=NO-VMLA
23 ; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m7 | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=DP -check-prefix=VFP -check-prefix=FP-ARMv8 -check-prefix=VMLA
34 ; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m7 -mattr=+fp-only-sp | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=SP -check-prefix=FP-ARMv8 -check-prefix=VMLA
45 ; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-a7 | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=DP -check-prefix=NEON -check-prefix=VFP4 -check-prefix=NO-VMLA
187188 ret float %1
188189 }
189190
190 ; FIXME: why does cortex-m4 use vmla, while cortex-a7 uses vmul+vadd?
191 ; (these should be equivalent, even the rounding is the same)
192191 declare float @llvm.fmuladd.f32(float %a, float %b, float %c)
193192 define float @fmuladd_f(float %a, float %b, float %c) {
194193 ; CHECK-LABEL: fmuladd_f: