llvm.org GIT mirror llvm / 7c23aa2
[MachineCombiner] Support for floating-point FMA on ARM64 Evaluates fmul+fadd -> fmadd combines and similar code sequences in the machine combiner. It adds support for float and double similar to the existing integer implementation. The key features are: - DAGCombiner checks whether it should combine greedily or let the machine combiner do the evaluation. This is only supported on ARM64. - It gives preference to throughput over latency: the heuristic used is to combine always in loops. The targets decides whether the machine combiner should optimize for throughput or latency. - Supports for fmadd, f(n)msub, fmla, fmls patterns - On by default at O3 ffast-math git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@267098 91177308-0d34-0410-b5e6-96231b3b80d8 Gerolf Hoflehner 3 years ago
12 changed file(s) with 895 addition(s) and 42 deletion(s). Raw diff Collapse all Expand all
3737 MULSUBX_OP1,
3838 MULSUBX_OP2,
3939 MULADDXI_OP1,
40 MULSUBXI_OP1
40 MULSUBXI_OP1,
41 // Floating Point
42 FMULADDS_OP1,
43 FMULADDS_OP2,
44 FMULSUBS_OP1,
45 FMULSUBS_OP2,
46 FMULADDD_OP1,
47 FMULADDD_OP2,
48 FMULSUBD_OP1,
49 FMULSUBD_OP2,
50 FMLAv1i32_indexed_OP1,
51 FMLAv1i32_indexed_OP2,
52 FMLAv1i64_indexed_OP1,
53 FMLAv1i64_indexed_OP2,
54 FMLAv2f32_OP2,
55 FMLAv2f32_OP1,
56 FMLAv2f64_OP1,
57 FMLAv2f64_OP2,
58 FMLAv2i32_indexed_OP1,
59 FMLAv2i32_indexed_OP2,
60 FMLAv2i64_indexed_OP1,
61 FMLAv2i64_indexed_OP2,
62 FMLAv4f32_OP1,
63 FMLAv4f32_OP2,
64 FMLAv4i32_indexed_OP1,
65 FMLAv4i32_indexed_OP2,
66 FMLSv1i32_indexed_OP2,
67 FMLSv1i64_indexed_OP2,
68 FMLSv2i32_indexed_OP2,
69 FMLSv2i64_indexed_OP2,
70 FMLSv2f32_OP2,
71 FMLSv2f64_OP2,
72 FMLSv4i32_indexed_OP2,
73 FMLSv4f32_OP2
4174 };
4275
4376 } // end namespace llvm
1616 #define LLVM_CODEGEN_SELECTIONDAGTARGETINFO_H
1717
1818 #include "llvm/CodeGen/SelectionDAGNodes.h"
19 #include "llvm/Support/CodeGen.h"
1920
2021 namespace llvm {
2122
137138 MachinePointerInfo SrcPtrInfo) const {
138139 return std::make_pair(SDValue(), SDValue());
139140 }
141 // Return true when the decision to generate FMA's (or FMS, FMLA etc) rather
142 // than FMUL and ADD is delegated to the machine combiner.
143 virtual bool GenerateFMAsInMachineCombiner(CodeGenOpt::Level OptLevel) const {
144 return false;
145 }
140146 };
141147
142148 } // end llvm namespace
817817 MachineInstr &Root,
818818 SmallVectorImpl &Patterns) const;
819819
820 /// Return true when a code sequence can improve throughput. It
821 /// should be called only for instructions in loops.
822 /// \param Pattern - combiner pattern
823 virtual bool isThroughputPattern(MachineCombinerPattern Pattern) const;
824
820825 /// Return true if the input \P Inst is part of a chain of dependent ops
821826 /// that are suitable for reassociation, otherwise return false.
822827 /// If the instruction's operands must be commuted to have a previous
3939 const TargetRegisterInfo *TRI;
4040 MCSchedModel SchedModel;
4141 MachineRegisterInfo *MRI;
42 MachineLoopInfo *MLI; // Current MachineLoopInfo
4243 MachineTraceMetrics *Traces;
4344 MachineTraceMetrics::Ensemble *MinInstr;
4445
8586
8687 INITIALIZE_PASS_BEGIN(MachineCombiner, "machine-combiner",
8788 "Machine InstCombiner", false, false)
89 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
8890 INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
8991 INITIALIZE_PASS_END(MachineCombiner, "machine-combiner", "Machine InstCombiner",
9092 false, false)
9294 void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
9395 AU.setPreservesCFG();
9496 AU.addPreserved();
97 AU.addRequired();
9598 AU.addPreserved();
9699 AU.addRequired();
97100 AU.addPreserved();
353356 DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");
354357
355358 auto BlockIter = MBB->begin();
359 // Check if the block is in a loop.
360 const MachineLoop *ML = MLI->getLoopFor(MBB);
356361
357362 while (BlockIter != MBB->end()) {
358363 auto &MI = *BlockIter++;
405410 if (!NewInstCount)
406411 continue;
407412
413 bool SubstituteAlways = false;
414 if (ML && TII->isThroughputPattern(P))
415 SubstituteAlways = true;
416
408417 // Substitute when we optimize for codesize and the new sequence has
409418 // fewer instructions OR
410419 // the new sequence neither lengthens the critical path nor increases
411420 // resource pressure.
412 if (doSubstitute(NewInstCount, OldInstCount) ||
421 if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount) ||
413422 (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs,
414423 InstrIdxForVirtReg, P) &&
415424 preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) {
446455 SchedModel = STI.getSchedModel();
447456 TSchedModel.init(SchedModel, &STI, TII);
448457 MRI = &MF.getRegInfo();
458 MLI = &getAnalysis();
449459 Traces = &getAnalysis();
450460 MinInstr = nullptr;
451461 OptSize = MF.getFunction()->optForSize();
2323 #include "llvm/Analysis/AliasAnalysis.h"
2424 #include "llvm/CodeGen/MachineFrameInfo.h"
2525 #include "llvm/CodeGen/MachineFunction.h"
26 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
2627 #include "llvm/IR/DataLayout.h"
2728 #include "llvm/IR/DerivedTypes.h"
2829 #include "llvm/IR/Function.h"
8485
8586 class DAGCombiner {
8687 SelectionDAG &DAG;
88 const SelectionDAGTargetInfo &STI;
8789 const TargetLowering &TLI;
8890 CombineLevel Level;
8991 CodeGenOpt::Level OptLevel;
468470
469471 public:
470472 DAGCombiner(SelectionDAG &D, AliasAnalysis &A, CodeGenOpt::Level OL)
471 : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
472 OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) {
473 : DAG(D), STI(D.getSelectionDAGInfo()), TLI(D.getTargetLoweringInfo()),
474 Level(BeforeLegalizeTypes), OptLevel(OL), LegalOperations(false),
475 LegalTypes(false), AA(A) {
473476 ForCodeSize = DAG.getMachineFunction().getFunction()->optForSize();
474477 }
475478
77147717 if (!HasFMAD && !HasFMA)
77157718 return SDValue();
77167719
7720 if (AllowFusion && STI.GenerateFMAsInMachineCombiner(OptLevel))
7721 return SDValue();
7722
77177723 // Always prefer FMAD to FMA for precision.
77187724 unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
77197725 bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
78957901
78967902 // No valid opcode, do not combine.
78977903 if (!HasFMAD && !HasFMA)
7904 return SDValue();
7905
7906 if (AllowFusion && STI.GenerateFMAsInMachineCombiner(OptLevel))
78987907 return SDValue();
78997908
79007909 // Always prefer FMAD to FMA for precision.
654654
655655 return false;
656656 }
657
657 /// Return true when a code sequence can improve loop throughput.
658 bool
659 TargetInstrInfo::isThroughputPattern(MachineCombinerPattern Pattern) const {
660 return false;
661 }
658662 /// Attempt the reassociation transformation to reduce critical path length.
659663 /// See the above comments before getMachineCombinerPatterns().
660664 void TargetInstrInfo::reassociateOps(
27872787 return false;
27882788 }
27892789 //
2790 // FP Opcodes that can be combined with a FMUL
2791 static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
2792 switch (Inst.getOpcode()) {
2793 case AArch64::FADDSrr:
2794 case AArch64::FADDDrr:
2795 case AArch64::FADDv2f32:
2796 case AArch64::FADDv2f64:
2797 case AArch64::FADDv4f32:
2798 case AArch64::FSUBSrr:
2799 case AArch64::FSUBDrr:
2800 case AArch64::FSUBv2f32:
2801 case AArch64::FSUBv2f64:
2802 case AArch64::FSUBv4f32:
2803 return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath;
2804 default:
2805 break;
2806 }
2807 return false;
2808 }
2809 //
27902810 // Opcodes that can be combined with a MUL
27912811 static bool isCombineInstrCandidate(unsigned Opc) {
27922812 return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
27932813 }
27942814
2795 static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
2796 unsigned MulOpc, unsigned ZeroReg) {
2815 //
2816 // Utility routine that checks if \param MO is defined by an
2817 // \param CombineOpc instruction in the basic block \param MBB
2818 static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
2819 unsigned CombineOpc, unsigned ZeroReg = 0,
2820 bool CheckZeroReg = false) {
27972821 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
27982822 MachineInstr *MI = nullptr;
2799 // We need a virtual register definition.
2823
28002824 if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
28012825 MI = MRI.getUniqueVRegDef(MO.getReg());
28022826 // And it needs to be in the trace (otherwise, it won't have a depth).
2803 if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != MulOpc)
2804 return false;
2805
2806 assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
2807 MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
2808 MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs");
2809
2810 // The third input reg must be zero.
2811 if (MI->getOperand(3).getReg() != ZeroReg)
2812 return false;
2813
2827 if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
2828 return false;
28142829 // Must only used by the user we combine with.
28152830 if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
28162831 return false;
28172832
2833 if (CheckZeroReg) {
2834 assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
2835 MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
2836 MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs");
2837 // The third input reg must be zero.
2838 if (MI->getOperand(3).getReg() != ZeroReg)
2839 return false;
2840 }
2841
28182842 return true;
2843 }
2844
2845 //
2846 // Is \param MO defined by an integer multiply and can be combined?
2847 static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
2848 unsigned MulOpc, unsigned ZeroReg) {
2849 return canCombine(MBB, MO, MulOpc, ZeroReg, true);
2850 }
2851
2852 //
2853 // Is \param MO defined by a floating-point multiply and can be combined?
2854 static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
2855 unsigned MulOpc) {
2856 return canCombine(MBB, MO, MulOpc);
28192857 }
28202858
28212859 // TODO: There are many more machine instruction opcodes to match:
29512989 }
29522990 return Found;
29532991 }
2954
2992 /// Floating-Point Support
2993
2994 /// Find instructions that can be turned into madd.
2995 static bool getFMAPatterns(MachineInstr &Root,
2996 SmallVectorImpl &Patterns) {
2997
2998 if (!isCombineInstrCandidateFP(Root))
2999 return 0;
3000
3001 MachineBasicBlock &MBB = *Root.getParent();
3002 bool Found = false;
3003
3004 switch (Root.getOpcode()) {
3005 default:
3006 assert(false && "Unsupported FP instruction in combiner\n");
3007 break;
3008 case AArch64::FADDSrr:
3009 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
3010 "FADDWrr does not have register operands");
3011 if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) {
3012 Patterns.push_back(MachineCombinerPattern::FMULADDS_OP1);
3013 Found = true;
3014 } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3015 AArch64::FMULv1i32_indexed)) {
3016 Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP1);
3017 Found = true;
3018 }
3019 if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) {
3020 Patterns.push_back(MachineCombinerPattern::FMULADDS_OP2);
3021 Found = true;
3022 } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3023 AArch64::FMULv1i32_indexed)) {
3024 Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP2);
3025 Found = true;
3026 }
3027 break;
3028 case AArch64::FADDDrr:
3029 if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) {
3030 Patterns.push_back(MachineCombinerPattern::FMULADDD_OP1);
3031 Found = true;
3032 } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3033 AArch64::FMULv1i64_indexed)) {
3034 Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP1);
3035 Found = true;
3036 }
3037 if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) {
3038 Patterns.push_back(MachineCombinerPattern::FMULADDD_OP2);
3039 Found = true;
3040 } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3041 AArch64::FMULv1i64_indexed)) {
3042 Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP2);
3043 Found = true;
3044 }
3045 break;
3046 case AArch64::FADDv2f32:
3047 if (canCombineWithFMUL(MBB, Root.getOperand(1),
3048 AArch64::FMULv2i32_indexed)) {
3049 Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP1);
3050 Found = true;
3051 } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3052 AArch64::FMULv2f32)) {
3053 Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP1);
3054 Found = true;
3055 }
3056 if (canCombineWithFMUL(MBB, Root.getOperand(2),
3057 AArch64::FMULv2i32_indexed)) {
3058 Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP2);
3059 Found = true;
3060 } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3061 AArch64::FMULv2f32)) {
3062 Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP2);
3063 Found = true;
3064 }
3065 break;
3066 case AArch64::FADDv2f64:
3067 if (canCombineWithFMUL(MBB, Root.getOperand(1),
3068 AArch64::FMULv2i64_indexed)) {
3069 Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP1);
3070 Found = true;
3071 } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3072 AArch64::FMULv2f64)) {
3073 Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP1);
3074 Found = true;
3075 }
3076 if (canCombineWithFMUL(MBB, Root.getOperand(2),
3077 AArch64::FMULv2i64_indexed)) {
3078 Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP2);
3079 Found = true;
3080 } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3081 AArch64::FMULv2f64)) {
3082 Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP2);
3083 Found = true;
3084 }
3085 break;
3086 case AArch64::FADDv4f32:
3087 if (canCombineWithFMUL(MBB, Root.getOperand(1),
3088 AArch64::FMULv4i32_indexed)) {
3089 Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP1);
3090 Found = true;
3091 } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3092 AArch64::FMULv4f32)) {
3093 Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP1);
3094 Found = true;
3095 }
3096 if (canCombineWithFMUL(MBB, Root.getOperand(2),
3097 AArch64::FMULv4i32_indexed)) {
3098 Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP2);
3099 Found = true;
3100 } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3101 AArch64::FMULv4f32)) {
3102 Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP2);
3103 Found = true;
3104 }
3105 break;
3106
3107 case AArch64::FSUBSrr:
3108 if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) {
3109 Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP1);
3110 Found = true;
3111 }
3112 if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) {
3113 Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP2);
3114 Found = true;
3115 } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3116 AArch64::FMULv1i32_indexed)) {
3117 Patterns.push_back(MachineCombinerPattern::FMLSv1i32_indexed_OP2);
3118 Found = true;
3119 }
3120 break;
3121 case AArch64::FSUBDrr:
3122 if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) {
3123 Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP1);
3124 Found = true;
3125 }
3126 if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) {
3127 Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP2);
3128 Found = true;
3129 } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3130 AArch64::FMULv1i64_indexed)) {
3131 Patterns.push_back(MachineCombinerPattern::FMLSv1i64_indexed_OP2);
3132 Found = true;
3133 }
3134 break;
3135 case AArch64::FSUBv2f32:
3136 if (canCombineWithFMUL(MBB, Root.getOperand(2),
3137 AArch64::FMULv2i32_indexed)) {
3138 Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP2);
3139 Found = true;
3140 } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3141 AArch64::FMULv2f32)) {
3142 Patterns.push_back(MachineCombinerPattern::FMLSv2f32_OP2);
3143 Found = true;
3144 }
3145 break;
3146 case AArch64::FSUBv2f64:
3147 if (canCombineWithFMUL(MBB, Root.getOperand(2),
3148 AArch64::FMULv2i64_indexed)) {
3149 Patterns.push_back(MachineCombinerPattern::FMLSv2i64_indexed_OP2);
3150 Found = true;
3151 } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3152 AArch64::FMULv2f64)) {
3153 Patterns.push_back(MachineCombinerPattern::FMLSv2f64_OP2);
3154 Found = true;
3155 }
3156 break;
3157 case AArch64::FSUBv4f32:
3158 if (canCombineWithFMUL(MBB, Root.getOperand(2),
3159 AArch64::FMULv4i32_indexed)) {
3160 Patterns.push_back(MachineCombinerPattern::FMLSv4i32_indexed_OP2);
3161 Found = true;
3162 } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3163 AArch64::FMULv4f32)) {
3164 Patterns.push_back(MachineCombinerPattern::FMLSv4f32_OP2);
3165 Found = true;
3166 }
3167 break;
3168 }
3169 return Found;
3170 }
3171
3172 /// Return true when a code sequence can improve throughput. It
3173 /// should be called only for instructions in loops.
3174 /// \param Pattern - combiner pattern
3175 bool
3176 AArch64InstrInfo::isThroughputPattern(MachineCombinerPattern Pattern) const {
3177 switch (Pattern) {
3178 default:
3179 break;
3180 case MachineCombinerPattern::FMULADDS_OP1:
3181 case MachineCombinerPattern::FMULADDS_OP2:
3182 case MachineCombinerPattern::FMULSUBS_OP1:
3183 case MachineCombinerPattern::FMULSUBS_OP2:
3184 case MachineCombinerPattern::FMULADDD_OP1:
3185 case MachineCombinerPattern::FMULADDD_OP2:
3186 case MachineCombinerPattern::FMULSUBD_OP1:
3187 case MachineCombinerPattern::FMULSUBD_OP2:
3188 case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
3189 case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
3190 case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
3191 case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
3192 case MachineCombinerPattern::FMLAv2f32_OP2:
3193 case MachineCombinerPattern::FMLAv2f32_OP1:
3194 case MachineCombinerPattern::FMLAv2f64_OP1:
3195 case MachineCombinerPattern::FMLAv2f64_OP2:
3196 case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
3197 case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
3198 case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
3199 case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
3200 case MachineCombinerPattern::FMLAv4f32_OP1:
3201 case MachineCombinerPattern::FMLAv4f32_OP2:
3202 case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
3203 case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
3204 case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
3205 case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
3206 case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
3207 case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
3208 case MachineCombinerPattern::FMLSv2f32_OP2:
3209 case MachineCombinerPattern::FMLSv2f64_OP2:
3210 case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
3211 case MachineCombinerPattern::FMLSv4f32_OP2:
3212 return true;
3213 } // end switch (Pattern)
3214 return false;
3215 }
29553216 /// Return true when there is potentially a faster code sequence for an
29563217 /// instruction chain ending in \p Root. All potential patterns are listed in
29573218 /// the \p Pattern vector. Pattern should be sorted in priority order since the
29603221 bool AArch64InstrInfo::getMachineCombinerPatterns(
29613222 MachineInstr &Root,
29623223 SmallVectorImpl &Patterns) const {
3224 // Integer patterns
29633225 if (getMaddPatterns(Root, Patterns))
29643226 return true;
3227 // Floating point patterns
3228 if (getFMAPatterns(Root, Patterns))
3229 return true;
29653230
29663231 return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
29673232 }
29683233
2969 /// genMadd - Generate madd instruction and combine mul and add.
2970 /// Example:
2971 /// MUL I=A,B,0
2972 /// ADD R,I,C
2973 /// ==> MADD R,A,B,C
2974 /// \param Root is the ADD instruction
3234 enum class FMAInstKind { Default, Indexed, Accumulator };
3235 /// genFusedMultiply - Generate fused multiply instructions.
3236 /// This function supports both integer and floating point instructions.
3237 /// A typical example:
3238 /// F|MUL I=A,B,0
3239 /// F|ADD R,I,C
3240 /// ==> F|MADD R,A,B,C
3241 /// \param Root is the F|ADD instruction
29753242 /// \param [out] InsInstrs is a vector of machine instructions and will
29763243 /// contain the generated madd instruction
29773244 /// \param IdxMulOpd is index of operand in Root that is the result of
2978 /// the MUL. In the example above IdxMulOpd is 1.
2979 /// \param MaddOpc the opcode fo the madd instruction
2980 static MachineInstr *genMadd(MachineFunction &MF, MachineRegisterInfo &MRI,
2981 const TargetInstrInfo *TII, MachineInstr &Root,
2982 SmallVectorImpl &InsInstrs,
2983 unsigned IdxMulOpd, unsigned MaddOpc,
2984 const TargetRegisterClass *RC) {
3245 /// the F|MUL. In the example above IdxMulOpd is 1.
3246 /// \param MaddOpc the opcode fo the f|madd instruction
3247 static MachineInstr *
3248 genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
3249 const TargetInstrInfo *TII, MachineInstr &Root,
3250 SmallVectorImpl &InsInstrs, unsigned IdxMulOpd,
3251 unsigned MaddOpc, const TargetRegisterClass *RC,
3252 FMAInstKind kind = FMAInstKind::Default) {
29853253 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
29863254
29873255 unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
30033271 if (TargetRegisterInfo::isVirtualRegister(SrcReg2))
30043272 MRI.constrainRegClass(SrcReg2, RC);
30053273
3006 MachineInstrBuilder MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc),
3007 ResultReg)
3008 .addReg(SrcReg0, getKillRegState(Src0IsKill))
3009 .addReg(SrcReg1, getKillRegState(Src1IsKill))
3010 .addReg(SrcReg2, getKillRegState(Src2IsKill));
3011 // Insert the MADD
3274 MachineInstrBuilder MIB;
3275 if (kind == FMAInstKind::Default)
3276 MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
3277 .addReg(SrcReg0, getKillRegState(Src0IsKill))
3278 .addReg(SrcReg1, getKillRegState(Src1IsKill))
3279 .addReg(SrcReg2, getKillRegState(Src2IsKill));
3280 else if (kind == FMAInstKind::Indexed)
3281 MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
3282 .addReg(SrcReg2, getKillRegState(Src2IsKill))
3283 .addReg(SrcReg0, getKillRegState(Src0IsKill))
3284 .addReg(SrcReg1, getKillRegState(Src1IsKill))
3285 .addImm(MUL->getOperand(3).getImm());
3286 else if (kind == FMAInstKind::Accumulator)
3287 MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
3288 .addReg(SrcReg2, getKillRegState(Src2IsKill))
3289 .addReg(SrcReg0, getKillRegState(Src0IsKill))
3290 .addReg(SrcReg1, getKillRegState(Src1IsKill));
3291 else
3292 assert(false && "Invalid FMA instruction kind \n");
3293 // Insert the MADD (MADD, FMA, FMS, FMLA, FMSL)
30123294 InsInstrs.push_back(MIB);
30133295 return MUL;
30143296 }
30963378 Opc = AArch64::MADDXrrr;
30973379 RC = &AArch64::GPR64RegClass;
30983380 }
3099 MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
3381 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
31003382 break;
31013383 case MachineCombinerPattern::MULADDW_OP2:
31023384 case MachineCombinerPattern::MULADDX_OP2:
31113393 Opc = AArch64::MADDXrrr;
31123394 RC = &AArch64::GPR64RegClass;
31133395 }
3114 MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
3396 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
31153397 break;
31163398 case MachineCombinerPattern::MULADDWI_OP1:
31173399 case MachineCombinerPattern::MULADDXI_OP1: {
32033485 Opc = AArch64::MSUBXrrr;
32043486 RC = &AArch64::GPR64RegClass;
32053487 }
3206 MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
3488 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
32073489 break;
32083490 case MachineCombinerPattern::MULSUBWI_OP1:
32093491 case MachineCombinerPattern::MULSUBXI_OP1: {
32453527 InsInstrs.push_back(MIB1);
32463528 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
32473529 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
3530 }
3531 break;
3532 }
3533 // Floating Point Support
3534 case MachineCombinerPattern::FMULADDS_OP1:
3535 case MachineCombinerPattern::FMULADDD_OP1:
3536 // MUL I=A,B,0
3537 // ADD R,I,C
3538 // ==> MADD R,A,B,C
3539 // --- Create(MADD);
3540 if (Pattern == MachineCombinerPattern::FMULADDS_OP1) {
3541 Opc = AArch64::FMADDSrrr;
3542 RC = &AArch64::FPR32RegClass;
3543 } else {
3544 Opc = AArch64::FMADDDrrr;
3545 RC = &AArch64::FPR64RegClass;
3546 }
3547 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
3548 break;
3549 case MachineCombinerPattern::FMULADDS_OP2:
3550 case MachineCombinerPattern::FMULADDD_OP2:
3551 // FMUL I=A,B,0
3552 // FADD R,C,I
3553 // ==> FMADD R,A,B,C
3554 // --- Create(FMADD);
3555 if (Pattern == MachineCombinerPattern::FMULADDS_OP2) {
3556 Opc = AArch64::FMADDSrrr;
3557 RC = &AArch64::FPR32RegClass;
3558 } else {
3559 Opc = AArch64::FMADDDrrr;
3560 RC = &AArch64::FPR64RegClass;
3561 }
3562 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
3563 break;
3564
3565 case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
3566 Opc = AArch64::FMLAv1i32_indexed;
3567 RC = &AArch64::FPR32RegClass;
3568 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
3569 FMAInstKind::Indexed);
3570 break;
3571 case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
3572 Opc = AArch64::FMLAv1i32_indexed;
3573 RC = &AArch64::FPR32RegClass;
3574 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
3575 FMAInstKind::Indexed);
3576 break;
3577
3578 case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
3579 Opc = AArch64::FMLAv1i64_indexed;
3580 RC = &AArch64::FPR64RegClass;
3581 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
3582 FMAInstKind::Indexed);
3583 break;
3584 case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
3585 Opc = AArch64::FMLAv1i64_indexed;
3586 RC = &AArch64::FPR64RegClass;
3587 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
3588 FMAInstKind::Indexed);
3589 break;
3590
3591 case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
3592 case MachineCombinerPattern::FMLAv2f32_OP1:
3593 RC = &AArch64::FPR64RegClass;
3594 if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
3595 Opc = AArch64::FMLAv2i32_indexed;
3596 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
3597 FMAInstKind::Indexed);
3598 } else {
3599 Opc = AArch64::FMLAv2f32;
3600 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
3601 FMAInstKind::Accumulator);
3602 }
3603 break;
3604 case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
3605 case MachineCombinerPattern::FMLAv2f32_OP2:
3606 RC = &AArch64::FPR64RegClass;
3607 if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
3608 Opc = AArch64::FMLAv2i32_indexed;
3609 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
3610 FMAInstKind::Indexed);
3611 } else {
3612 Opc = AArch64::FMLAv2f32;
3613 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
3614 FMAInstKind::Accumulator);
3615 }
3616 break;
3617
3618 case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
3619 case MachineCombinerPattern::FMLAv2f64_OP1:
3620 RC = &AArch64::FPR128RegClass;
3621 if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
3622 Opc = AArch64::FMLAv2i64_indexed;
3623 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
3624 FMAInstKind::Indexed);
3625 } else {
3626 Opc = AArch64::FMLAv2f64;
3627 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
3628 FMAInstKind::Accumulator);
3629 }
3630 break;
3631 case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
3632 case MachineCombinerPattern::FMLAv2f64_OP2:
3633 RC = &AArch64::FPR128RegClass;
3634 if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) {
3635 Opc = AArch64::FMLAv2i64_indexed;
3636 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
3637 FMAInstKind::Indexed);
3638 } else {
3639 Opc = AArch64::FMLAv2f64;
3640 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
3641 FMAInstKind::Accumulator);
3642 }
3643 break;
3644
3645 case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
3646 case MachineCombinerPattern::FMLAv4f32_OP1:
3647 RC = &AArch64::FPR128RegClass;
3648 if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) {
3649 Opc = AArch64::FMLAv4i32_indexed;
3650 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
3651 FMAInstKind::Indexed);
3652 } else {
3653 Opc = AArch64::FMLAv4f32;
3654 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
3655 FMAInstKind::Accumulator);
3656 }
3657 break;
3658
3659 case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
3660 case MachineCombinerPattern::FMLAv4f32_OP2:
3661 RC = &AArch64::FPR128RegClass;
3662 if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
3663 Opc = AArch64::FMLAv4i32_indexed;
3664 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
3665 FMAInstKind::Indexed);
3666 } else {
3667 Opc = AArch64::FMLAv4f32;
3668 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
3669 FMAInstKind::Accumulator);
3670 }
3671 break;
3672
3673 case MachineCombinerPattern::FMULSUBS_OP1:
3674 case MachineCombinerPattern::FMULSUBD_OP1: {
3675 // FMUL I=A,B,0
3676 // FSUB R,I,C
3677 // ==> FNMSUB R,A,B,C // = -C + A*B
3678 // --- Create(FNMSUB);
3679 if (Pattern == MachineCombinerPattern::FMULSUBS_OP1) {
3680 Opc = AArch64::FNMSUBSrrr;
3681 RC = &AArch64::FPR32RegClass;
3682 } else {
3683 Opc = AArch64::FNMSUBDrrr;
3684 RC = &AArch64::FPR64RegClass;
3685 }
3686 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
3687 break;
3688 }
3689 case MachineCombinerPattern::FMULSUBS_OP2:
3690 case MachineCombinerPattern::FMULSUBD_OP2: {
3691 // FMUL I=A,B,0
3692 // FSUB R,C,I
3693 // ==> FMSUB R,A,B,C (computes C - A*B)
3694 // --- Create(FMSUB);
3695 if (Pattern == MachineCombinerPattern::FMULSUBS_OP2) {
3696 Opc = AArch64::FMSUBSrrr;
3697 RC = &AArch64::FPR32RegClass;
3698 } else {
3699 Opc = AArch64::FMSUBDrrr;
3700 RC = &AArch64::FPR64RegClass;
3701 }
3702 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
3703 break;
3704
3705 case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
3706 Opc = AArch64::FMLSv1i32_indexed;
3707 RC = &AArch64::FPR32RegClass;
3708 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
3709 FMAInstKind::Indexed);
3710 break;
3711
3712 case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
3713 Opc = AArch64::FMLSv1i64_indexed;
3714 RC = &AArch64::FPR64RegClass;
3715 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
3716 FMAInstKind::Indexed);
3717 break;
3718
3719 case MachineCombinerPattern::FMLSv2f32_OP2:
3720 case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
3721 RC = &AArch64::FPR64RegClass;
3722 if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
3723 Opc = AArch64::FMLSv2i32_indexed;
3724 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
3725 FMAInstKind::Indexed);
3726 } else {
3727 Opc = AArch64::FMLSv2f32;
3728 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
3729 FMAInstKind::Accumulator);
3730 }
3731 break;
3732
3733 case MachineCombinerPattern::FMLSv2f64_OP2:
3734 case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
3735 RC = &AArch64::FPR128RegClass;
3736 if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
3737 Opc = AArch64::FMLSv2i64_indexed;
3738 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
3739 FMAInstKind::Indexed);
3740 } else {
3741 Opc = AArch64::FMLSv2f64;
3742 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
3743 FMAInstKind::Accumulator);
3744 }
3745 break;
3746
3747 case MachineCombinerPattern::FMLSv4f32_OP2:
3748 case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
3749 RC = &AArch64::FPR128RegClass;
3750 if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) {
3751 Opc = AArch64::FMLSv4i32_indexed;
3752 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
3753 FMAInstKind::Indexed);
3754 } else {
3755 Opc = AArch64::FMLSv4f32;
3756 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
3757 FMAInstKind::Accumulator);
32483758 }
32493759 break;
32503760 }
173173 unsigned SrcReg2, int CmpMask, int CmpValue,
174174 const MachineRegisterInfo *MRI) const override;
175175 bool optimizeCondBranch(MachineInstr *MI) const override;
176
177 /// Return true when a code sequence can improve throughput. It
178 /// should be called only for instructions in loops.
179 /// \param Pattern - combiner pattern
180 bool isThroughputPattern(MachineCombinerPattern Pattern) const override;
176181 /// Return true when there is potentially a faster code sequence
177182 /// for an instruction chain ending in . All potential patterns are
178183 /// listed in the array.
5050 }
5151 return SDValue();
5252 }
53 bool AArch64SelectionDAGInfo::GenerateFMAsInMachineCombiner(
54 CodeGenOpt::Level OptLevel) const {
55 if (OptLevel >= CodeGenOpt::Aggressive)
56 return true;
57 return false;
58 }
2424 SDValue Dst, SDValue Src, SDValue Size,
2525 unsigned Align, bool isVolatile,
2626 MachinePointerInfo DstPtrInfo) const override;
27 bool GenerateFMAsInMachineCombiner(CodeGenOpt::Level OptLevel) const override;
2728 };
2829 }
2930
0 ; RUN: llc < %s -O=3 -mtriple=arm64-apple-ios -mcpu=cyclone -enable-unsafe-fp-math | FileCheck %s
; Exercises scalar and vector double fmadd formation in the machine
; combiner: the entry block must keep its two fmuls unfused, while the
; loop body forms fmla.2d / fmla.d.
; NOTE(review): the constant-vector operands on %add2/%mul2/%add3 were
; lost when this diff was extracted (invalid IR with a trailing comma);
; they are reconstructed with placeholder constants below -- verify
; against the upstream test arm64-fma-combines.ll.
define void @foo_2d(double* %src) {
; CHECK-LABEL: %entry
; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
entry:
  %arrayidx1 = getelementptr inbounds double, double* %src, i64 5
  %arrayidx2 = getelementptr inbounds double, double* %src, i64 11
  %tmp = bitcast double* %arrayidx1 to <2 x double>*
  %tmp1 = load double, double* %arrayidx2, align 8
  %tmp2 = load double, double* %arrayidx1, align 8
  %fmul = fmul fast double %tmp1, %tmp1
  %fmul2 = fmul fast double %tmp2, 0x3F94AFD6A052BF5B
  %fadd = fadd fast double %fmul, %fmul2
  br label %for.body

; CHECK-LABEL: %for.body
; CHECK: fmla.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; CHECK: fmla.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
; CHECK: fmla.d {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}[0]
for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %arrayidx3 = getelementptr inbounds double, double* %src, i64 %indvars.iv.next
  %tmp3 = load double, double* %arrayidx3, align 8
  %add = fadd fast double %tmp3, %tmp3
  %mul = fmul fast double %add, %fadd
  %e1 = insertelement <2 x double> undef, double %add, i32 0
  %e2 = insertelement <2 x double> %e1, double %add, i32 1
  %add2 = fadd fast <2 x double> %e2, <double 3.000000e+00, double -3.000000e+00>
  %e3 = insertelement <2 x double> undef, double %mul, i32 0
  %e4 = insertelement <2 x double> %e3, double %mul, i32 1
  %mul2 = fmul fast <2 x double> %add2, <double 3.000000e+00, double -3.000000e+00>
  %e5 = insertelement <2 x double> undef, double %add, i32 0
  %e6 = insertelement <2 x double> %e5, double %add, i32 1
  %add3 = fadd fast <2 x double> %mul2, <double 3.000000e+00, double -3.000000e+00>
  %mulx = fmul fast <2 x double> %add2, %e2
  %addx = fadd fast <2 x double> %mulx, %e4
  %e7 = insertelement <2 x double> undef, double %mul, i32 0
  %e8 = insertelement <2 x double> %e7, double %mul, i32 1
  %e9 = fmul fast <2 x double> %addx, %add3
  store <2 x double> %e9, <2 x double>* %tmp, align 8
  %e10 = extractelement <2 x double> %add3, i32 0
  %mul3 = fmul fast double %mul, %e10
  %add4 = fadd fast double %mul3, %mul
  store double %add4, double* %arrayidx2, align 8
  %exitcond = icmp eq i64 %indvars.iv.next, 25
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret void
}
; Exercises <2 x float> and scalar float fmadd formation in the loop
; body (fmla.2s / fmla.s).
; NOTE(review): the constant-vector operands on %add2/%mul2/%add3 were
; lost in extraction; reconstructed with placeholder constants -- verify
; against the upstream test arm64-fma-combines.ll.
define void @foo_2s(float* %src) {
entry:
  %arrayidx1 = getelementptr inbounds float, float* %src, i64 5
  %arrayidx2 = getelementptr inbounds float, float* %src, i64 11
  %tmp = bitcast float* %arrayidx1 to <2 x float>*
  br label %for.body

; CHECK-LABEL: %for.body
; CHECK: fmla.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; CHECK: fmla.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
; CHECK: fmla.s {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}[0]
for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %arrayidx3 = getelementptr inbounds float, float* %src, i64 %indvars.iv.next
  %tmp1 = load float, float* %arrayidx3, align 8
  %add = fadd fast float %tmp1, %tmp1
  %mul = fmul fast float %add, %add
  %e1 = insertelement <2 x float> undef, float %add, i32 0
  %e2 = insertelement <2 x float> %e1, float %add, i32 1
  %add2 = fadd fast <2 x float> %e2, <float 3.000000e+00, float -3.000000e+00>
  %e3 = insertelement <2 x float> undef, float %mul, i32 0
  %e4 = insertelement <2 x float> %e3, float %mul, i32 1
  %mul2 = fmul fast <2 x float> %add2, <float 3.000000e+00, float -3.000000e+00>
  %e5 = insertelement <2 x float> undef, float %add, i32 0
  %e6 = insertelement <2 x float> %e5, float %add, i32 1
  %add3 = fadd fast <2 x float> %mul2, <float 3.000000e+00, float -3.000000e+00>
  %mulx = fmul fast <2 x float> %add2, %e2
  %addx = fadd fast <2 x float> %mulx, %e4
  %e7 = insertelement <2 x float> undef, float %mul, i32 0
  %e8 = insertelement <2 x float> %e7, float %mul, i32 1
  %e9 = fmul fast <2 x float> %addx, %add3
  store <2 x float> %e9, <2 x float>* %tmp, align 8
  %e10 = extractelement <2 x float> %add3, i32 0
  %mul3 = fmul fast float %mul, %e10
  %add4 = fadd fast float %mul3, %mul
  store float %add4, float* %arrayidx2, align 8
  %exitcond = icmp eq i64 %indvars.iv.next, 25
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret void
}
; Exercises <4 x float> fmadd formation in the loop body (fmla.4s).
; Only lanes 0 and 1 of the 4-wide vectors are populated via
; insertelement; that matches the diffed source and is kept as-is.
; NOTE(review): the constant-vector operands on %add2/%mul2/%add3 were
; lost in extraction; reconstructed with placeholder constants -- verify
; against the upstream test arm64-fma-combines.ll.
define void @foo_4s(float* %src) {
entry:
  %arrayidx1 = getelementptr inbounds float, float* %src, i64 5
  %arrayidx2 = getelementptr inbounds float, float* %src, i64 11
  %tmp = bitcast float* %arrayidx1 to <4 x float>*
  br label %for.body

; CHECK-LABEL: %for.body
; CHECK: fmla.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; CHECK: fmla.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %arrayidx3 = getelementptr inbounds float, float* %src, i64 %indvars.iv.next
  %tmp1 = load float, float* %arrayidx3, align 8
  %add = fadd fast float %tmp1, %tmp1
  %mul = fmul fast float %add, %add
  %e1 = insertelement <4 x float> undef, float %add, i32 0
  %e2 = insertelement <4 x float> %e1, float %add, i32 1
  %add2 = fadd fast <4 x float> %e2, <float 3.000000e+00, float -3.000000e+00, float 5.000000e+00, float 7.000000e+00>
  %e3 = insertelement <4 x float> undef, float %mul, i32 0
  %e4 = insertelement <4 x float> %e3, float %mul, i32 1
  %mul2 = fmul fast <4 x float> %add2, <float 3.000000e+00, float -3.000000e+00, float 5.000000e+00, float 7.000000e+00>
  %e5 = insertelement <4 x float> undef, float %add, i32 0
  %e6 = insertelement <4 x float> %e5, float %add, i32 1
  %add3 = fadd fast <4 x float> %mul2, <float 3.000000e+00, float -3.000000e+00, float 5.000000e+00, float 7.000000e+00>
  %mulx = fmul fast <4 x float> %add2, %e2
  %addx = fadd fast <4 x float> %mulx, %e4
  %e7 = insertelement <4 x float> undef, float %mul, i32 0
  %e8 = insertelement <4 x float> %e7, float %mul, i32 1
  %e9 = fmul fast <4 x float> %addx, %add3
  store <4 x float> %e9, <4 x float>* %tmp, align 8
  %e10 = extractelement <4 x float> %add3, i32 0
  %mul3 = fmul fast float %mul, %e10
  store float %mul3, float* %arrayidx2, align 8
  %exitcond = icmp eq i64 %indvars.iv.next, 25
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret void
}
0 ; RUN: llc < %s -O=3 -mtriple=arm64-apple-ios -mcpu=cyclone -enable-unsafe-fp-math | FileCheck %s
; Exercises <2 x double> and scalar double fmsub/fmls formation in the
; loop body (fmls.2d / fmls.d): fsub of an fmul maps to the multiply-
; subtract patterns.
; NOTE(review): the constant-vector operands on %sub2/%mul2/%sub3 were
; lost in extraction; reconstructed with placeholder constants -- verify
; against the upstream fmls test. The `sub nuw` on an induction variable
; starting at 0 also looks suspicious but is kept as diffed.
define void @foo_2d(double* %src) {
entry:
  %arrayidx1 = getelementptr inbounds double, double* %src, i64 5
  %arrayidx2 = getelementptr inbounds double, double* %src, i64 11
  %tmp = bitcast double* %arrayidx1 to <2 x double>*
  br label %for.body

; CHECK-LABEL: %for.body
; CHECK: fmls.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; CHECK: fmls.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
; CHECK: fmls.d {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}[0]
for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %indvars.iv.next = sub nuw nsw i64 %indvars.iv, 1
  %arrayidx3 = getelementptr inbounds double, double* %src, i64 %indvars.iv.next
  %tmp1 = load double, double* %arrayidx3, align 8
  %add = fadd fast double %tmp1, %tmp1
  %mul = fmul fast double %add, %add
  %e1 = insertelement <2 x double> undef, double %add, i32 0
  %e2 = insertelement <2 x double> %e1, double %add, i32 1
  %sub2 = fsub fast <2 x double> %e2, <double 3.000000e+00, double -3.000000e+00>
  %e3 = insertelement <2 x double> undef, double %mul, i32 0
  %e4 = insertelement <2 x double> %e3, double %mul, i32 1
  %mul2 = fmul fast <2 x double> %sub2, <double 3.000000e+00, double -3.000000e+00>
  %e5 = insertelement <2 x double> undef, double %add, i32 0
  %e6 = insertelement <2 x double> %e5, double %add, i32 1
  %sub3 = fsub fast <2 x double> <double 3.000000e+00, double -3.000000e+00>, %mul2
  %mulx = fmul fast <2 x double> %sub2, %e2
  %subx = fsub fast <2 x double> %e4, %mulx
  %e7 = insertelement <2 x double> undef, double %mul, i32 0
  %e8 = insertelement <2 x double> %e7, double %mul, i32 1
  %e9 = fmul fast <2 x double> %subx, %sub3
  store <2 x double> %e9, <2 x double>* %tmp, align 8
  %e10 = extractelement <2 x double> %sub3, i32 0
  %mul3 = fmul fast double %mul, %e10
  %sub4 = fsub fast double %mul, %mul3
  store double %sub4, double* %arrayidx2, align 8
  %exitcond = icmp eq i64 %indvars.iv.next, 25
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret void
}
; Exercises <2 x float> and scalar float fmls formation in the loop body
; (fmls.2s / fmls.s).
; NOTE(review): the constant-vector operands on %add2/%mul2/%add3 were
; lost in extraction; reconstructed with placeholder constants -- verify
; against the upstream fmls test.
define void @foo_2s(float* %src) {
entry:
  %arrayidx1 = getelementptr inbounds float, float* %src, i64 5
  %arrayidx2 = getelementptr inbounds float, float* %src, i64 11
  %tmp = bitcast float* %arrayidx1 to <2 x float>*
  br label %for.body

; CHECK-LABEL: %for.body
; CHECK: fmls.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; CHECK: fmls.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
; CHECK: fmls.s {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}[0]
for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %arrayidx3 = getelementptr inbounds float, float* %src, i64 %indvars.iv.next
  %tmp1 = load float, float* %arrayidx3, align 8
  %add = fadd fast float %tmp1, %tmp1
  %mul = fmul fast float %add, %add
  %e1 = insertelement <2 x float> undef, float %add, i32 0
  %e2 = insertelement <2 x float> %e1, float %add, i32 1
  %add2 = fsub fast <2 x float> %e2, <float 3.000000e+00, float -3.000000e+00>
  %e3 = insertelement <2 x float> undef, float %mul, i32 0
  %e4 = insertelement <2 x float> %e3, float %mul, i32 1
  %mul2 = fmul fast <2 x float> %add2, <float 3.000000e+00, float -3.000000e+00>
  %e5 = insertelement <2 x float> undef, float %add, i32 0
  %e6 = insertelement <2 x float> %e5, float %add, i32 1
  %add3 = fsub fast <2 x float> <float 3.000000e+00, float -3.000000e+00>, %mul2
  %mulx = fmul fast <2 x float> %add2, %e2
  %addx = fsub fast <2 x float> %e4, %mulx
  %e7 = insertelement <2 x float> undef, float %mul, i32 0
  %e8 = insertelement <2 x float> %e7, float %mul, i32 1
  %e9 = fmul fast <2 x float> %addx, %add3
  store <2 x float> %e9, <2 x float>* %tmp, align 8
  %e10 = extractelement <2 x float> %add3, i32 0
  %mul3 = fmul fast float %mul, %e10
  %add4 = fsub fast float %mul, %mul3
  store float %add4, float* %arrayidx2, align 8
  %exitcond = icmp eq i64 %indvars.iv.next, 25
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret void
}
; Exercises <4 x float> fmls formation in the loop body (fmls.4s).
; %add2 uses fadd while %add3/%addx use fsub; kept exactly as diffed.
; NOTE(review): the constant-vector operands on %add2/%mul2/%add3 were
; lost in extraction; reconstructed with placeholder constants -- verify
; against the upstream fmls test.
define void @foo_4s(float* %src) {
entry:
  %arrayidx1 = getelementptr inbounds float, float* %src, i64 5
  %arrayidx2 = getelementptr inbounds float, float* %src, i64 11
  %tmp = bitcast float* %arrayidx1 to <4 x float>*
  br label %for.body

; CHECK-LABEL: %for.body
; CHECK: fmls.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; CHECK: fmls.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %arrayidx3 = getelementptr inbounds float, float* %src, i64 %indvars.iv.next
  %tmp1 = load float, float* %arrayidx3, align 8
  %add = fadd fast float %tmp1, %tmp1
  %mul = fmul fast float %add, %add
  %e1 = insertelement <4 x float> undef, float %add, i32 0
  %e2 = insertelement <4 x float> %e1, float %add, i32 1
  %add2 = fadd fast <4 x float> %e2, <float 3.000000e+00, float -3.000000e+00, float 5.000000e+00, float 7.000000e+00>
  %e3 = insertelement <4 x float> undef, float %mul, i32 0
  %e4 = insertelement <4 x float> %e3, float %mul, i32 1
  %mul2 = fmul fast <4 x float> %add2, <float 3.000000e+00, float -3.000000e+00, float 5.000000e+00, float 7.000000e+00>
  %e5 = insertelement <4 x float> undef, float %add, i32 0
  %e6 = insertelement <4 x float> %e5, float %add, i32 1
  %add3 = fsub fast <4 x float> <float 3.000000e+00, float -3.000000e+00, float 5.000000e+00, float 7.000000e+00>, %mul2
  %mulx = fmul fast <4 x float> %add2, %e2
  %addx = fsub fast <4 x float> %e4, %mulx
  %e7 = insertelement <4 x float> undef, float %mul, i32 0
  %e8 = insertelement <4 x float> %e7, float %mul, i32 1
  %e9 = fmul fast <4 x float> %addx, %add3
  store <4 x float> %e9, <4 x float>* %tmp, align 8
  %e10 = extractelement <4 x float> %add3, i32 0
  %mul3 = fmul fast float %mul, %e10
  store float %mul3, float* %arrayidx2, align 8
  %exitcond = icmp eq i64 %indvars.iv.next, 25
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret void
}