llvm.org GIT mirror: llvm @ 3f71db1

[MachineCombiner] Support for floating-point FMA on ARM64 (re-commit r267098)

The original patch caused crashes because it could dereference a null pointer
for SelectionDAGTargetInfo for targets that do not define it.

Evaluates fmul+fadd -> fmadd combines and similar code sequences in the
machine combiner. It adds support for float and double similar to the
existing integer implementation. The key features are:

- DAGCombiner checks whether it should combine greedily or let the machine
  combiner do the evaluation. This is only supported on ARM64.
- It gives preference to throughput over latency: the heuristic used is to
  combine always in loops. The target decides whether the machine combiner
  should optimize for throughput or latency.
- Support for fmadd, f(n)msub, fmla, and fmls patterns
- On by default at O3 with -ffast-math

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@267328 91177308-0d34-0410-b5e6-96231b3b80d8

Author: Gerolf Hoflehner

12 changed files with 894 additions and 41 deletions.
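For orientation, here is a minimal editorial sketch (not part of the commit) of the code sequence this patch targets. Compiled for AArch64 at -O3 with -ffast-math, the separate multiply and add below become a machine-combiner candidate rather than being fused greedily in DAGCombiner:

```cpp
// Hypothetical example; the function name and exact flags are assumptions.
// Built with: clang -O3 -ffast-math --target=arm64-apple-ios
double mac(double A, double B, double C) {
  return A * B + C; // fmul + fadd  ==>  fmadd d0, d0, d1, d2
}
```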
3737 MULSUBX_OP1,
3838 MULSUBX_OP2,
3939 MULADDXI_OP1,
40 MULSUBXI_OP1
40 MULSUBXI_OP1,
41 // Floating Point
42 FMULADDS_OP1,
43 FMULADDS_OP2,
44 FMULSUBS_OP1,
45 FMULSUBS_OP2,
46 FMULADDD_OP1,
47 FMULADDD_OP2,
48 FMULSUBD_OP1,
49 FMULSUBD_OP2,
50 FMLAv1i32_indexed_OP1,
51 FMLAv1i32_indexed_OP2,
52 FMLAv1i64_indexed_OP1,
53 FMLAv1i64_indexed_OP2,
54 FMLAv2f32_OP2,
55 FMLAv2f32_OP1,
56 FMLAv2f64_OP1,
57 FMLAv2f64_OP2,
58 FMLAv2i32_indexed_OP1,
59 FMLAv2i32_indexed_OP2,
60 FMLAv2i64_indexed_OP1,
61 FMLAv2i64_indexed_OP2,
62 FMLAv4f32_OP1,
63 FMLAv4f32_OP2,
64 FMLAv4i32_indexed_OP1,
65 FMLAv4i32_indexed_OP2,
66 FMLSv1i32_indexed_OP2,
67 FMLSv1i64_indexed_OP2,
68 FMLSv2i32_indexed_OP2,
69 FMLSv2i64_indexed_OP2,
70 FMLSv2f32_OP2,
71 FMLSv2f64_OP2,
72 FMLSv4i32_indexed_OP2,
73 FMLSv4f32_OP2
4174 };
4275
4376 } // end namespace llvm
1616 #define LLVM_CODEGEN_SELECTIONDAGTARGETINFO_H
1717
1818 #include "llvm/CodeGen/SelectionDAGNodes.h"
19 #include "llvm/Support/CodeGen.h"
1920
2021 namespace llvm {
2122
137138 MachinePointerInfo SrcPtrInfo) const {
138139 return std::make_pair(SDValue(), SDValue());
139140 }
141 // Return true when the decision to generate FMAs (or FMS, FMLA, etc.) rather
142 // than FMUL and ADD is delegated to the machine combiner.
143 virtual bool GenerateFMAsInMachineCombiner(CodeGenOpt::Level OptLevel) const {
144 return false;
145 }
140146 };
141147
142148 } // end llvm namespace
817817 MachineInstr &Root,
818818 SmallVectorImpl<MachineCombinerPattern> &Patterns) const;
819819
820 /// Return true when a code sequence can improve throughput. It
821 /// should be called only for instructions in loops.
822 /// \param Pattern - combiner pattern
823 virtual bool isThroughputPattern(MachineCombinerPattern Pattern) const;
824
820825 /// Return true if the input \P Inst is part of a chain of dependent ops
821826 /// that are suitable for reassociation, otherwise return false.
822827 /// If the instruction's operands must be commuted to have a previous
3939 const TargetRegisterInfo *TRI;
4040 MCSchedModel SchedModel;
4141 MachineRegisterInfo *MRI;
42 MachineLoopInfo *MLI; // Current MachineLoopInfo
4243 MachineTraceMetrics *Traces;
4344 MachineTraceMetrics::Ensemble *MinInstr;
4445
8586
8687 INITIALIZE_PASS_BEGIN(MachineCombiner, "machine-combiner",
8788 "Machine InstCombiner", false, false)
89 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
8890 INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
8991 INITIALIZE_PASS_END(MachineCombiner, "machine-combiner", "Machine InstCombiner",
9092 false, false)
9294 void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
9395 AU.setPreservesCFG();
9496 AU.addPreserved<MachineDominatorTree>();
97 AU.addRequired<MachineLoopInfo>();
9598 AU.addPreserved<MachineLoopInfo>();
9699 AU.addRequired<MachineTraceMetrics>();
97100 AU.addPreserved<MachineTraceMetrics>();
353356 DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");
354357
355358 auto BlockIter = MBB->begin();
359 // Check if the block is in a loop.
360 const MachineLoop *ML = MLI->getLoopFor(MBB);
356361
357362 while (BlockIter != MBB->end()) {
358363 auto &MI = *BlockIter++;
405410 if (!NewInstCount)
406411 continue;
407412
413 bool SubstituteAlways = false;
414 if (ML && TII->isThroughputPattern(P))
415 SubstituteAlways = true;
416
408417 // Substitute when we optimize for codesize and the new sequence has
409418 // fewer instructions OR
410419 // the new sequence neither lengthens the critical path nor increases
411420 // resource pressure.
412 if (doSubstitute(NewInstCount, OldInstCount) ||
421 if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount) ||
413422 (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs,
414423 InstrIdxForVirtReg, P) &&
415424 preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) {
446455 SchedModel = STI.getSchedModel();
447456 TSchedModel.init(SchedModel, &STI, TII);
448457 MRI = &MF.getRegInfo();
458 MLI = &getAnalysis<MachineLoopInfo>();
449459 Traces = &getAnalysis<MachineTraceMetrics>();
450460 MinInstr = nullptr;
451461 OptSize = MF.getFunction()->optForSize();
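The MachineLoopInfo plumbing above drives the new substitution policy: for a block inside a loop, a throughput pattern is substituted unconditionally, bypassing the critical-path and resource-pressure checks. A hedged editorial sketch of a loop that exercises this (assumed flags and target; not code from the patch):

```cpp
// Hypothetical kernel, assuming clang -O3 -ffast-math on AArch64.
// a * x[i] + y[i] matches an FMULADD throughput pattern inside the loop
// body, so the combiner always fuses it into fmadd/fmla, optimizing for
// throughput even when the critical path does not shrink.
void axpy(double *y, const double *x, double a, int n) {
  for (int i = 0; i < n; ++i)
    y[i] = a * x[i] + y[i];
}
```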
2323 #include "llvm/Analysis/AliasAnalysis.h"
2424 #include "llvm/CodeGen/MachineFrameInfo.h"
2525 #include "llvm/CodeGen/MachineFunction.h"
26 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
2627 #include "llvm/IR/DataLayout.h"
2728 #include "llvm/IR/DerivedTypes.h"
2829 #include "llvm/IR/Function.h"
77157716 if (!HasFMAD && !HasFMA)
77167717 return SDValue();
77177718
7719 const SelectionDAGTargetInfo *STI = DAG.getSubtarget().getSelectionDAGInfo();
7721 if (AllowFusion && STI && STI->GenerateFMAsInMachineCombiner(OptLevel))
7722 return SDValue();
7723
77187724 // Always prefer FMAD to FMA for precision.
77197725 unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
77207726 bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
78967902
78977903 // No valid opcode, do not combine.
78987904 if (!HasFMAD && !HasFMA)
7905 return SDValue();
7906
7907 const SelectionDAGTargetInfo *STI = DAG.getSubtarget().getSelectionDAGInfo();
7908 if (AllowFusion && STI && STI->GenerateFMAsInMachineCombiner(OptLevel))
78997909 return SDValue();
79007910
79017911 // Always prefer FMAD to FMA for precision.
83668376 AddToWorklist(Fused.getNode());
83678377 return Fused;
83688378 }
8369
83708379 return SDValue();
83718380 }
83728381
654654
655655 return false;
656656 }
657
657 /// Return true when a code sequence can improve loop throughput.
658 bool
659 TargetInstrInfo::isThroughputPattern(MachineCombinerPattern Pattern) const {
660 return false;
661 }
658662 /// Attempt the reassociation transformation to reduce critical path length.
659663 /// See the above comments before getMachineCombinerPatterns().
660664 void TargetInstrInfo::reassociateOps(
27862786 return false;
27872787 }
27882788 //
2789 // FP Opcodes that can be combined with an FMUL
2790 static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
2791 switch (Inst.getOpcode()) {
2792 case AArch64::FADDSrr:
2793 case AArch64::FADDDrr:
2794 case AArch64::FADDv2f32:
2795 case AArch64::FADDv2f64:
2796 case AArch64::FADDv4f32:
2797 case AArch64::FSUBSrr:
2798 case AArch64::FSUBDrr:
2799 case AArch64::FSUBv2f32:
2800 case AArch64::FSUBv2f64:
2801 case AArch64::FSUBv4f32:
2802 return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath;
2803 default:
2804 break;
2805 }
2806 return false;
2807 }
2808 //
27892809 // Opcodes that can be combined with a MUL
27902810 static bool isCombineInstrCandidate(unsigned Opc) {
27912811 return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
27922812 }
27932813
2794 static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
2795 unsigned MulOpc, unsigned ZeroReg) {
2814 //
2815 // Utility routine that checks if \param MO is defined by an
2816 // \param CombineOpc instruction in the basic block \param MBB
2817 static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
2818 unsigned CombineOpc, unsigned ZeroReg = 0,
2819 bool CheckZeroReg = false) {
27962820 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
27972821 MachineInstr *MI = nullptr;
2798 // We need a virtual register definition.
2822
27992823 if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
28002824 MI = MRI.getUniqueVRegDef(MO.getReg());
28012825 // And it needs to be in the trace (otherwise, it won't have a depth).
2802 if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != MulOpc)
2803 return false;
2804
2805 assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
2806 MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
2807 MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs");
2808
2809 // The third input reg must be zero.
2810 if (MI->getOperand(3).getReg() != ZeroReg)
2811 return false;
2812
2826 if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
2827 return false;
28132828 // Must only be used by the user we combine with.
28142829 if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
28152830 return false;
28162831
2832 if (CheckZeroReg) {
2833 assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
2834 MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
2835 MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
2836 // The third input reg must be zero.
2837 if (MI->getOperand(3).getReg() != ZeroReg)
2838 return false;
2839 }
2840
28172841 return true;
2842 }
2843
2844 //
2845 // Is \param MO defined by an integer multiply and can be combined?
2846 static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
2847 unsigned MulOpc, unsigned ZeroReg) {
2848 return canCombine(MBB, MO, MulOpc, ZeroReg, true);
2849 }
2850
2851 //
2852 // Is \param MO defined by a floating-point multiply and can be combined?
2853 static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
2854 unsigned MulOpc) {
2855 return canCombine(MBB, MO, MulOpc);
28182856 }
28192857
28202858 // TODO: There are many more machine instruction opcodes to match:
29502988 }
29512989 return Found;
29522990 }
2953
2991 /// Floating-Point Support
2992
2993 /// Find instructions that can be turned into fmadd.
2994 static bool getFMAPatterns(MachineInstr &Root,
2995 SmallVectorImpl<MachineCombinerPattern> &Patterns) {
2996
2997 if (!isCombineInstrCandidateFP(Root))
2998 return false;
2999
3000 MachineBasicBlock &MBB = *Root.getParent();
3001 bool Found = false;
3002
3003 switch (Root.getOpcode()) {
3004 default:
3005 assert(false && "Unsupported FP instruction in combiner\n");
3006 break;
3007 case AArch64::FADDSrr:
3008 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
3009 "FADDWrr does not have register operands");
3010 if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) {
3011 Patterns.push_back(MachineCombinerPattern::FMULADDS_OP1);
3012 Found = true;
3013 } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3014 AArch64::FMULv1i32_indexed)) {
3015 Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP1);
3016 Found = true;
3017 }
3018 if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) {
3019 Patterns.push_back(MachineCombinerPattern::FMULADDS_OP2);
3020 Found = true;
3021 } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3022 AArch64::FMULv1i32_indexed)) {
3023 Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP2);
3024 Found = true;
3025 }
3026 break;
3027 case AArch64::FADDDrr:
3028 if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) {
3029 Patterns.push_back(MachineCombinerPattern::FMULADDD_OP1);
3030 Found = true;
3031 } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3032 AArch64::FMULv1i64_indexed)) {
3033 Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP1);
3034 Found = true;
3035 }
3036 if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) {
3037 Patterns.push_back(MachineCombinerPattern::FMULADDD_OP2);
3038 Found = true;
3039 } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3040 AArch64::FMULv1i64_indexed)) {
3041 Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP2);
3042 Found = true;
3043 }
3044 break;
3045 case AArch64::FADDv2f32:
3046 if (canCombineWithFMUL(MBB, Root.getOperand(1),
3047 AArch64::FMULv2i32_indexed)) {
3048 Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP1);
3049 Found = true;
3050 } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3051 AArch64::FMULv2f32)) {
3052 Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP1);
3053 Found = true;
3054 }
3055 if (canCombineWithFMUL(MBB, Root.getOperand(2),
3056 AArch64::FMULv2i32_indexed)) {
3057 Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP2);
3058 Found = true;
3059 } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3060 AArch64::FMULv2f32)) {
3061 Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP2);
3062 Found = true;
3063 }
3064 break;
3065 case AArch64::FADDv2f64:
3066 if (canCombineWithFMUL(MBB, Root.getOperand(1),
3067 AArch64::FMULv2i64_indexed)) {
3068 Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP1);
3069 Found = true;
3070 } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3071 AArch64::FMULv2f64)) {
3072 Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP1);
3073 Found = true;
3074 }
3075 if (canCombineWithFMUL(MBB, Root.getOperand(2),
3076 AArch64::FMULv2i64_indexed)) {
3077 Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP2);
3078 Found = true;
3079 } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3080 AArch64::FMULv2f64)) {
3081 Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP2);
3082 Found = true;
3083 }
3084 break;
3085 case AArch64::FADDv4f32:
3086 if (canCombineWithFMUL(MBB, Root.getOperand(1),
3087 AArch64::FMULv4i32_indexed)) {
3088 Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP1);
3089 Found = true;
3090 } else if (canCombineWithFMUL(MBB, Root.getOperand(1),
3091 AArch64::FMULv4f32)) {
3092 Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP1);
3093 Found = true;
3094 }
3095 if (canCombineWithFMUL(MBB, Root.getOperand(2),
3096 AArch64::FMULv4i32_indexed)) {
3097 Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP2);
3098 Found = true;
3099 } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3100 AArch64::FMULv4f32)) {
3101 Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP2);
3102 Found = true;
3103 }
3104 break;
3105
3106 case AArch64::FSUBSrr:
3107 if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) {
3108 Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP1);
3109 Found = true;
3110 }
3111 if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) {
3112 Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP2);
3113 Found = true;
3114 } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3115 AArch64::FMULv1i32_indexed)) {
3116 Patterns.push_back(MachineCombinerPattern::FMLSv1i32_indexed_OP2);
3117 Found = true;
3118 }
3119 break;
3120 case AArch64::FSUBDrr:
3121 if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) {
3122 Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP1);
3123 Found = true;
3124 }
3125 if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) {
3126 Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP2);
3127 Found = true;
3128 } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3129 AArch64::FMULv1i64_indexed)) {
3130 Patterns.push_back(MachineCombinerPattern::FMLSv1i64_indexed_OP2);
3131 Found = true;
3132 }
3133 break;
3134 case AArch64::FSUBv2f32:
3135 if (canCombineWithFMUL(MBB, Root.getOperand(2),
3136 AArch64::FMULv2i32_indexed)) {
3137 Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP2);
3138 Found = true;
3139 } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3140 AArch64::FMULv2f32)) {
3141 Patterns.push_back(MachineCombinerPattern::FMLSv2f32_OP2);
3142 Found = true;
3143 }
3144 break;
3145 case AArch64::FSUBv2f64:
3146 if (canCombineWithFMUL(MBB, Root.getOperand(2),
3147 AArch64::FMULv2i64_indexed)) {
3148 Patterns.push_back(MachineCombinerPattern::FMLSv2i64_indexed_OP2);
3149 Found = true;
3150 } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3151 AArch64::FMULv2f64)) {
3152 Patterns.push_back(MachineCombinerPattern::FMLSv2f64_OP2);
3153 Found = true;
3154 }
3155 break;
3156 case AArch64::FSUBv4f32:
3157 if (canCombineWithFMUL(MBB, Root.getOperand(2),
3158 AArch64::FMULv4i32_indexed)) {
3159 Patterns.push_back(MachineCombinerPattern::FMLSv4i32_indexed_OP2);
3160 Found = true;
3161 } else if (canCombineWithFMUL(MBB, Root.getOperand(2),
3162 AArch64::FMULv4f32)) {
3163 Patterns.push_back(MachineCombinerPattern::FMLSv4f32_OP2);
3164 Found = true;
3165 }
3166 break;
3167 }
3168 return Found;
3169 }
3170
3171 /// Return true when a code sequence can improve throughput. It
3172 /// should be called only for instructions in loops.
3173 /// \param Pattern - combiner pattern
3174 bool
3175 AArch64InstrInfo::isThroughputPattern(MachineCombinerPattern Pattern) const {
3176 switch (Pattern) {
3177 default:
3178 break;
3179 case MachineCombinerPattern::FMULADDS_OP1:
3180 case MachineCombinerPattern::FMULADDS_OP2:
3181 case MachineCombinerPattern::FMULSUBS_OP1:
3182 case MachineCombinerPattern::FMULSUBS_OP2:
3183 case MachineCombinerPattern::FMULADDD_OP1:
3184 case MachineCombinerPattern::FMULADDD_OP2:
3185 case MachineCombinerPattern::FMULSUBD_OP1:
3186 case MachineCombinerPattern::FMULSUBD_OP2:
3187 case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
3188 case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
3189 case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
3190 case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
3191 case MachineCombinerPattern::FMLAv2f32_OP2:
3192 case MachineCombinerPattern::FMLAv2f32_OP1:
3193 case MachineCombinerPattern::FMLAv2f64_OP1:
3194 case MachineCombinerPattern::FMLAv2f64_OP2:
3195 case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
3196 case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
3197 case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
3198 case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
3199 case MachineCombinerPattern::FMLAv4f32_OP1:
3200 case MachineCombinerPattern::FMLAv4f32_OP2:
3201 case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
3202 case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
3203 case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
3204 case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
3205 case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
3206 case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
3207 case MachineCombinerPattern::FMLSv2f32_OP2:
3208 case MachineCombinerPattern::FMLSv2f64_OP2:
3209 case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
3210 case MachineCombinerPattern::FMLSv4f32_OP2:
3211 return true;
3212 } // end switch (Pattern)
3213 return false;
3214 }
29543215 /// Return true when there is potentially a faster code sequence for an
29553216 /// instruction chain ending in \p Root. All potential patterns are listed in
29563217 /// the \p Pattern vector. Pattern should be sorted in priority order since the
29593220 bool AArch64InstrInfo::getMachineCombinerPatterns(
29603221 MachineInstr &Root,
29613222 SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
3223 // Integer patterns
29623224 if (getMaddPatterns(Root, Patterns))
29633225 return true;
3226 // Floating point patterns
3227 if (getFMAPatterns(Root, Patterns))
3228 return true;
29643229
29653230 return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
29663231 }
29673232
2968 /// genMadd - Generate madd instruction and combine mul and add.
2969 /// Example:
2970 /// MUL I=A,B,0
2971 /// ADD R,I,C
2972 /// ==> MADD R,A,B,C
2973 /// \param Root is the ADD instruction
3233 enum class FMAInstKind { Default, Indexed, Accumulator };
3234 /// genFusedMultiply - Generate fused multiply instructions.
3235 /// This function supports both integer and floating point instructions.
3236 /// A typical example:
3237 /// F|MUL I=A,B,0
3238 /// F|ADD R,I,C
3239 /// ==> F|MADD R,A,B,C
3240 /// \param Root is the F|ADD instruction
29743241 /// \param [out] InsInstrs is a vector of machine instructions and will
29753242 /// contain the generated madd instruction
29763243 /// \param IdxMulOpd is index of operand in Root that is the result of
2977 /// the MUL. In the example above IdxMulOpd is 1.
2978 /// \param MaddOpc the opcode fo the madd instruction
2979 static MachineInstr *genMadd(MachineFunction &MF, MachineRegisterInfo &MRI,
2980 const TargetInstrInfo *TII, MachineInstr &Root,
2981 SmallVectorImpl<MachineInstr *> &InsInstrs,
2982 unsigned IdxMulOpd, unsigned MaddOpc,
2983 const TargetRegisterClass *RC) {
3244 /// the F|MUL. In the example above IdxMulOpd is 1.
3245 /// \param MaddOpc the opcode of the f|madd instruction
3246 static MachineInstr *
3247 genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
3248 const TargetInstrInfo *TII, MachineInstr &Root,
3249 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
3250 unsigned MaddOpc, const TargetRegisterClass *RC,
3251 FMAInstKind kind = FMAInstKind::Default) {
29843252 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
29853253
29863254 unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
30023270 if (TargetRegisterInfo::isVirtualRegister(SrcReg2))
30033271 MRI.constrainRegClass(SrcReg2, RC);
30043272
3005 MachineInstrBuilder MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc),
3006 ResultReg)
3007 .addReg(SrcReg0, getKillRegState(Src0IsKill))
3008 .addReg(SrcReg1, getKillRegState(Src1IsKill))
3009 .addReg(SrcReg2, getKillRegState(Src2IsKill));
3010 // Insert the MADD
3273 MachineInstrBuilder MIB;
3274 if (kind == FMAInstKind::Default)
3275 MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
3276 .addReg(SrcReg0, getKillRegState(Src0IsKill))
3277 .addReg(SrcReg1, getKillRegState(Src1IsKill))
3278 .addReg(SrcReg2, getKillRegState(Src2IsKill));
3279 else if (kind == FMAInstKind::Indexed)
3280 MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
3281 .addReg(SrcReg2, getKillRegState(Src2IsKill))
3282 .addReg(SrcReg0, getKillRegState(Src0IsKill))
3283 .addReg(SrcReg1, getKillRegState(Src1IsKill))
3284 .addImm(MUL->getOperand(3).getImm());
3285 else if (kind == FMAInstKind::Accumulator)
3286 MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg)
3287 .addReg(SrcReg2, getKillRegState(Src2IsKill))
3288 .addReg(SrcReg0, getKillRegState(Src0IsKill))
3289 .addReg(SrcReg1, getKillRegState(Src1IsKill));
3290 else
3291 assert(false && "Invalid FMA instruction kind \n");
3292 // Insert the MADD (MADD, FMA, FMS, FMLA, FMLS)
30113293 InsInstrs.push_back(MIB);
30123294 return MUL;
30133295 }
30953377 Opc = AArch64::MADDXrrr;
30963378 RC = &AArch64::GPR64RegClass;
30973379 }
3098 MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
3380 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
30993381 break;
31003382 case MachineCombinerPattern::MULADDW_OP2:
31013383 case MachineCombinerPattern::MULADDX_OP2:
31103392 Opc = AArch64::MADDXrrr;
31113393 RC = &AArch64::GPR64RegClass;
31123394 }
3113 MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
3395 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
31143396 break;
31153397 case MachineCombinerPattern::MULADDWI_OP1:
31163398 case MachineCombinerPattern::MULADDXI_OP1: {
32023484 Opc = AArch64::MSUBXrrr;
32033485 RC = &AArch64::GPR64RegClass;
32043486 }
3205 MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
3487 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
32063488 break;
32073489 case MachineCombinerPattern::MULSUBWI_OP1:
32083490 case MachineCombinerPattern::MULSUBXI_OP1: {
32443526 InsInstrs.push_back(MIB1);
32453527 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
32463528 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
3529 }
3530 break;
3531 }
3532 // Floating Point Support
3533 case MachineCombinerPattern::FMULADDS_OP1:
3534 case MachineCombinerPattern::FMULADDD_OP1:
3535 // FMUL I=A,B
3536 // FADD R,I,C
3537 // ==> FMADD R,A,B,C
3538 // --- Create(FMADD);
3539 if (Pattern == MachineCombinerPattern::FMULADDS_OP1) {
3540 Opc = AArch64::FMADDSrrr;
3541 RC = &AArch64::FPR32RegClass;
3542 } else {
3543 Opc = AArch64::FMADDDrrr;
3544 RC = &AArch64::FPR64RegClass;
3545 }
3546 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
3547 break;
3548 case MachineCombinerPattern::FMULADDS_OP2:
3549 case MachineCombinerPattern::FMULADDD_OP2:
3550 // FMUL I=A,B
3551 // FADD R,C,I
3552 // ==> FMADD R,A,B,C
3553 // --- Create(FMADD);
3554 if (Pattern == MachineCombinerPattern::FMULADDS_OP2) {
3555 Opc = AArch64::FMADDSrrr;
3556 RC = &AArch64::FPR32RegClass;
3557 } else {
3558 Opc = AArch64::FMADDDrrr;
3559 RC = &AArch64::FPR64RegClass;
3560 }
3561 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
3562 break;
3563
3564 case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
3565 Opc = AArch64::FMLAv1i32_indexed;
3566 RC = &AArch64::FPR32RegClass;
3567 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
3568 FMAInstKind::Indexed);
3569 break;
3570 case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
3571 Opc = AArch64::FMLAv1i32_indexed;
3572 RC = &AArch64::FPR32RegClass;
3573 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
3574 FMAInstKind::Indexed);
3575 break;
3576
3577 case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
3578 Opc = AArch64::FMLAv1i64_indexed;
3579 RC = &AArch64::FPR64RegClass;
3580 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
3581 FMAInstKind::Indexed);
3582 break;
3583 case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
3584 Opc = AArch64::FMLAv1i64_indexed;
3585 RC = &AArch64::FPR64RegClass;
3586 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
3587 FMAInstKind::Indexed);
3588 break;
3589
3590 case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
3591 case MachineCombinerPattern::FMLAv2f32_OP1:
3592 RC = &AArch64::FPR64RegClass;
3593 if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
3594 Opc = AArch64::FMLAv2i32_indexed;
3595 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
3596 FMAInstKind::Indexed);
3597 } else {
3598 Opc = AArch64::FMLAv2f32;
3599 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
3600 FMAInstKind::Accumulator);
3601 }
3602 break;
3603 case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
3604 case MachineCombinerPattern::FMLAv2f32_OP2:
3605 RC = &AArch64::FPR64RegClass;
3606 if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
3607 Opc = AArch64::FMLAv2i32_indexed;
3608 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
3609 FMAInstKind::Indexed);
3610 } else {
3611 Opc = AArch64::FMLAv2f32;
3612 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
3613 FMAInstKind::Accumulator);
3614 }
3615 break;
3616
3617 case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
3618 case MachineCombinerPattern::FMLAv2f64_OP1:
3619 RC = &AArch64::FPR128RegClass;
3620 if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
3621 Opc = AArch64::FMLAv2i64_indexed;
3622 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
3623 FMAInstKind::Indexed);
3624 } else {
3625 Opc = AArch64::FMLAv2f64;
3626 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
3627 FMAInstKind::Accumulator);
3628 }
3629 break;
3630 case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
3631 case MachineCombinerPattern::FMLAv2f64_OP2:
3632 RC = &AArch64::FPR128RegClass;
3633 if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) {
3634 Opc = AArch64::FMLAv2i64_indexed;
3635 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
3636 FMAInstKind::Indexed);
3637 } else {
3638 Opc = AArch64::FMLAv2f64;
3639 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
3640 FMAInstKind::Accumulator);
3641 }
3642 break;
3643
3644 case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
3645 case MachineCombinerPattern::FMLAv4f32_OP1:
3646 RC = &AArch64::FPR128RegClass;
3647 if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) {
3648 Opc = AArch64::FMLAv4i32_indexed;
3649 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
3650 FMAInstKind::Indexed);
3651 } else {
3652 Opc = AArch64::FMLAv4f32;
3653 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
3654 FMAInstKind::Accumulator);
3655 }
3656 break;
3657
3658 case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
3659 case MachineCombinerPattern::FMLAv4f32_OP2:
3660 RC = &AArch64::FPR128RegClass;
3661 if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
3662 Opc = AArch64::FMLAv4i32_indexed;
3663 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
3664 FMAInstKind::Indexed);
3665 } else {
3666 Opc = AArch64::FMLAv4f32;
3667 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
3668 FMAInstKind::Accumulator);
3669 }
3670 break;
3671
3672 case MachineCombinerPattern::FMULSUBS_OP1:
3673 case MachineCombinerPattern::FMULSUBD_OP1: {
3674 // FMUL I=A,B
3675 // FSUB R,I,C
3676 // ==> FNMSUB R,A,B,C // = -C + A*B
3677 // --- Create(FNMSUB);
3678 if (Pattern == MachineCombinerPattern::FMULSUBS_OP1) {
3679 Opc = AArch64::FNMSUBSrrr;
3680 RC = &AArch64::FPR32RegClass;
3681 } else {
3682 Opc = AArch64::FNMSUBDrrr;
3683 RC = &AArch64::FPR64RegClass;
3684 }
3685 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
3686 break;
3687 }
3688 case MachineCombinerPattern::FMULSUBS_OP2:
3689 case MachineCombinerPattern::FMULSUBD_OP2: {
3690 // FMUL I=A,B
3691 // FSUB R,C,I
3692 // ==> FMSUB R,A,B,C (computes C - A*B)
3693 // --- Create(FMSUB);
3694 if (Pattern == MachineCombinerPattern::FMULSUBS_OP2) {
3695 Opc = AArch64::FMSUBSrrr;
3696 RC = &AArch64::FPR32RegClass;
3697 } else {
3698 Opc = AArch64::FMSUBDrrr;
3699 RC = &AArch64::FPR64RegClass;
3700 }
3701 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
3702 break;
3703
3704 case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
3705 Opc = AArch64::FMLSv1i32_indexed;
3706 RC = &AArch64::FPR32RegClass;
3707 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
3708 FMAInstKind::Indexed);
3709 break;
3710
3711 case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
3712 Opc = AArch64::FMLSv1i64_indexed;
3713 RC = &AArch64::FPR64RegClass;
3714 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
3715 FMAInstKind::Indexed);
3716 break;
3717
3718 case MachineCombinerPattern::FMLSv2f32_OP2:
3719 case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
3720 RC = &AArch64::FPR64RegClass;
3721 if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
3722 Opc = AArch64::FMLSv2i32_indexed;
3723 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
3724 FMAInstKind::Indexed);
3725 } else {
3726 Opc = AArch64::FMLSv2f32;
3727 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
3728 FMAInstKind::Accumulator);
3729 }
3730 break;
3731
3732 case MachineCombinerPattern::FMLSv2f64_OP2:
3733 case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
3734 RC = &AArch64::FPR128RegClass;
3735 if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
3736 Opc = AArch64::FMLSv2i64_indexed;
3737 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
3738 FMAInstKind::Indexed);
3739 } else {
3740 Opc = AArch64::FMLSv2f64;
3741 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
3742 FMAInstKind::Accumulator);
3743 }
3744 break;
3745
3746 case MachineCombinerPattern::FMLSv4f32_OP2:
3747 case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
3748 RC = &AArch64::FPR128RegClass;
3749 if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) {
3750 Opc = AArch64::FMLSv4i32_indexed;
3751 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
3752 FMAInstKind::Indexed);
3753 } else {
3754 Opc = AArch64::FMLSv4f32;
3755 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
3756 FMAInstKind::Accumulator);
32473757 }
32483758 break;
32493759 }
173173 unsigned SrcReg2, int CmpMask, int CmpValue,
174174 const MachineRegisterInfo *MRI) const override;
175175 bool optimizeCondBranch(MachineInstr *MI) const override;
176
177 /// Return true when a code sequence can improve throughput. It
178 /// should be called only for instructions in loops.
179 /// \param Pattern - combiner pattern
180 bool isThroughputPattern(MachineCombinerPattern Pattern) const override;
176181 /// Return true when there is potentially a faster code sequence
177182 /// for an instruction chain ending in <Root>. All potential patterns are
178183 /// listed in the <Patterns> array.
5050 }
5151 return SDValue();
5252 }
53 bool AArch64SelectionDAGInfo::GenerateFMAsInMachineCombiner(
54 CodeGenOpt::Level OptLevel) const {
55 return OptLevel >= CodeGenOpt::Aggressive;
58 }
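Any target can opt in through the same hook in its SelectionDAGTargetInfo subclass. A minimal sketch, assuming a hypothetical target that wants delegation already at -O2 (the class name and threshold are assumptions, not part of this patch):

```cpp
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#include "llvm/Support/CodeGen.h"

// Hypothetical target: hand FMA formation to the machine combiner from
// -O2 (CodeGenOpt::Default) upward instead of only at -O3 (Aggressive).
class MyTargetSelectionDAGInfo : public llvm::SelectionDAGTargetInfo {
public:
  bool GenerateFMAsInMachineCombiner(
      llvm::CodeGenOpt::Level OptLevel) const override {
    return OptLevel >= llvm::CodeGenOpt::Default;
  }
};
```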
2424 SDValue Dst, SDValue Src, SDValue Size,
2525 unsigned Align, bool isVolatile,
2626 MachinePointerInfo DstPtrInfo) const override;
27 bool GenerateFMAsInMachineCombiner(CodeGenOpt::Level OptLevel) const override;
2728 };
2829 }
2930
0 ; RUN: llc < %s -O=3 -mtriple=arm64-apple-ios -mcpu=cyclone -enable-unsafe-fp-math | FileCheck %s
1 define void @foo_2d(double* %src) {
2 ; CHECK-LABEL: %entry
3 ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
4 ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
5 entry:
6 %arrayidx1 = getelementptr inbounds double, double* %src, i64 5
7 %arrayidx2 = getelementptr inbounds double, double* %src, i64 11
8 %tmp = bitcast double* %arrayidx1 to <2 x double>*
9 %tmp1 = load double, double* %arrayidx2, align 8
10 %tmp2 = load double, double* %arrayidx1, align 8
11 %fmul = fmul fast double %tmp1, %tmp1
12 %fmul2 = fmul fast double %tmp2, 0x3F94AFD6A052BF5B
13 %fadd = fadd fast double %fmul, %fmul2
14 br label %for.body
15
16 ; CHECK-LABEL: %for.body
17 ; CHECK: fmla.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
18 ; CHECK: fmla.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
19 ; CHECK: fmla.d {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}[0]
20 for.body: ; preds = %for.body, %entry
21 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
22 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
23 %arrayidx3 = getelementptr inbounds double, double* %src, i64 %indvars.iv.next
24 %tmp3 = load double, double* %arrayidx3, align 8
25 %add = fadd fast double %tmp3, %tmp3
26 %mul = fmul fast double %add, %fadd
27 %e1 = insertelement <2 x double> undef, double %add, i32 0
28 %e2 = insertelement <2 x double> %e1, double %add, i32 1
29 %add2 = fadd fast <2 x double> %e2,
30 %e3 = insertelement <2 x double> undef, double %mul, i32 0
31 %e4 = insertelement <2 x double> %e3, double %mul, i32 1
32 %mul2 = fmul fast <2 x double> %add2,
33 %e5 = insertelement <2 x double> undef, double %add, i32 0
34 %e6 = insertelement <2 x double> %e5, double %add, i32 1
35 %add3 = fadd fast <2 x double> %mul2,
36 %mulx = fmul fast <2 x double> %add2, %e2
37 %addx = fadd fast <2 x double> %mulx, %e4
38 %e7 = insertelement <2 x double> undef, double %mul, i32 0
39 %e8 = insertelement <2 x double> %e7, double %mul, i32 1
40 %e9 = fmul fast <2 x double> %addx, %add3
41 store <2 x double> %e9, <2 x double>* %tmp, align 8
42 %e10 = extractelement <2 x double> %add3, i32 0
43 %mul3 = fmul fast double %mul, %e10
44 %add4 = fadd fast double %mul3, %mul
45 store double %add4, double* %arrayidx2, align 8
46 %exitcond = icmp eq i64 %indvars.iv.next, 25
47 br i1 %exitcond, label %for.end, label %for.body
48
49 for.end: ; preds = %for.body
50 ret void
51 }
52 define void @foo_2s(float* %src) {
53 entry:
54 %arrayidx1 = getelementptr inbounds float, float* %src, i64 5
55 %arrayidx2 = getelementptr inbounds float, float* %src, i64 11
56 %tmp = bitcast float* %arrayidx1 to <2 x float>*
57 br label %for.body
58
59 ; CHECK-LABEL: %for.body
60 ; CHECK: fmla.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
61 ; CHECK: fmla.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
62 ; CHECK: fmla.s {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}[0]
63 for.body: ; preds = %for.body, %entry
64 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
65 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
66 %arrayidx3 = getelementptr inbounds float, float* %src, i64 %indvars.iv.next
67 %tmp1 = load float, float* %arrayidx3, align 8
68 %add = fadd fast float %tmp1, %tmp1
69 %mul = fmul fast float %add, %add
70 %e1 = insertelement <2 x float> undef, float %add, i32 0
71 %e2 = insertelement <2 x float> %e1, float %add, i32 1
72 %add2 = fadd fast <2 x float> %e2,
73 %e3 = insertelement <2 x float> undef, float %mul, i32 0
74 %e4 = insertelement <2 x float> %e3, float %mul, i32 1
75 %mul2 = fmul fast <2 x float> %add2,
76 %e5 = insertelement <2 x float> undef, float %add, i32 0
77 %e6 = insertelement <2 x float> %e5, float %add, i32 1
78 %add3 = fadd fast <2 x float> %mul2,
79 %mulx = fmul fast <2 x float> %add2, %e2
80 %addx = fadd fast <2 x float> %mulx, %e4
81 %e7 = insertelement <2 x float> undef, float %mul, i32 0
82 %e8 = insertelement <2 x float> %e7, float %mul, i32 1
83 %e9 = fmul fast <2 x float> %addx, %add3
84 store <2 x float> %e9, <2 x float>* %tmp, align 8
85 %e10 = extractelement <2 x float> %add3, i32 0
86 %mul3 = fmul fast float %mul, %e10
87 %add4 = fadd fast float %mul3, %mul
88 store float %add4, float* %arrayidx2, align 8
89 %exitcond = icmp eq i64 %indvars.iv.next, 25
90 br i1 %exitcond, label %for.end, label %for.body
91
92 for.end: ; preds = %for.body
93 ret void
94 }
95 define void @foo_4s(float* %src) {
96 entry:
97 %arrayidx1 = getelementptr inbounds float, float* %src, i64 5
98 %arrayidx2 = getelementptr inbounds float, float* %src, i64 11
99 %tmp = bitcast float* %arrayidx1 to <4 x float>*
100 br label %for.body
101
102 ; CHECK-LABEL: %for.body
103 ; CHECK: fmla.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
104 ; CHECK: fmla.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
105 for.body: ; preds = %for.body, %entry
106 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
107 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
108 %arrayidx3 = getelementptr inbounds float, float* %src, i64 %indvars.iv.next
109 %tmp1 = load float, float* %arrayidx3, align 8
110 %add = fadd fast float %tmp1, %tmp1
111 %mul = fmul fast float %add, %add
112 %e1 = insertelement <4 x float> undef, float %add, i32 0
113 %e2 = insertelement <4 x float> %e1, float %add, i32 1
114 %add2 = fadd fast <4 x float> %e2,
115 %e3 = insertelement <4 x float> undef, float %mul, i32 0
116 %e4 = insertelement <4 x float> %e3, float %mul, i32 1
117 %mul2 = fmul fast <4 x float> %add2,
118 %e5 = insertelement <4 x float> undef, float %add, i32 0
119 %e6 = insertelement <4 x float> %e5, float %add, i32 1
120 %add3 = fadd fast <4 x float> %mul2,
121 %mulx = fmul fast <4 x float> %add2, %e2
122 %addx = fadd fast <4 x float> %mulx, %e4
123 %e7 = insertelement <4 x float> undef, float %mul, i32 0
124 %e8 = insertelement <4 x float> %e7, float %mul, i32 1
125 %e9 = fmul fast <4 x float> %addx, %add3
126 store <4 x float> %e9, <4 x float>* %tmp, align 8
127 %e10 = extractelement <4 x float> %add3, i32 0
128 %mul3 = fmul fast float %mul, %e10
129 store float %mul3, float* %arrayidx2, align 8
130 %exitcond = icmp eq i64 %indvars.iv.next, 25
131 br i1 %exitcond, label %for.end, label %for.body
132
133 for.end: ; preds = %for.body
134 ret void
135 }
0 ; RUN: llc < %s -O=3 -mtriple=arm64-apple-ios -mcpu=cyclone -enable-unsafe-fp-math | FileCheck %s
1 define void @foo_2d(double* %src) {
2 entry:
3 %arrayidx1 = getelementptr inbounds double, double* %src, i64 5
4 %arrayidx2 = getelementptr inbounds double, double* %src, i64 11
5 %tmp = bitcast double* %arrayidx1 to <2 x double>*
6 br label %for.body
7
8 ; CHECK-LABEL: %for.body
9 ; CHECK: fmls.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
10 ; CHECK: fmls.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
11 ; CHECK: fmls.d {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}[0]
12 for.body: ; preds = %for.body, %entry
13 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
14 %indvars.iv.next = sub nuw nsw i64 %indvars.iv, 1
15 %arrayidx3 = getelementptr inbounds double, double* %src, i64 %indvars.iv.next
16 %tmp1 = load double, double* %arrayidx3, align 8
17 %add = fadd fast double %tmp1, %tmp1
18 %mul = fmul fast double %add, %add
19 %e1 = insertelement <2 x double> undef, double %add, i32 0
20 %e2 = insertelement <2 x double> %e1, double %add, i32 1
21 %sub2 = fsub fast <2 x double> %e2,
22 %e3 = insertelement <2 x double> undef, double %mul, i32 0
23 %e4 = insertelement <2 x double> %e3, double %mul, i32 1
24 %mul2 = fmul fast <2 x double> %sub2,
25 %e5 = insertelement <2 x double> undef, double %add, i32 0
26 %e6 = insertelement <2 x double> %e5, double %add, i32 1
27 %sub3 = fsub fast <2 x double> , %mul2
28 %mulx = fmul fast <2 x double> %sub2, %e2
29 %subx = fsub fast <2 x double> %e4, %mulx
30 %e7 = insertelement <2 x double> undef, double %mul, i32 0
31 %e8 = insertelement <2 x double> %e7, double %mul, i32 1
32 %e9 = fmul fast <2 x double> %subx, %sub3
33 store <2 x double> %e9, <2 x double>* %tmp, align 8
34 %e10 = extractelement <2 x double> %sub3, i32 0
35 %mul3 = fmul fast double %mul, %e10
36 %sub4 = fsub fast double %mul, %mul3
37 store double %sub4, double* %arrayidx2, align 8
38 %exitcond = icmp eq i64 %indvars.iv.next, 25
39 br i1 %exitcond, label %for.end, label %for.body
40
41 for.end: ; preds = %for.body
42 ret void
43 }
44 define void @foo_2s(float* %src) {
45 entry:
46 %arrayidx1 = getelementptr inbounds float, float* %src, i64 5
47 %arrayidx2 = getelementptr inbounds float, float* %src, i64 11
48 %tmp = bitcast float* %arrayidx1 to <2 x float>*
49 br label %for.body
50
51 ; CHECK-LABEL: %for.body
52 ; CHECK: fmls.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
53 ; CHECK: fmls.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
54 ; CHECK: fmls.s {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}[0]
55 for.body: ; preds = %for.body, %entry
56 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
57 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
58 %arrayidx3 = getelementptr inbounds float, float* %src, i64 %indvars.iv.next
59 %tmp1 = load float, float* %arrayidx3, align 8
60 %add = fadd fast float %tmp1, %tmp1
61 %mul = fmul fast float %add, %add
62 %e1 = insertelement <2 x float> undef, float %add, i32 0
63 %e2 = insertelement <2 x float> %e1, float %add, i32 1
64 %add2 = fsub fast <2 x float> %e2,
65 %e3 = insertelement <2 x float> undef, float %mul, i32 0
66 %e4 = insertelement <2 x float> %e3, float %mul, i32 1
67 %mul2 = fmul fast <2 x float> %add2,
68 %e5 = insertelement <2 x float> undef, float %add, i32 0
69 %e6 = insertelement <2 x float> %e5, float %add, i32 1
70 %add3 = fsub fast <2 x float> , %mul2
71 %mulx = fmul fast <2 x float> %add2, %e2
72 %addx = fsub fast <2 x float> %e4, %mulx
73 %e7 = insertelement <2 x float> undef, float %mul, i32 0
74 %e8 = insertelement <2 x float> %e7, float %mul, i32 1
75 %e9 = fmul fast <2 x float> %addx, %add3
76 store <2 x float> %e9, <2 x float>* %tmp, align 8
77 %e10 = extractelement <2 x float> %add3, i32 0
78 %mul3 = fmul fast float %mul, %e10
79 %add4 = fsub fast float %mul, %mul3
80 store float %add4, float* %arrayidx2, align 8
81 %exitcond = icmp eq i64 %indvars.iv.next, 25
82 br i1 %exitcond, label %for.end, label %for.body
83
84 for.end: ; preds = %for.body
85 ret void
86 }
87 define void @foo_4s(float* %src) {
88 entry:
89 %arrayidx1 = getelementptr inbounds float, float* %src, i64 5
90 %arrayidx2 = getelementptr inbounds float, float* %src, i64 11
91 %tmp = bitcast float* %arrayidx1 to <4 x float>*
92 br label %for.body
93
94 ; CHECK-LABEL: %for.body
95 ; CHECK: fmls.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
96 ; CHECK: fmls.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
97 for.body: ; preds = %for.body, %entry
98 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
99 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
100 %arrayidx3 = getelementptr inbounds float, float* %src, i64 %indvars.iv.next
101 %tmp1 = load float, float* %arrayidx3, align 8
102 %add = fadd fast float %tmp1, %tmp1
103 %mul = fmul fast float %add, %add
104 %e1 = insertelement <4 x float> undef, float %add, i32 0
105 %e2 = insertelement <4 x float> %e1, float %add, i32 1
106 %add2 = fadd fast <4 x float> %e2,
107 %e3 = insertelement <4 x float> undef, float %mul, i32 0
108 %e4 = insertelement <4 x float> %e3, float %mul, i32 1
109 %mul2 = fmul fast <4 x float> %add2,
110 %e5 = insertelement <4 x float> undef, float %add, i32 0
111 %e6 = insertelement <4 x float> %e5, float %add, i32 1
112 %add3 = fsub fast <4 x float> , %mul2
113 %mulx = fmul fast <4 x float> %add2, %e2
114 %addx = fsub fast <4 x float> %e4, %mulx
115 %e7 = insertelement <4 x float> undef, float %mul, i32 0
116 %e8 = insertelement <4 x float> %e7, float %mul, i32 1
117 %e9 = fmul fast <4 x float> %addx, %add3
118 store <4 x float> %e9, <4 x float>* %tmp, align 8
119 %e10 = extractelement <4 x float> %add3, i32 0
120 %mul3 = fmul fast float %mul, %e10
121 store float %mul3, float* %arrayidx2, align 8
122 %exitcond = icmp eq i64 %indvars.iv.next, 25
123 br i1 %exitcond, label %for.end, label %for.body
124
125 for.end: ; preds = %for.body
126 ret void
127 }