llvm.org GIT mirror llvm / 827307b
Use PPC reciprocal estimates with Newton iteration in fast-math mode When unsafe FP math operations are enabled, we can use the fre[s] and frsqrte[s] instructions, which generate reciprocal (sqrt) estimates, together with some Newton iteration, in order to quickly generate floating-point division and sqrt results. All of these instructions are separately optional, and so each has its own feature flag (except for the Altivec instructions, which are covered under the existing Altivec flag). Doing this is not only faster than using the IEEE-compliant fdiv/fsqrt instructions, but allows these computations to be pipelined with other computations in order to hide their overall latency. I've also added a couple of fnmsub patterns which turned out to be missing (but are necessary for good code generation of the Newton iterations). Altivec needs a similar fix, but that will probably be more complicated because fneg is expanded for Altivec's v4f32. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@178617 91177308-0d34-0410-b5e6-96231b3b80d8 Hal Finkel 7 years ago
8 changed file(s) with 501 addition(s) and 32 deletion(s). Raw diff Collapse all Expand all
5656 "Enable the MFOCRF instruction">;
5757 def FeatureFSqrt : SubtargetFeature<"fsqrt","HasFSQRT", "true",
5858 "Enable the fsqrt instruction">;
59 def FeatureFRE : SubtargetFeature<"fre", "HasFRE", "true",
60 "Enable the fre instruction">;
61 def FeatureFRES : SubtargetFeature<"fres", "HasFRES", "true",
62 "Enable the fres instruction">;
63 def FeatureFRSQRTE : SubtargetFeature<"frsqrte", "HasFRSQRTE", "true",
64 "Enable the frsqrte instruction">;
65 def FeatureFRSQRTES : SubtargetFeature<"frsqrtes", "HasFRSQRTES", "true",
66 "Enable the frsqrtes instruction">;
67 def FeatureRecipPrec : SubtargetFeature<"recipprec", "HasRecipPrec", "true",
68 "Assume higher precision reciprocal estimates">;
5969 def FeatureSTFIWX : SubtargetFeature<"stfiwx","HasSTFIWX", "true",
6070 "Enable the stfiwx instruction">;
6171 def FeatureLFIWAX : SubtargetFeature<"lfiwax","HasLFIWAX", "true",
100110
101111 def : Processor<"generic", G3Itineraries, [Directive32]>;
102112 def : Processor<"440", PPC440Itineraries, [Directive440, FeatureISEL,
113 FeatureFRES, FeatureFRSQRTE,
103114 FeatureBookE]>;
104115 def : Processor<"450", PPC440Itineraries, [Directive440, FeatureISEL,
116 FeatureFRES, FeatureFRSQRTE,
105117 FeatureBookE]>;
106118 def : Processor<"601", G3Itineraries, [Directive601]>;
107119 def : Processor<"602", G3Itineraries, [Directive602]>;
108 def : Processor<"603", G3Itineraries, [Directive603]>;
109 def : Processor<"603e", G3Itineraries, [Directive603]>;
110 def : Processor<"603ev", G3Itineraries, [Directive603]>;
111 def : Processor<"604", G3Itineraries, [Directive604]>;
112 def : Processor<"604e", G3Itineraries, [Directive604]>;
113 def : Processor<"620", G3Itineraries, [Directive620]>;
114 def : Processor<"750", G4Itineraries, [Directive750]>;
115 def : Processor<"g3", G3Itineraries, [Directive750]>;
116 def : Processor<"7400", G4Itineraries, [Directive7400, FeatureAltivec]>;
117 def : Processor<"g4", G4Itineraries, [Directive7400, FeatureAltivec]>;
118 def : Processor<"7450", G4PlusItineraries, [Directive7400, FeatureAltivec]>;
119 def : Processor<"g4+", G4PlusItineraries, [Directive7400, FeatureAltivec]>;
120 def : Processor<"603", G3Itineraries, [Directive603,
121 FeatureFRES, FeatureFRSQRTE]>;
122 def : Processor<"603e", G3Itineraries, [Directive603,
123 FeatureFRES, FeatureFRSQRTE]>;
124 def : Processor<"603ev", G3Itineraries, [Directive603,
125 FeatureFRES, FeatureFRSQRTE]>;
126 def : Processor<"604", G3Itineraries, [Directive604,
127 FeatureFRES, FeatureFRSQRTE]>;
128 def : Processor<"604e", G3Itineraries, [Directive604,
129 FeatureFRES, FeatureFRSQRTE]>;
130 def : Processor<"620", G3Itineraries, [Directive620,
131 FeatureFRES, FeatureFRSQRTE]>;
132 def : Processor<"750", G4Itineraries, [Directive750,
133 FeatureFRES, FeatureFRSQRTE]>;
134 def : Processor<"g3", G3Itineraries, [Directive750,
135 FeatureFRES, FeatureFRSQRTE]>;
136 def : Processor<"7400", G4Itineraries, [Directive7400, FeatureAltivec,
137 FeatureFRES, FeatureFRSQRTE]>;
138 def : Processor<"g4", G4Itineraries, [Directive7400, FeatureAltivec,
139 FeatureFRES, FeatureFRSQRTE]>;
140 def : Processor<"7450", G4PlusItineraries, [Directive7400, FeatureAltivec,
141 FeatureFRES, FeatureFRSQRTE]>;
142 def : Processor<"g4+", G4PlusItineraries, [Directive7400, FeatureAltivec,
143 FeatureFRES, FeatureFRSQRTE]>;
120144 def : Processor<"970", G5Itineraries,
121145 [Directive970, FeatureAltivec,
122 FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX,
146 FeatureMFOCRF, FeatureFSqrt,
147 FeatureFRES, FeatureFRSQRTE, FeatureSTFIWX,
123148 Feature64Bit /*, Feature64BitRegs */]>;
124149 def : Processor<"g5", G5Itineraries,
125150 [Directive970, FeatureAltivec,
126151 FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX,
152 FeatureFRES, FeatureFRSQRTE,
127153 Feature64Bit /*, Feature64BitRegs */]>;
128154 def : ProcessorModel<"e500mc", PPCE500mcModel,
129155 [DirectiveE500mc, FeatureMFOCRF,
133159 FeatureSTFIWX, FeatureBookE, FeatureISEL]>;
134160 def : Processor<"a2", PPCA2Itineraries,
135161 [DirectiveA2, FeatureBookE, FeatureMFOCRF,
136 FeatureFSqrt, FeatureSTFIWX, FeatureLFIWAX,
162 FeatureFSqrt, FeatureFRE, FeatureFRES,
163 FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec,
164 FeatureSTFIWX, FeatureLFIWAX,
137165 FeatureFPRND, FeatureFPCVT, FeatureISEL,
138166 FeaturePOPCNTD, FeatureLDBRX, Feature64Bit
139167 /*, Feature64BitRegs */]>;
140168 def : Processor<"a2q", PPCA2Itineraries,
141169 [DirectiveA2, FeatureBookE, FeatureMFOCRF,
142 FeatureFSqrt, FeatureSTFIWX, FeatureLFIWAX,
170 FeatureFSqrt, FeatureFRE, FeatureFRES,
171 FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec,
172 FeatureSTFIWX, FeatureLFIWAX,
143173 FeatureFPRND, FeatureFPCVT, FeatureISEL,
144174 FeaturePOPCNTD, FeatureLDBRX, Feature64Bit
145175 /*, Feature64BitRegs */, FeatureQPX]>;
146176 def : Processor<"pwr3", G5Itineraries,
147 [DirectivePwr3, FeatureAltivec, FeatureMFOCRF,
177 [DirectivePwr3, FeatureAltivec,
178 FeatureFRES, FeatureFRSQRTE, FeatureMFOCRF,
148179 FeatureSTFIWX, Feature64Bit]>;
149180 def : Processor<"pwr4", G5Itineraries,
150181 [DirectivePwr4, FeatureAltivec, FeatureMFOCRF,
151 FeatureFSqrt, FeatureSTFIWX, Feature64Bit]>;
182 FeatureFSqrt, FeatureFRES, FeatureFRSQRTE,
183 FeatureSTFIWX, Feature64Bit]>;
152184 def : Processor<"pwr5", G5Itineraries,
153185 [DirectivePwr5, FeatureAltivec, FeatureMFOCRF,
154 FeatureFSqrt, FeatureSTFIWX, Feature64Bit]>;
186 FeatureFSqrt, FeatureFRE, FeatureFRES,
187 FeatureFRSQRTE, FeatureFRSQRTES,
188 FeatureSTFIWX, Feature64Bit]>;
155189 def : Processor<"pwr5x", G5Itineraries,
156190 [DirectivePwr5x, FeatureAltivec, FeatureMFOCRF,
157 FeatureFSqrt, FeatureSTFIWX, FeatureFPRND,
158 Feature64Bit]>;
191 FeatureFSqrt, FeatureFRE, FeatureFRES,
192 FeatureFRSQRTE, FeatureFRSQRTES,
193 FeatureSTFIWX, FeatureFPRND, Feature64Bit]>;
159194 def : Processor<"pwr6", G5Itineraries,
160195 [DirectivePwr6, FeatureAltivec,
161 FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX,
162 FeatureLFIWAX, FeatureFPRND, Feature64Bit
163 /*, Feature64BitRegs */]>;
196 FeatureMFOCRF, FeatureFSqrt, FeatureFRE,
197 FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES,
198 FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX,
199 FeatureFPRND, Feature64Bit /*, Feature64BitRegs */]>;
164200 def : Processor<"pwr6x", G5Itineraries,
165201 [DirectivePwr5x, FeatureAltivec, FeatureMFOCRF,
166 FeatureFSqrt, FeatureSTFIWX, FeatureLFIWAX,
202 FeatureFSqrt, FeatureFRE, FeatureFRES,
203 FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec,
204 FeatureSTFIWX, FeatureLFIWAX,
167205 FeatureFPRND, Feature64Bit]>;
168206 def : Processor<"pwr7", G5Itineraries,
169207 [DirectivePwr7, FeatureAltivec,
170 FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX,
171 FeatureLFIWAX, FeatureFPRND, FeatureFPCVT,
172 FeatureISEL, FeaturePOPCNTD, FeatureLDBRX,
208 FeatureMFOCRF, FeatureFSqrt, FeatureFRE,
209 FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES,
210 FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX,
211 FeatureFPRND, FeatureFPCVT, FeatureISEL,
212 FeaturePOPCNTD, FeatureLDBRX,
173213 Feature64Bit /*, Feature64BitRegs */]>;
174214 def : Processor<"ppc", G3Itineraries, [Directive32]>;
175215 def : Processor<"ppc64", G5Itineraries,
149149 setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
150150
151151 // If we're enabling GP optimizations, use hardware square root
152 if (!Subtarget->hasFSQRT()) {
152 if (!Subtarget->hasFSQRT() &&
153 !(TM.Options.UnsafeFPMath &&
154 Subtarget->hasFRSQRTE() && Subtarget->hasFRE()))
153155 setOperationAction(ISD::FSQRT, MVT::f64, Expand);
156
157 if (!Subtarget->hasFSQRT() &&
158 !(TM.Options.UnsafeFPMath &&
159 Subtarget->hasFRSQRTES() && Subtarget->hasFRES()))
154160 setOperationAction(ISD::FSQRT, MVT::f32, Expand);
155 }
156161
157162 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
158163 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
468473
469474 setOperationAction(ISD::MUL, MVT::v4f32, Legal);
470475 setOperationAction(ISD::FMA, MVT::v4f32, Legal);
476
477 if (TM.Options.UnsafeFPMath) {
478 setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
479 setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
480 }
481
471482 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
472483 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
473484 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
517528 setTargetDAGCombine(ISD::STORE);
518529 setTargetDAGCombine(ISD::BR_CC);
519530 setTargetDAGCombine(ISD::BSWAP);
531
532 // Use reciprocal estimates.
533 if (TM.Options.UnsafeFPMath) {
534 setTargetDAGCombine(ISD::FDIV);
535 setTargetDAGCombine(ISD::FSQRT);
536 }
520537
521538 // Darwin long double math library functions have $LDBL128 appended.
522539 if (Subtarget->isDarwin()) {
589606 case PPCISD::FCFID: return "PPCISD::FCFID";
590607 case PPCISD::FCTIDZ: return "PPCISD::FCTIDZ";
591608 case PPCISD::FCTIWZ: return "PPCISD::FCTIWZ";
609 case PPCISD::FRE: return "PPCISD::FRE";
610 case PPCISD::FRSQRTE: return "PPCISD::FRSQRTE";
592611 case PPCISD::STFIWX: return "PPCISD::STFIWX";
593612 case PPCISD::VMADDFP: return "PPCISD::VMADDFP";
594613 case PPCISD::VNMSUBFP: return "PPCISD::VNMSUBFP";
66576676 // Target Optimization Hooks
66586677 //===----------------------------------------------------------------------===//
66596678
6679 SDValue PPCTargetLowering::DAGCombineFastRecip(SDNode *N,
6680 DAGCombinerInfo &DCI,
6681 bool UseOperand) const {
6682 if (DCI.isAfterLegalizeVectorOps())
6683 return SDValue();
6684
6685 if ((N->getValueType(0) == MVT::f32 && PPCSubTarget.hasFRES()) ||
6686 (N->getValueType(0) == MVT::f64 && PPCSubTarget.hasFRE()) ||
6687 (N->getValueType(0) == MVT::v4f32 && PPCSubTarget.hasAltivec())) {
6688
6689 // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
6690 // For the reciprocal, we need to find the zero of the function:
6691 // F(X) = A X - 1 [which has a zero at X = 1/A]
6692 // =>
6693 // X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form
6694 // does not require additional intermediate precision]
6695
6696 // Convergence is quadratic, so we essentially double the number of digits
6697 // correct after every iteration. The minimum architected relative
6698 // accuracy is 2^-5. When hasRecipPrec(), this is 2^-14. IEEE float has
6699 // 23 digits and double has 52 digits.
6700 int Iterations = PPCSubTarget.hasRecipPrec() ? 1 : 3;
6701 if (N->getValueType(0).getScalarType() == MVT::f64)
6702 ++Iterations;
6703
6704 SelectionDAG &DAG = DCI.DAG;
6705 DebugLoc dl = N->getDebugLoc();
6706
6707 SDValue FPOne =
6708 DAG.getConstantFP(1.0, N->getValueType(0).getScalarType());
6709 if (N->getValueType(0).isVector()) {
6710 assert(N->getValueType(0).getVectorNumElements() == 4 &&
6711 "Unknown vector type");
6712 FPOne = DAG.getNode(ISD::BUILD_VECTOR, dl, N->getValueType(0),
6713 FPOne, FPOne, FPOne, FPOne);
6714 }
6715
6716 SDValue Est = DAG.getNode(PPCISD::FRE, dl,
6717 N->getValueType(0),
6718 UseOperand ? N->getOperand(1) :
6719 SDValue(N, 0));
6720 DCI.AddToWorklist(Est.getNode());
6721
6722 // Newton iterations: Est = Est + Est (1 - Arg * Est)
6723 for (int i = 0; i < Iterations; ++i) {
6724 SDValue NewEst = DAG.getNode(ISD::FMUL, dl,
6725 N->getValueType(0),
6726 UseOperand ? N->getOperand(1) :
6727 SDValue(N, 0),
6728 Est);
6729 DCI.AddToWorklist(NewEst.getNode());
6730
6731 NewEst = DAG.getNode(ISD::FSUB, dl,
6732 N->getValueType(0), FPOne, NewEst);
6733 DCI.AddToWorklist(NewEst.getNode());
6734
6735 NewEst = DAG.getNode(ISD::FMUL, dl,
6736 N->getValueType(0), Est, NewEst);
6737 DCI.AddToWorklist(NewEst.getNode());
6738
6739 Est = DAG.getNode(ISD::FADD, dl,
6740 N->getValueType(0), Est, NewEst);
6741 DCI.AddToWorklist(Est.getNode());
6742 }
6743
6744 return Est;
6745 }
6746
6747 return SDValue();
6748 }
6749
6750 SDValue PPCTargetLowering::DAGCombineFastRecipFSQRT(SDNode *N,
6751 DAGCombinerInfo &DCI) const {
6752 if (DCI.isAfterLegalizeVectorOps())
6753 return SDValue();
6754
6755 if ((N->getValueType(0) == MVT::f32 && PPCSubTarget.hasFRSQRTES()) ||
6756 (N->getValueType(0) == MVT::f64 && PPCSubTarget.hasFRSQRTE()) ||
6757 (N->getValueType(0) == MVT::v4f32 && PPCSubTarget.hasAltivec())) {
6758
6759 // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
6760 // For the reciprocal sqrt, we need to find the zero of the function:
6761 // F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
6762 // =>
6763 // X_{i+1} = X_i (1.5 - A X_i^2 / 2)
6764 // As a result, we precompute A/2 prior to the iteration loop.
6765
6766 // Convergence is quadratic, so we essentially double the number of digits
6767 // correct after every iteration. The minimum architected relative
6768 // accuracy is 2^-5. When hasRecipPrec(), this is 2^-14. IEEE float has
6769 // 23 digits and double has 52 digits.
6770 int Iterations = PPCSubTarget.hasRecipPrec() ? 1 : 3;
6771 if (N->getValueType(0).getScalarType() == MVT::f64)
6772 ++Iterations;
6773
6774 SelectionDAG &DAG = DCI.DAG;
6775 DebugLoc dl = N->getDebugLoc();
6776
6777 SDValue FPThreeHalfs =
6778 DAG.getConstantFP(1.5, N->getValueType(0).getScalarType());
6779 if (N->getValueType(0).isVector()) {
6780 assert(N->getValueType(0).getVectorNumElements() == 4 &&
6781 "Unknown vector type");
6782 FPThreeHalfs = DAG.getNode(ISD::BUILD_VECTOR, dl, N->getValueType(0),
6783 FPThreeHalfs, FPThreeHalfs,
6784 FPThreeHalfs, FPThreeHalfs);
6785 }
6786
6787 SDValue Est = DAG.getNode(PPCISD::FRSQRTE, dl,
6788 N->getValueType(0), N->getOperand(0));
6789 DCI.AddToWorklist(Est.getNode());
6790
6791 // We now need 0.5*Arg which we can write as (1.5*Arg - Arg) so that
6792 // this entire sequence requires only one FP constant.
6793 SDValue HalfArg = DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
6794 FPThreeHalfs, N->getOperand(0));
6795 DCI.AddToWorklist(HalfArg.getNode());
6796
6797 HalfArg = DAG.getNode(ISD::FSUB, dl, N->getValueType(0),
6798 HalfArg, N->getOperand(0));
6799 DCI.AddToWorklist(HalfArg.getNode());
6800
6801 // Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est)
6802 for (int i = 0; i < Iterations; ++i) {
6803 SDValue NewEst = DAG.getNode(ISD::FMUL, dl,
6804 N->getValueType(0), Est, Est);
6805 DCI.AddToWorklist(NewEst.getNode());
6806
6807 NewEst = DAG.getNode(ISD::FMUL, dl,
6808 N->getValueType(0), HalfArg, NewEst);
6809 DCI.AddToWorklist(NewEst.getNode());
6810
6811 NewEst = DAG.getNode(ISD::FSUB, dl,
6812 N->getValueType(0), FPThreeHalfs, NewEst);
6813 DCI.AddToWorklist(NewEst.getNode());
6814
6815 Est = DAG.getNode(ISD::FMUL, dl,
6816 N->getValueType(0), Est, NewEst);
6817 DCI.AddToWorklist(Est.getNode());
6818 }
6819
6820 return Est;
6821 }
6822
6823 return SDValue();
6824 }
6825
66606826 SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
66616827 DAGCombinerInfo &DCI) const {
66626828 const TargetMachine &TM = getTargetMachine();
66836849 return N->getOperand(0);
66846850 }
66856851 break;
6686
6852 case ISD::FDIV: {
6853 assert(TM.Options.UnsafeFPMath &&
6854 "Reciprocal estimates require UnsafeFPMath");
6855
6856 if (N->getOperand(1).getOpcode() == ISD::FSQRT) {
6857 SDValue RV = DAGCombineFastRecipFSQRT(N->getOperand(1).getNode(), DCI);
6858 if (RV.getNode() != 0) {
6859 DCI.AddToWorklist(RV.getNode());
6860 return DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
6861 N->getOperand(0), RV);
6862 }
6863 }
6864
6865 SDValue RV = DAGCombineFastRecip(N, DCI);
6866 if (RV.getNode() != 0) {
6867 DCI.AddToWorklist(RV.getNode());
6868 return DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
6869 N->getOperand(0), RV);
6870 }
6871
6872 }
6873 break;
6874 case ISD::FSQRT: {
6875 assert(TM.Options.UnsafeFPMath &&
6876 "Reciprocal estimates require UnsafeFPMath");
6877
6878 // Compute this as 1/(1/sqrt(X)), which is the reciprocal of the
6879 // reciprocal sqrt.
6880 SDValue RV = DAGCombineFastRecipFSQRT(N, DCI);
6881 if (RV.getNode() != 0) {
6882 DCI.AddToWorklist(RV.getNode());
6883 RV = DAGCombineFastRecip(RV.getNode(), DCI, false);
6884 if (RV.getNode() != 0)
6885 return RV;
6886 }
6887
6888 }
6889 break;
66876890 case ISD::SINT_TO_FP:
66886891 if (TM.getSubtarget().has64BitSupport()) {
66896892 if (N->getOperand(0).getOpcode() == ISD::FP_TO_SINT) {
4747 /// Newer FCTI[D,W]UZ floating-point-to-integer conversion instructions for
4848 /// unsigned integers.
4949 FCTIDUZ, FCTIWUZ,
50
51 /// Reciprocal estimate instructions (unary FP ops).
52 FRE, FRSQRTE,
5053
5154 // VMADDFP, VNMSUBFP - The VMADDFP and VNMSUBFP instructions, taking
5255 // three v4f32 operands and producing a v4f32 result.
619622
620623 SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
621624 SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
625
626 SDValue DAGCombineFastRecip(SDNode *N, DAGCombinerInfo &DCI,
627 bool UseOperand = true) const;
628 SDValue DAGCombineFastRecipFSQRT(SDNode *N, DAGCombinerInfo &DCI) const;
622629 };
623630 }
624631
794794 def : Pat<(PPCvperm (v16i8 VRRC:$vA), VRRC:$vB, VRRC:$vC),
795795 (VPERM $vA, $vB, $vC)>;
796796
797 def : Pat<(PPCfre v4f32:$A), (VREFP $A)>;
798 def : Pat<(PPCfrsqrte v4f32:$A), (VRSQRTEFP $A)>;
799
797800 // Vector shifts
798801 def : Pat<(v16i8 (shl v16i8:$vA, v16i8:$vB)),
799802 (v16i8 (VSLB $vA, $vB))>;
6060 //===----------------------------------------------------------------------===//
6161 // PowerPC specific DAG Nodes.
6262 //
63
64 def PPCfre : SDNode<"PPCISD::FRE", SDTFPUnaryOp, []>;
65 def PPCfrsqrte: SDNode<"PPCISD::FRSQRTE", SDTFPUnaryOp, []>;
6366
6467 def PPCfcfid : SDNode<"PPCISD::FCFID", SDTFPUnaryOp, []>;
6568 def PPCfcfidu : SDNode<"PPCISD::FCFIDU", SDTFPUnaryOp, []>;
12221225 def FNEGD : XForm_26<63, 40, (outs F8RC:$frD), (ins F8RC:$frB),
12231226 "fneg $frD, $frB", FPGeneral,
12241227 [(set f64:$frD, (fneg f64:$frB))]>;
1225 }
1226
1228
1229 // Reciprocal estimates.
1230 def FRE : XForm_26<63, 24, (outs F8RC:$frD), (ins F8RC:$frB),
1231 "fre $frD, $frB", FPGeneral,
1232 [(set f64:$frD, (PPCfre f64:$frB))]>;
1233 def FRES : XForm_26<59, 24, (outs F4RC:$frD), (ins F4RC:$frB),
1234 "fres $frD, $frB", FPGeneral,
1235 [(set f32:$frD, (PPCfre f32:$frB))]>;
1236 def FRSQRTE : XForm_26<63, 26, (outs F8RC:$frD), (ins F8RC:$frB),
1237 "frsqrte $frD, $frB", FPGeneral,
1238 [(set f64:$frD, (PPCfrsqrte f64:$frB))]>;
1239 def FRSQRTES : XForm_26<59, 26, (outs F4RC:$frD), (ins F4RC:$frB),
1240 "frsqrtes $frD, $frB", FPGeneral,
1241 [(set f32:$frD, (PPCfrsqrte f32:$frB))]>;
1242 }
12271243
12281244 // XL-Form instructions. condition register logical ops.
12291245 //
16861702
16871703 def : Pat<(atomic_fence (imm), (imm)), (SYNC)>;
16881704
1705 // Additional FNMSUB patterns: -a*c + b == -(a*c - b)
1706 def : Pat<(fma (fneg f64:$A), f64:$C, f64:$B),
1707 (FNMSUB $A, $C, $B)>;
1708 def : Pat<(fma f64:$A, (fneg f64:$C), f64:$B),
1709 (FNMSUB $A, $C, $B)>;
1710 def : Pat<(fma (fneg f32:$A), f32:$C, f32:$B),
1711 (FNMSUBS $A, $C, $B)>;
1712 def : Pat<(fma f32:$A, (fneg f32:$C), f32:$B),
1713 (FNMSUBS $A, $C, $B)>;
1714
16891715 include "PPCInstrAltivec.td"
16901716 include "PPCInstr64Bit.td"
3737 , HasAltivec(false)
3838 , HasQPX(false)
3939 , HasFSQRT(false)
40 , HasFRE(false)
41 , HasFRES(false)
42 , HasFRSQRTE(false)
43 , HasFRSQRTES(false)
44 , HasRecipPrec(false)
4045 , HasSTFIWX(false)
4146 , HasLFIWAX(false)
4247 , HasFPRND(false)
7676 bool HasAltivec;
7777 bool HasQPX;
7878 bool HasFSQRT;
79 bool HasFRE, HasFRES, HasFRSQRTE, HasFRSQRTES;
80 bool HasRecipPrec;
7981 bool HasSTFIWX;
8082 bool HasLFIWAX;
8183 bool HasFPRND;
158160
159161 // Specific obvious features.
160162 bool hasFSQRT() const { return HasFSQRT; }
163 bool hasFRE() const { return HasFRE; }
164 bool hasFRES() const { return HasFRES; }
165 bool hasFRSQRTE() const { return HasFRSQRTE; }
166 bool hasFRSQRTES() const { return HasFRSQRTES; }
167 bool hasRecipPrec() const { return HasRecipPrec; }
161168 bool hasSTFIWX() const { return HasSTFIWX; }
162169 bool hasLFIWAX() const { return HasLFIWAX; }
163170 bool hasFPRND() const { return HasFPRND; }
0 ; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -enable-unsafe-fp-math | FileCheck %s
1 ; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck -check-prefix=CHECK-SAFE %s
2 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
3 target triple = "powerpc64-unknown-linux-gnu"
4
5 declare double @llvm.sqrt.f64(double)
6 declare float @llvm.sqrt.f32(float)
7 declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
8
9 define double @foo(double %a, double %b) nounwind {
10 entry:
11 %x = call double @llvm.sqrt.f64(double %b)
12 %r = fdiv double %a, %x
13 ret double %r
14
15 ; CHECK: @foo
16 ; CHECK: frsqrte
17 ; CHECK: fnmsub
18 ; CHECK: fmul
19 ; CHECK: fmadd
20 ; CHECK: fmul
21 ; CHECK: fmul
22 ; CHECK: fmadd
23 ; CHECK: fmul
24 ; CHECK: fmul
25 ; CHECK: blr
26
27 ; CHECK-SAFE: @foo
28 ; CHECK-SAFE: fsqrt
29 ; CHECK-SAFE: fdiv
30 ; CHECK-SAFE: blr
31 }
32
33 define float @goo(float %a, float %b) nounwind {
34 entry:
35 %x = call float @llvm.sqrt.f32(float %b)
36 %r = fdiv float %a, %x
37 ret float %r
38
39 ; CHECK: @goo
40 ; CHECK: frsqrtes
41 ; CHECK: fnmsubs
42 ; CHECK: fmuls
43 ; CHECK: fmadds
44 ; CHECK: fmuls
45 ; CHECK: fmuls
46 ; CHECK: blr
47
48 ; CHECK-SAFE: @goo
49 ; CHECK-SAFE: fsqrts
50 ; CHECK-SAFE: fdivs
51 ; CHECK-SAFE: blr
52 }
53
54 define <4 x float> @hoo(<4 x float> %a, <4 x float> %b) nounwind {
55 entry:
56 %x = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %b)
57 %r = fdiv <4 x float> %a, %x
58 ret <4 x float> %r
59
60 ; CHECK: @hoo
61 ; CHECK: vrsqrtefp
62
63 ; CHECK-SAFE: @hoo
64 ; CHECK-SAFE-NOT: vrsqrtefp
65 ; CHECK-SAFE: blr
66 }
67
68 define double @foo2(double %a, double %b) nounwind {
69 entry:
70 %r = fdiv double %a, %b
71 ret double %r
72
73 ; CHECK: @foo2
74 ; CHECK: fre
75 ; CHECK: fnmsub
76 ; CHECK: fmadd
77 ; CHECK: fnmsub
78 ; CHECK: fmadd
79 ; CHECK: fmul
80 ; CHECK: blr
81
82 ; CHECK-SAFE: @foo2
83 ; CHECK-SAFE: fdiv
84 ; CHECK-SAFE: blr
85 }
86
87 define float @goo2(float %a, float %b) nounwind {
88 entry:
89 %r = fdiv float %a, %b
90 ret float %r
91
92 ; CHECK: @goo2
93 ; CHECK: fres
94 ; CHECK: fnmsubs
95 ; CHECK: fmadds
96 ; CHECK: fmuls
97 ; CHECK: blr
98
99 ; CHECK-SAFE: @goo2
100 ; CHECK-SAFE: fdivs
101 ; CHECK-SAFE: blr
102 }
103
104 define <4 x float> @hoo2(<4 x float> %a, <4 x float> %b) nounwind {
105 entry:
106 %r = fdiv <4 x float> %a, %b
107 ret <4 x float> %r
108
109 ; CHECK: @hoo2
110 ; CHECK: vrefp
111
112 ; CHECK-SAFE: @hoo2
113 ; CHECK-SAFE-NOT: vrefp
114 ; CHECK-SAFE: blr
115 }
116
117 define double @foo3(double %a) nounwind {
118 entry:
119 %r = call double @llvm.sqrt.f64(double %a)
120 ret double %r
121
122 ; CHECK: @foo3
123 ; CHECK: frsqrte
124 ; CHECK: fnmsub
125 ; CHECK: fmul
126 ; CHECK: fmadd
127 ; CHECK: fmul
128 ; CHECK: fmul
129 ; CHECK: fmadd
130 ; CHECK: fmul
131 ; CHECK: fre
132 ; CHECK: fnmsub
133 ; CHECK: fmadd
134 ; CHECK: fnmsub
135 ; CHECK: fmadd
136 ; CHECK: blr
137
138 ; CHECK-SAFE: @foo3
139 ; CHECK-SAFE: fsqrt
140 ; CHECK-SAFE: blr
141 }
142
143 define float @goo3(float %a) nounwind {
144 entry:
145 %r = call float @llvm.sqrt.f32(float %a)
146 ret float %r
147
148 ; CHECK: @goo3
149 ; CHECK: frsqrtes
150 ; CHECK: fnmsubs
151 ; CHECK: fmuls
152 ; CHECK: fmadds
153 ; CHECK: fmuls
154 ; CHECK: fres
155 ; CHECK: fnmsubs
156 ; CHECK: fmadds
157 ; CHECK: blr
158
159 ; CHECK-SAFE: @goo3
160 ; CHECK-SAFE: fsqrts
161 ; CHECK-SAFE: blr
162 }
163
164 define <4 x float> @hoo3(<4 x float> %a) nounwind {
165 entry:
166 %r = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a)
167 ret <4 x float> %r
168
169 ; CHECK: @hoo3
170 ; CHECK: vrsqrtefp
171 ; CHECK: vrefp
172
173 ; CHECK-SAFE: @hoo3
174 ; CHECK-SAFE-NOT: vrsqrtefp
175 ; CHECK-SAFE: blr
176 }
177