llvm.org GIT mirror: llvm / b28d4b4

X86-specific path: Implemented the fusing of MUL+ADDSUB to FMADDSUB.

Author: Vyacheslav Klochkov
Differential Revision: https://reviews.llvm.org/D28087
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@291473 91177308-0d34-0410-b5e6-96231b3b80d8

2 changed files with 285 additions and 43 deletions.
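For orientation, here is a minimal sketch of the idiom this patch fuses. The function name is illustrative, and the fast flags stand in for the unsafe-FP conditions the new isFMAddSub() helper checks:

; Sketch: MUL + ADDSUB idiom that now becomes a single FMADDSUB.
define <2 x double> @sketch(<2 x double> %A, <2 x double> %B, <2 x double> %C) {
  %AB  = fmul fast <2 x double> %A, %B    ; one multiply feeding both arms
  %Sub = fsub fast <2 x double> %AB, %C   ; even lanes: AB - C
  %Add = fadd fast <2 x double> %AB, %C   ; odd lanes:  AB + C
  ; interleave even lanes of %Sub with odd lanes of %Add (the ADDSUB blend)
  %R = shufflevector <2 x double> %Sub, <2 x double> %Add, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %R                     ; e.g. a single vfmaddsub213pd
}

With this patch, the multiply and the alternating add/sub collapse into one vfmaddsub* instruction whenever an FMA-capable subtarget permits fusion.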
lib/Target/X86/X86ISelLowering.cpp:

@@ -6961 +6961 @@
   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
 }
 
-/// Try to fold a build_vector that performs an 'addsub' to an X86ISD::ADDSUB
-/// node.
-static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
-                             const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+/// Returns true iff \p BV builds a vector with the result equivalent to
+/// the result of ADDSUB operation.
+/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1 operation
+/// are written to the parameters \p Opnd0 and \p Opnd1.
+static bool isAddSub(const BuildVectorSDNode *BV,
+                     const X86Subtarget &Subtarget, SelectionDAG &DAG,
+                     SDValue &Opnd0, SDValue &Opnd1) {
+
   MVT VT = BV->getSimpleValueType(0);
   if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
-      (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
-    return SDValue();
-
-  SDLoc DL(BV);
+      (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
+      (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
+    return false;
+
   unsigned NumElts = VT.getVectorNumElements();
   SDValue InVec0 = DAG.getUNDEF(VT);
   SDValue InVec1 = DAG.getUNDEF(VT);
-
-  assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
-          VT == MVT::v2f64) && "build_vector with an invalid type found!");
 
   // Odd-numbered elements in the input build vector are obtained from
   // adding two integer/float elements.
@@ -6999 +7000 @@
 
     // Early exit if we found an unexpected opcode.
     if (Opcode != ExpectedOpcode)
-      return SDValue();
+      return false;
 
     SDValue Op0 = Op.getOperand(0);
     SDValue Op1 = Op.getOperand(1);
@@ -7012 +7013 @@
         !isa<ConstantSDNode>(Op0.getOperand(1)) ||
         !isa<ConstantSDNode>(Op1.getOperand(1)) ||
         Op0.getOperand(1) != Op1.getOperand(1))
-      return SDValue();
+      return false;
 
     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
     if (I0 != i)
-      return SDValue();
+      return false;
 
     // We found a valid add/sub node. Update the information accordingly.
    if (i & 1)
@@ -7028 +7029 @@
     if (InVec0.isUndef()) {
       InVec0 = Op0.getOperand(0);
       if (InVec0.getSimpleValueType() != VT)
-        return SDValue();
+        return false;
     }
     if (InVec1.isUndef()) {
       InVec1 = Op1.getOperand(0);
       if (InVec1.getSimpleValueType() != VT)
-        return SDValue();
+        return false;
     }
 
     // Make sure that operands in input to each add/sub node always
     // come from a same pair of vectors.
     if (InVec0 != Op0.getOperand(0)) {
       if (ExpectedOpcode == ISD::FSUB)
-        return SDValue();
+        return false;
 
       // FADD is commutable. Try to commute the operands
       // and then test again.
       std::swap(Op0, Op1);
       if (InVec0 != Op0.getOperand(0))
-        return SDValue();
+        return false;
     }
 
     if (InVec1 != Op1.getOperand(0))
-      return SDValue();
+      return false;
 
     // Update the pair of expected opcodes.
     std::swap(ExpectedOpcode, NextExpectedOpcode);
   }
 
   // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
-  if (AddFound && SubFound && !InVec0.isUndef() && !InVec1.isUndef())
-    return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
-
-  return SDValue();
+  if (!AddFound || !SubFound || InVec0.isUndef() || InVec1.isUndef())
+    return false;
+
+  Opnd0 = InVec0;
+  Opnd1 = InVec1;
+  return true;
+}
+
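To make the shape this lane walk accepts concrete, here is a hypothetical input for the build_vector path (the function name is illustrative): alternating scalar fsub/fadd over matching lanes of the same two vectors, reassembled with insertelement.

; Hypothetical IR whose build_vector of alternating fsub/fadd on matching
; lanes is recognized by isAddSub() and lowered to an ADDSUB node.
define <4 x float> @buildvector_addsub(<4 x float> %A, <4 x float> %B) {
  %a0 = extractelement <4 x float> %A, i32 0
  %b0 = extractelement <4 x float> %B, i32 0
  %r0 = fsub float %a0, %b0                  ; even lane: subtract
  %a1 = extractelement <4 x float> %A, i32 1
  %b1 = extractelement <4 x float> %B, i32 1
  %r1 = fadd float %a1, %b1                  ; odd lane: add
  %a2 = extractelement <4 x float> %A, i32 2
  %b2 = extractelement <4 x float> %B, i32 2
  %r2 = fsub float %a2, %b2
  %a3 = extractelement <4 x float> %A, i32 3
  %b3 = extractelement <4 x float> %B, i32 3
  %r3 = fadd float %a3, %b3
  %v0 = insertelement <4 x float> undef, float %r0, i32 0
  %v1 = insertelement <4 x float> %v0, float %r1, i32 1
  %v2 = insertelement <4 x float> %v1, float %r2, i32 2
  %v3 = insertelement <4 x float> %v2, float %r3, i32 3
  ret <4 x float> %v3                        ; lowers to addsubps
}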
+/// Returns true if it is possible to fold MUL and an idiom that has already
+/// been recognized as ADDSUB(\p Opnd0, \p Opnd1) into FMADDSUB(x, y, \p Opnd1).
+/// If (and only if) true is returned, the operands of FMADDSUB are written to
+/// parameters \p Opnd0, \p Opnd1, \p Opnd2.
+///
+/// Prior to calling this function it should be known that there is some
+/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
+/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
+/// before replacement of such SDNode with ADDSUB operation. Thus the number
+/// of \p Opnd0 uses is expected to be equal to 2.
+/// For example, this function may be called for the following IR:
+///    %AB = fmul fast <2 x double> %A, %B
+///    %Sub = fsub fast <2 x double> %AB, %C
+///    %Add = fadd fast <2 x double> %AB, %C
+///    %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
+///                            <2 x i32> <i32 0, i32 3>
+/// There is a def for %Addsub here, which potentially can be replaced by
+/// X86ISD::ADDSUB operation:
+///    %Addsub = X86ISD::ADDSUB %AB, %C
+/// and such ADDSUB can further be replaced with FMADDSUB:
+///    %Addsub = FMADDSUB %A, %B, %C.
+///
+/// The main reason why this method is called before the replacement of the
+/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
+/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
+/// FMADDSUB is.
+static bool isFMAddSub(const X86Subtarget &Subtarget, SelectionDAG &DAG,
+                       SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2) {
+  if (Opnd0.getOpcode() != ISD::FMUL || Opnd0->use_size() != 2 ||
+      !Subtarget.hasAnyFMA())
+    return false;
+
+  // FIXME: These checks must match the similar ones in
+  // DAGCombiner::visitFADDForFMACombine. It would be good to have one
+  // function that would answer if it is Ok to fuse MUL + ADD to FMADD
+  // or MUL + ADDSUB to FMADDSUB.
+  const TargetOptions &Options = DAG.getTarget().Options;
+  bool AllowFusion =
+      (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
+  if (!AllowFusion)
+    return false;
+
+  Opnd2 = Opnd1;
+  Opnd1 = Opnd0.getOperand(1);
+  Opnd0 = Opnd0.getOperand(0);
+
+  return true;
+}
+
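One subtlety worth calling out: the Opnd0->use_size() != 2 guard requires the multiply to feed exactly the FADD and FSUB arms and nothing else. A hypothetical case where fusion is blocked (the store keeps a third use of %AB alive):

; Hypothetical: %AB has a third use, so use_size() != 2 and the FMUL is not
; folded away; the blend can still become a plain ADDSUB, but not FMADDSUB.
define <2 x double> @no_fuse(<2 x double> %A, <2 x double> %B, <2 x double> %C,
                             <2 x double>* %P) {
  %AB  = fmul fast <2 x double> %A, %B
  store <2 x double> %AB, <2 x double>* %P   ; third use of %AB blocks fusion
  %Sub = fsub fast <2 x double> %AB, %C
  %Add = fadd fast <2 x double> %AB, %C
  %R = shufflevector <2 x double> %Sub, <2 x double> %Add, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %R
}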
+/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub'
+/// operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB node.
+static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
+                                       const X86Subtarget &Subtarget,
+                                       SelectionDAG &DAG) {
+  SDValue Opnd0, Opnd1;
+  if (!isAddSub(BV, Subtarget, DAG, Opnd0, Opnd1))
+    return SDValue();
+
+  MVT VT = BV->getSimpleValueType(0);
+  SDLoc DL(BV);
+
+  // Try to generate X86ISD::FMADDSUB node here.
+  SDValue Opnd2;
+  if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
+    return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
+
+  // Do not generate X86ISD::ADDSUB node for 512-bit types even though
+  // the ADDSUB idiom has been successfully recognized. There are no known
+  // X86 targets with 512-bit ADDSUB instructions!
+  // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
+  // recognition.
+  if (VT.is512BitVector())
+    return SDValue();
+
+  return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
 }
 
 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
@@ -7289 +7369 @@
     return VectorConstant;
 
   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
-  if (SDValue AddSub = LowerToAddSub(BV, Subtarget, DAG))
+  if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
     return AddSub;
   if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
     return HorizontalOp;
@@ -27783 +27863 @@
   return SDValue();
 }
 
-/// \brief Try to combine a shuffle into a target-specific add-sub node.
+/// Returns true iff the shuffle node \p N can be replaced with ADDSUB
+/// operation. If true is returned then the operands of ADDSUB operation
+/// are written to the parameters \p Opnd0 and \p Opnd1.
 ///
-/// We combine this directly on the abstract vector shuffle nodes so it is
-/// easier to generically match. We also insert dummy vector shuffle nodes for
-/// the operands which explicitly discard the lanes which are unused by this
-/// operation to try to flow through the rest of the combiner the fact that
-/// they're unused.
-static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget &Subtarget,
-                                      SelectionDAG &DAG) {
-  SDLoc DL(N);
+/// We combine shuffle to ADDSUB directly on the abstract vector shuffle nodes
+/// so it is easier to generically match. We also insert dummy vector shuffle
+/// nodes for the operands which explicitly discard the lanes which are unused
+/// by this operation to try to flow through the rest of the combiner
+/// the fact that they're unused.
+static bool isAddSub(SDNode *N, const X86Subtarget &Subtarget,
+                     SDValue &Opnd0, SDValue &Opnd1) {
+
   EVT VT = N->getValueType(0);
   if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
-      (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
-    return SDValue();
+      (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
+      (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
+    return false;
 
   // We only handle target-independent shuffles.
   // FIXME: It would be easy and harmless to use the target shuffle mask
   // extraction tool to support more.
   if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
-    return SDValue();
+    return false;
 
   ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask();
-  SmallVector<int, 8> Mask(OrigMask.begin(), OrigMask.end());
+  SmallVector<int, 16> Mask(OrigMask.begin(), OrigMask.end());
 
   SDValue V1 = N->getOperand(0);
   SDValue V2 = N->getOperand(1);
@@ -27816 +27899 @@
     ShuffleVectorSDNode::commuteMask(Mask);
     std::swap(V1, V2);
   } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
-    return SDValue();
+    return false;
 
   // If there are other uses of these operations we can't fold them.
   if (!V1->hasOneUse() || !V2->hasOneUse())
-    return SDValue();
+    return false;
 
   // Ensure that both operations have the same operands. Note that we can
   // commute the FADD operands.
   SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
   if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
       (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
-    return SDValue();
+    return false;
 
   // We're looking for blends between FADD and FSUB nodes. We insist on these
   // nodes being lined up in a specific expected pattern.
   if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
         isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
-        isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15})))
+        isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}) ||
+        isShuffleEquivalent(V1, V2, Mask, {0, 17, 2, 19, 4, 21, 6, 23,
+                                           8, 25, 10, 27, 12, 29, 14, 31})))
+    return false;
+
+  Opnd0 = LHS;
+  Opnd1 = RHS;
+  return true;
+}
+
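The same mask set also admits a plain ADDSUB with no multiply in sight. A minimal sketch of an input the shuffle path recognizes (the function name is illustrative):

; Sketch: fsub/fadd over the same operands, blended even/odd lanes -- the
; shuffle-path ADDSUB idiom with no FMUL, so it becomes X86ISD::ADDSUB only.
define <4 x float> @shuffle_addsub(<4 x float> %A, <4 x float> %B) {
  %Sub = fsub <4 x float> %A, %B
  %Add = fadd <4 x float> %A, %B
  %R = shufflevector <4 x float> %Sub, <4 x float> %Add, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x float> %R                   ; lowers to vaddsubps
}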
+/// \brief Try to combine a shuffle into a target-specific add-sub or
+/// mul-add-sub node.
+static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
+                                                const X86Subtarget &Subtarget,
+                                                SelectionDAG &DAG) {
+  SDValue Opnd0, Opnd1;
+  if (!isAddSub(N, Subtarget, Opnd0, Opnd1))
     return SDValue();
 
-  return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+
+  // Try to generate X86ISD::FMADDSUB node here.
+  SDValue Opnd2;
+  if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
+    return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
+
+  // Do not generate X86ISD::ADDSUB node for 512-bit types even though
+  // the ADDSUB idiom has been successfully recognized. There are no known
+  // X86 targets with 512-bit ADDSUB instructions!
+  if (VT.is512BitVector())
+    return SDValue();
+
+  return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
 }
 
 // We are looking for a shuffle where both sources are concatenated with undef
@@ -27898 +28011 @@
   // If we have legalized the vector types, look for blends of FADD and FSUB
   // nodes that we can fuse into an ADDSUB node.
   if (TLI.isTypeLegal(VT))
-    if (SDValue AddSub = combineShuffleToAddSub(N, Subtarget, DAG))
+    if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
       return AddSub;
 
   // During Type Legalization, when promoting illegal vector types,
test/CodeGen/X86/fmaddsub-combine.ll (new file):

; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma | FileCheck -check-prefix=FMA3 -check-prefix=FMA3_256 %s
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma,+avx512f | FileCheck -check-prefix=FMA3 -check-prefix=FMA3_512 %s
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma4 | FileCheck -check-prefix=FMA4 %s

; This test checks the fusing of MUL + ADDSUB to FMADDSUB.

define <2 x double> @mul_addsub_pd128(<2 x double> %A, <2 x double> %B, <2 x double> %C) #0 {
; FMA3-LABEL: mul_addsub_pd128:
; FMA3:       # BB#0: # %entry
; FMA3-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0
; FMA3-NEXT:    retq
;
; FMA4-LABEL: mul_addsub_pd128:
; FMA4:       # BB#0: # %entry
; FMA4-NEXT:    vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT:    retq
entry:
  %AB = fmul <2 x double> %A, %B
  %Sub = fsub <2 x double> %AB, %C
  %Add = fadd <2 x double> %AB, %C
  %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %Addsub
}

define <4 x float> @mul_addsub_ps128(<4 x float> %A, <4 x float> %B, <4 x float> %C) #0 {
; FMA3-LABEL: mul_addsub_ps128:
; FMA3:       # BB#0: # %entry
; FMA3-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0
; FMA3-NEXT:    retq
;
; FMA4-LABEL: mul_addsub_ps128:
; FMA4:       # BB#0: # %entry
; FMA4-NEXT:    vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT:    retq
entry:
  %AB = fmul <4 x float> %A, %B
  %Sub = fsub <4 x float> %AB, %C
  %Add = fadd <4 x float> %AB, %C
  %Addsub = shufflevector <4 x float> %Sub, <4 x float> %Add, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x float> %Addsub
}

define <4 x double> @mul_addsub_pd256(<4 x double> %A, <4 x double> %B, <4 x double> %C) #0 {
; FMA3-LABEL: mul_addsub_pd256:
; FMA3:       # BB#0: # %entry
; FMA3-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0
; FMA3-NEXT:    retq
;
; FMA4-LABEL: mul_addsub_pd256:
; FMA4:       # BB#0: # %entry
; FMA4-NEXT:    vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0
; FMA4-NEXT:    retq
entry:
  %AB = fmul <4 x double> %A, %B
  %Sub = fsub <4 x double> %AB, %C
  %Add = fadd <4 x double> %AB, %C
  %Addsub = shufflevector <4 x double> %Sub, <4 x double> %Add, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x double> %Addsub
}

define <8 x float> @mul_addsub_ps256(<8 x float> %A, <8 x float> %B, <8 x float> %C) #0 {
; FMA3-LABEL: mul_addsub_ps256:
; FMA3:       # BB#0: # %entry
; FMA3-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0
; FMA3-NEXT:    retq
;
; FMA4-LABEL: mul_addsub_ps256:
; FMA4:       # BB#0: # %entry
; FMA4-NEXT:    vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0
; FMA4-NEXT:    retq
entry:
  %AB = fmul <8 x float> %A, %B
  %Sub = fsub <8 x float> %AB, %C
  %Add = fadd <8 x float> %AB, %C
  %Addsub = shufflevector <8 x float> %Sub, <8 x float> %Add, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  ret <8 x float> %Addsub
}

define <8 x double> @mul_addsub_pd512(<8 x double> %A, <8 x double> %B, <8 x double> %C) #0 {
; FMA3_256-LABEL: mul_addsub_pd512:
; FMA3_256:       # BB#0: # %entry
; FMA3_256-NEXT:    vfmaddsub213pd %ymm4, %ymm2, %ymm0
; FMA3_256-NEXT:    vfmaddsub213pd %ymm5, %ymm3, %ymm1
; FMA3_256-NEXT:    retq
;
; FMA3_512-LABEL: mul_addsub_pd512:
; FMA3_512:       # BB#0: # %entry
; FMA3_512-NEXT:    vfmaddsub213pd %zmm2, %zmm1, %zmm0
; FMA3_512-NEXT:    retq
;
; FMA4-LABEL: mul_addsub_pd512:
; FMA4:       # BB#0: # %entry
; FMA4-NEXT:    vfmaddsubpd %ymm4, %ymm2, %ymm0, %ymm0
; FMA4-NEXT:    vfmaddsubpd %ymm5, %ymm3, %ymm1, %ymm1
; FMA4-NEXT:    retq
entry:
  %AB = fmul <8 x double> %A, %B
  %Sub = fsub <8 x double> %AB, %C
  %Add = fadd <8 x double> %AB, %C
  %Addsub = shufflevector <8 x double> %Sub, <8 x double> %Add, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  ret <8 x double> %Addsub
}

define <16 x float> @mul_addsub_ps512(<16 x float> %A, <16 x float> %B, <16 x float> %C) #0 {
; FMA3_256-LABEL: mul_addsub_ps512:
; FMA3_256:       # BB#0: # %entry
; FMA3_256-NEXT:    vfmaddsub213ps %ymm4, %ymm2, %ymm0
; FMA3_256-NEXT:    vfmaddsub213ps %ymm5, %ymm3, %ymm1
; FMA3_256-NEXT:    retq
;
; FMA3_512-LABEL: mul_addsub_ps512:
; FMA3_512:       # BB#0: # %entry
; FMA3_512-NEXT:    vfmaddsub213ps %zmm2, %zmm1, %zmm0
; FMA3_512-NEXT:    retq
;
; FMA4-LABEL: mul_addsub_ps512:
; FMA4:       # BB#0: # %entry
; FMA4-NEXT:    vfmaddsubps %ymm4, %ymm2, %ymm0, %ymm0
; FMA4-NEXT:    vfmaddsubps %ymm5, %ymm3, %ymm1, %ymm1
; FMA4-NEXT:    retq
entry:
  %AB = fmul <16 x float> %A, %B
  %Sub = fsub <16 x float> %AB, %C
  %Add = fadd <16 x float> %AB, %C
  %Addsub = shufflevector <16 x float> %Sub, <16 x float> %Add, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
  ret <16 x float> %Addsub
}

attributes #0 = { nounwind "unsafe-fp-math"="true" }