llvm.org GIT mirror: llvm / fdad177
[DAGCombiner] try repeated fdiv divisor transform before building estimate (2nd try)

The original patch was committed at rL359398 and reverted at rL359695 because of infinite looping. This includes a fix to check for a vector splat of "1.0" to avoid the infinite loop.

Original commit message:

This was originally part of D61028, but it's an independent diff.

If we try the repeated divisor reciprocal transform before producing an estimate sequence, then we have an opportunity to use scalar fdiv. On x86, the trade-off is 1 divss vs. 5 vector FP ops in the default estimate sequence. On recent chips (Skylake, Ryzen), the full-precision division is only 3 cycle throughput, so that's probably the better perf default option and avoids problems from x86's inaccurate estimates.

The last 2 tests show that users still have the option to override the defaults by using the function attributes for reciprocal estimates, but those patterns are potentially made faster by converting the vector ops (including ymm ops) to scalar math.

Differential Revision: https://reviews.llvm.org/D61149

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@359793 91177308-0d34-0410-b5e6-96231b3b80d8

Sanjay Patel, 1 year, 4 months ago
2 changed files with 31 additions and 43 deletions.
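The pattern being improved is a vector fdiv whose divisor is a splat of one scalar, as in the splat_fdiv_* tests below. A minimal sketch of that IR shape (the function name and exact fast-math flags here are illustrative, not copied from the tests):

define <4 x float> @splat_fdiv_sketch(<4 x float> %x, float %y) {
  ; build <y, y, y, y> from the scalar divisor
  %head = insertelement <4 x float> undef, float %y, i32 0
  %vy = shufflevector <4 x float> %head, <4 x float> undef, <4 x i32> zeroinitializer
  ; with reciprocal math allowed, the combiner can turn this into one scalar
  ; divss (t = 1.0 / y), a splat of t, and a single vector multiply
  %div = fdiv arcp <4 x float> %x, %vy
  ret <4 x float> %div
}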
@@ -11914,7 +11914,7 @@
 
   // Skip if current node is a reciprocal.
   SDValue N0 = N->getOperand(0);
-  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+  ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0, /* AllowUndefs */ true);
   if (N0CFP && N0CFP->isExactlyValue(1.0))
     return SDValue();
 
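This guard is the fix mentioned in the commit message: combineRepeatedFPDivisors itself builds a reciprocal fdiv, and when the divisor is a vector that reciprocal's numerator is a splat of 1.0 rather than a scalar ConstantFP, so the old dyn_cast check did not recognize it and the combine could fire again on its own output. A sketch of the node shape the splat-aware check now bails out on (illustrative IR, not taken from the patch):

define <4 x float> @recip_of_splat(<4 x float> %vy) {
  ; numerator is a splat of 1.0; isConstOrConstSplatFP sees it and the combine
  ; returns early instead of re-creating this same reciprocal node
  %recip = fdiv arcp <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %vy
  ret <4 x float> %recip
}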
@@ -11991,6 +11991,9 @@
 
   if (SDValue NewSel = foldBinOpIntoSelect(N))
     return NewSel;
+
+  if (SDValue V = combineRepeatedFPDivisors(N))
+    return V;
 
   if (Options.UnsafeFPMath || Flags.hasAllowReciprocal()) {
     // fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.
@@ -12080,9 +12083,6 @@
                            Flags);
     }
   }
-
-  if (SDValue CombineRepeatedDivisors = combineRepeatedFPDivisors(N))
-    return CombineRepeatedDivisors;
 
   return SDValue();
 }
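For context on what is being reordered: combineRepeatedFPDivisors rewrites divisions that share a divisor into one reciprocal plus multiplies, and the same machinery handles the splatted-divisor case shown in the test diffs below. A minimal sketch of the classic repeated-divisor shape (illustrative IR, under the assumption that the target considers two divisions enough to be profitable):

define void @repeated_divisor_sketch(float %a, float %b, float %d, float* %p, float* %q) {
  ; both fdivs share the divisor %d; with reciprocal math allowed, the combiner
  ; may emit %r = fdiv arcp float 1.0, %d and replace each fdiv with an fmul by %r
  %x = fdiv arcp float %a, %d
  %y = fdiv arcp float %b, %d
  store float %x, float* %p
  store float %y, float* %q
  ret void
}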
@@ -50,25 +50,17 @@
 define <4 x float> @splat_fdiv_v4f32(<4 x float> %x, float %y) {
 ; SSE-LABEL: splat_fdiv_v4f32:
 ; SSE: # %bb.0:
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE-NEXT: rcpps %xmm1, %xmm2
-; SSE-NEXT: mulps %xmm2, %xmm1
-; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; SSE-NEXT: subps %xmm1, %xmm3
-; SSE-NEXT: mulps %xmm2, %xmm3
-; SSE-NEXT: addps %xmm2, %xmm3
-; SSE-NEXT: mulps %xmm3, %xmm0
+; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE-NEXT: divss %xmm1, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE-NEXT: mulps %xmm2, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: splat_fdiv_v4f32:
 ; AVX: # %bb.0:
+; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX-NEXT: vdivss %xmm1, %xmm2, %xmm1
 ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; AVX-NEXT: vrcpps %xmm1, %xmm2
-; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX-NEXT: vsubps %xmm1, %xmm3, %xmm1
-; AVX-NEXT: vmulps %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vaddps %xmm1, %xmm2, %xmm1
 ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
   %vy = insertelement <4 x float> undef, float %y, i32 0
@@ -89,14 +81,10 @@
 ;
 ; AVX-LABEL: splat_fdiv_v8f32:
 ; AVX: # %bb.0:
+; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX-NEXT: vdivss %xmm1, %xmm2, %xmm1
 ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX-NEXT: vrcpps %ymm1, %ymm2
-; AVX-NEXT: vmulps %ymm2, %ymm1, %ymm1
-; AVX-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX-NEXT: vsubps %ymm1, %ymm3, %ymm1
-; AVX-NEXT: vmulps %ymm1, %ymm2, %ymm1
-; AVX-NEXT: vaddps %ymm1, %ymm2, %ymm1
 ; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT: retq
   %vy = insertelement <8 x float> undef, float %y, i32 0
@@ -108,25 +96,25 @@
 define <4 x float> @splat_fdiv_v4f32_estimate(<4 x float> %x, float %y) #0 {
 ; SSE-LABEL: splat_fdiv_v4f32_estimate:
 ; SSE: # %bb.0:
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE-NEXT: rcpps %xmm1, %xmm2
-; SSE-NEXT: mulps %xmm2, %xmm1
-; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; SSE-NEXT: subps %xmm1, %xmm3
-; SSE-NEXT: mulps %xmm2, %xmm3
-; SSE-NEXT: addps %xmm2, %xmm3
+; SSE-NEXT: rcpss %xmm1, %xmm2
+; SSE-NEXT: mulss %xmm2, %xmm1
+; SSE-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE-NEXT: subss %xmm1, %xmm3
+; SSE-NEXT: mulss %xmm2, %xmm3
+; SSE-NEXT: addss %xmm2, %xmm3
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; SSE-NEXT: mulps %xmm3, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: splat_fdiv_v4f32_estimate:
 ; AVX: # %bb.0:
+; AVX-NEXT: vrcpss %xmm1, %xmm1, %xmm2
+; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX-NEXT: vsubss %xmm1, %xmm3, %xmm1
+; AVX-NEXT: vmulss %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vaddss %xmm1, %xmm2, %xmm1
 ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; AVX-NEXT: vrcpps %xmm1, %xmm2
-; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX-NEXT: vsubps %xmm1, %xmm3, %xmm1
-; AVX-NEXT: vmulps %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vaddps %xmm1, %xmm2, %xmm1
 ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
   %vy = insertelement <4 x float> undef, float %y, i32 0
@@ -151,14 +139,14 @@
 ;
 ; AVX-LABEL: splat_fdiv_v8f32_estimate:
 ; AVX: # %bb.0:
+; AVX-NEXT: vrcpss %xmm1, %xmm1, %xmm2
+; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX-NEXT: vsubss %xmm1, %xmm3, %xmm1
+; AVX-NEXT: vmulss %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vaddss %xmm1, %xmm2, %xmm1
 ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX-NEXT: vrcpps %ymm1, %ymm2
-; AVX-NEXT: vmulps %ymm2, %ymm1, %ymm1
-; AVX-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX-NEXT: vsubps %ymm1, %ymm3, %ymm1
-; AVX-NEXT: vmulps %ymm1, %ymm2, %ymm1
-; AVX-NEXT: vaddps %ymm1, %ymm2, %ymm1
 ; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT: retq
   %vy = insertelement <8 x float> undef, float %y, i32 0
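The *_estimate functions above keep the reciprocal-estimate sequences because they carry the #0 attribute set; as the commit message notes, that is the opt-in path for users who still want estimates after this change. The attribute looks roughly like this (illustrative; the exact value string in the test may differ):

attributes #0 = { "reciprocal-estimates"="divf,vec-divf" }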