llvm.org GIT mirror llvm / 88f49fa
[DAGCombiner] try repeated fdiv divisor transform before building estimate This was originally part of D61028, but it's an independent diff. If we try the repeated divisor reciprocal transform before producing an estimate sequence, then we have an opportunity to use scalar fdiv. On x86, the trade-off is 1 divss vs. 5 vector FP ops in the default estimate sequence. On recent chips (Skylake, Ryzen), the full-precision division is only 3 cycle throughput, so that's probably the better perf default option and avoids problems from x86's inaccurate estimates. The last 2 tests show that users still have the option to override the defaults by using the function attributes for reciprocal estimates, but those patterns are potentially made faster by converting the vector ops (including ymm ops) to scalar math. Differential Revision: https://reviews.llvm.org/D61149 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@359398 91177308-0d34-0410-b5e6-96231b3b80d8 Sanjay Patel 1 year, 4 months ago
2 changed files with 30 additions and 42 deletions. Raw diff Collapse all Expand all
1199111991 if (SDValue NewSel = foldBinOpIntoSelect(N))
1199211992 return NewSel;
1199311993
11994 if (SDValue V = combineRepeatedFPDivisors(N))
11995 return V;
11996
1199411997 if (Options.UnsafeFPMath || Flags.hasAllowReciprocal()) {
1199511998 // fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.
1199611999 if (N1CFP) {
1207912082 Flags);
1208012083 }
1208112084 }
12082
12083 if (SDValue CombineRepeatedDivisors = combineRepeatedFPDivisors(N))
12084 return CombineRepeatedDivisors;
1208512085
1208612086 return SDValue();
1208712087 }
5050 define <4 x float> @splat_fdiv_v4f32(<4 x float> %x, float %y) {
5151 ; SSE-LABEL: splat_fdiv_v4f32:
5252 ; SSE: # %bb.0:
53 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
54 ; SSE-NEXT: rcpps %xmm1, %xmm2
55 ; SSE-NEXT: mulps %xmm2, %xmm1
56 ; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
57 ; SSE-NEXT: subps %xmm1, %xmm3
58 ; SSE-NEXT: mulps %xmm2, %xmm3
59 ; SSE-NEXT: addps %xmm2, %xmm3
60 ; SSE-NEXT: mulps %xmm3, %xmm0
53 ; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
54 ; SSE-NEXT: divss %xmm1, %xmm2
55 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0,0,0]
56 ; SSE-NEXT: mulps %xmm2, %xmm0
6157 ; SSE-NEXT: retq
6258 ;
6359 ; AVX-LABEL: splat_fdiv_v4f32:
6460 ; AVX: # %bb.0:
61 ; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
62 ; AVX-NEXT: vdivss %xmm1, %xmm2, %xmm1
6563 ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
66 ; AVX-NEXT: vrcpps %xmm1, %xmm2
67 ; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1
68 ; AVX-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
69 ; AVX-NEXT: vsubps %xmm1, %xmm3, %xmm1
70 ; AVX-NEXT: vmulps %xmm1, %xmm2, %xmm1
71 ; AVX-NEXT: vaddps %xmm1, %xmm2, %xmm1
7264 ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
7365 ; AVX-NEXT: retq
7466 %vy = insertelement <4 x float> undef, float %y, i32 0
8981 ;
9082 ; AVX-LABEL: splat_fdiv_v8f32:
9183 ; AVX: # %bb.0:
84 ; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
85 ; AVX-NEXT: vdivss %xmm1, %xmm2, %xmm1
9286 ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
9387 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
94 ; AVX-NEXT: vrcpps %ymm1, %ymm2
95 ; AVX-NEXT: vmulps %ymm2, %ymm1, %ymm1
96 ; AVX-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
97 ; AVX-NEXT: vsubps %ymm1, %ymm3, %ymm1
98 ; AVX-NEXT: vmulps %ymm1, %ymm2, %ymm1
99 ; AVX-NEXT: vaddps %ymm1, %ymm2, %ymm1
10088 ; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
10189 ; AVX-NEXT: retq
10290 %vy = insertelement <8 x float> undef, float %y, i32 0
10896 define <4 x float> @splat_fdiv_v4f32_estimate(<4 x float> %x, float %y) #0 {
10997 ; SSE-LABEL: splat_fdiv_v4f32_estimate:
11098 ; SSE: # %bb.0:
111 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
112 ; SSE-NEXT: rcpps %xmm1, %xmm2
113 ; SSE-NEXT: mulps %xmm2, %xmm1
114 ; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
115 ; SSE-NEXT: subps %xmm1, %xmm3
116 ; SSE-NEXT: mulps %xmm2, %xmm3
117 ; SSE-NEXT: addps %xmm2, %xmm3
99 ; SSE-NEXT: rcpss %xmm1, %xmm2
100 ; SSE-NEXT: mulss %xmm2, %xmm1
101 ; SSE-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
102 ; SSE-NEXT: subss %xmm1, %xmm3
103 ; SSE-NEXT: mulss %xmm2, %xmm3
104 ; SSE-NEXT: addss %xmm2, %xmm3
105 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0,0,0]
118106 ; SSE-NEXT: mulps %xmm3, %xmm0
119107 ; SSE-NEXT: retq
120108 ;
121109 ; AVX-LABEL: splat_fdiv_v4f32_estimate:
122110 ; AVX: # %bb.0:
111 ; AVX-NEXT: vrcpss %xmm1, %xmm1, %xmm2
112 ; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
113 ; AVX-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
114 ; AVX-NEXT: vsubss %xmm1, %xmm3, %xmm1
115 ; AVX-NEXT: vmulss %xmm1, %xmm2, %xmm1
116 ; AVX-NEXT: vaddss %xmm1, %xmm2, %xmm1
123117 ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
124 ; AVX-NEXT: vrcpps %xmm1, %xmm2
125 ; AVX-NEXT: vmulps %xmm2, %xmm1, %xmm1
126 ; AVX-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
127 ; AVX-NEXT: vsubps %xmm1, %xmm3, %xmm1
128 ; AVX-NEXT: vmulps %xmm1, %xmm2, %xmm1
129 ; AVX-NEXT: vaddps %xmm1, %xmm2, %xmm1
130118 ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
131119 ; AVX-NEXT: retq
132120 %vy = insertelement <4 x float> undef, float %y, i32 0
151139 ;
152140 ; AVX-LABEL: splat_fdiv_v8f32_estimate:
153141 ; AVX: # %bb.0:
142 ; AVX-NEXT: vrcpss %xmm1, %xmm1, %xmm2
143 ; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
144 ; AVX-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
145 ; AVX-NEXT: vsubss %xmm1, %xmm3, %xmm1
146 ; AVX-NEXT: vmulss %xmm1, %xmm2, %xmm1
147 ; AVX-NEXT: vaddss %xmm1, %xmm2, %xmm1
154148 ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
155149 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
156 ; AVX-NEXT: vrcpps %ymm1, %ymm2
157 ; AVX-NEXT: vmulps %ymm2, %ymm1, %ymm1
158 ; AVX-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
159 ; AVX-NEXT: vsubps %ymm1, %ymm3, %ymm1
160 ; AVX-NEXT: vmulps %ymm1, %ymm2, %ymm1
161 ; AVX-NEXT: vaddps %ymm1, %ymm2, %ymm1
162150 ; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
163151 ; AVX-NEXT: retq
164152 %vy = insertelement <8 x float> undef, float %y, i32 0