llvm.org GIT mirror, llvm commit c348bc4

[X86][SSE] Improve recognition of i64 sitofp conversions that can be performed as i32 (PR29078)

Until AVX512DQ we only support i64/vXi64 sitofp conversion as scalars. This patch sees if the sign bit extends far enough that we can truncate to an i32 type and then perform sitofp without loss of precision.

Differential Revision: https://reviews.llvm.org/D24345

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@281502 91177308-0d34-0410-b5e6-96231b3b80d8

Simon Pilgrim, 3 years ago
3 changed files with 59 additions and 151 deletions.
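The core claim behind the patch is simple: an i64 value with at least 33 sign bits fits in an i32, so converting the truncated value is exact. A minimal standalone check of that claim (this snippet is illustrative only and is not part of the patch):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // Values whose top 33 bits are all copies of the sign bit, i.e. values that
  // fit in an int32_t. For these, converting the truncated i32 gives the same
  // float/double as converting the original i64.
  const int64_t samples[] = {0, 1, -1, 255, 65535, -65536, INT32_MAX, INT32_MIN};
  for (int64_t v : samples) {
    assert(static_cast<double>(v) == static_cast<double>(static_cast<int32_t>(v)));
    assert(static_cast<float>(v) == static_cast<float>(static_cast<int32_t>(v)));
  }
  return 0;
}
```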
@@ -31233,6 +31233,23 @@
     return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
   }
 
+  // Without AVX512DQ we only support i64 to float scalar conversion. For both
+  // vectors and scalars, see if we know that the upper bits are all the sign
+  // bit, in which case we can truncate the input to i32 and convert from that.
+  if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
+    unsigned BitWidth = InVT.getScalarSizeInBits();
+    unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
+    if (NumSignBits >= (BitWidth - 31)) {
+      EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
+      if (InVT.isVector())
+        TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
+                                   InVT.getVectorNumElements());
+      SDLoc dl(N);
+      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
+      return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
+    }
+  }
+
   // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
   // a 32-bit target where SSE doesn't support i64->FP operations.
   if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
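In scalar terms, the guard above asks whether the value has at least BitWidth - 31 sign bits, i.e. whether it already fits in an i32. A rough plain-integer model of that test (the helper names below are mine, not LLVM's; SelectionDAG::ComputeNumSignBits performs the equivalent analysis on DAG nodes):

```cpp
#include <cstdint>

// Plain-integer stand-in for SelectionDAG::ComputeNumSignBits: count how many
// of the top bits of x are copies of the sign bit (the sign bit itself counts,
// so the result is always at least 1).
unsigned computeNumSignBits64(int64_t x) {
  uint64_t u = static_cast<uint64_t>(x);
  uint64_t sign = u >> 63;
  unsigned n = 0;
  for (int bit = 63; bit >= 0 && ((u >> bit) & 1) == sign; --bit)
    ++n;
  return n;
}

// The patch's guard: with at least (64 - 31) = 33 sign bits the value already
// fits in an i32, so truncating before the sitofp loses no precision.
bool canNarrowToI32(int64_t x) {
  return computeNumSignBits64(x) >= 64 - 31;
}
```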
@@ -8,67 +8,28 @@
 define <2 x double> @mask_sitofp_2i64_2f64(<2 x i64> %a) nounwind {
 ; X32-SSE-LABEL: mask_sitofp_2i64_2f64:
 ; X32-SSE: # BB#0:
-; X32-SSE-NEXT: pushl %ebp
-; X32-SSE-NEXT: movl %esp, %ebp
-; X32-SSE-NEXT: andl $-8, %esp
-; X32-SSE-NEXT: subl $32, %esp
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero
-; X32-SSE-NEXT: movq %xmm1, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X32-SSE-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT: fildll {{[0-9]+}}(%esp)
-; X32-SSE-NEXT: fstpl {{[0-9]+}}(%esp)
-; X32-SSE-NEXT: fildll {{[0-9]+}}(%esp)
-; X32-SSE-NEXT: fstpl (%esp)
-; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X32-SSE-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; X32-SSE-NEXT: movl %ebp, %esp
-; X32-SSE-NEXT: popl %ebp
+; X32-SSE-NEXT: cvtdq2pd %xmm0, %xmm0
 ; X32-SSE-NEXT: retl
 ;
 ; X32-AVX-LABEL: mask_sitofp_2i64_2f64:
 ; X32-AVX: # BB#0:
-; X32-AVX-NEXT: pushl %ebp
-; X32-AVX-NEXT: movl %esp, %ebp
-; X32-AVX-NEXT: andl $-8, %esp
-; X32-AVX-NEXT: subl $32, %esp
-; X32-AVX-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
-; X32-AVX-NEXT: vmovq {{.*#+}} xmm1 = xmm0[0],zero
-; X32-AVX-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp)
-; X32-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X32-AVX-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp)
-; X32-AVX-NEXT: fildll {{[0-9]+}}(%esp)
-; X32-AVX-NEXT: fstpl {{[0-9]+}}(%esp)
-; X32-AVX-NEXT: fildll {{[0-9]+}}(%esp)
-; X32-AVX-NEXT: fstpl (%esp)
-; X32-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X32-AVX-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; X32-AVX-NEXT: movl %ebp, %esp
-; X32-AVX-NEXT: popl %ebp
+; X32-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[8,9],zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; X32-AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
 ; X32-AVX-NEXT: retl
 ;
 ; X64-SSE-LABEL: mask_sitofp_2i64_2f64:
 ; X64-SSE: # BB#0:
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-SSE-NEXT: pand {{.*}}(%rip), %xmm0
-; X64-SSE-NEXT: movd %xmm0, %rax
-; X64-SSE-NEXT: cvtsi2sdq %rax, %xmm1
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X64-SSE-NEXT: movd %xmm0, %rax
-; X64-SSE-NEXT: xorps %xmm0, %xmm0
-; X64-SSE-NEXT: cvtsi2sdq %rax, %xmm0
-; X64-SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X64-SSE-NEXT: movapd %xmm1, %xmm0
+; X64-SSE-NEXT: cvtdq2pd %xmm0, %xmm0
 ; X64-SSE-NEXT: retq
 ;
 ; X64-AVX-LABEL: mask_sitofp_2i64_2f64:
 ; X64-AVX: # BB#0:
-; X64-AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpextrq $1, %xmm0, %rax
-; X64-AVX-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1
-; X64-AVX-NEXT: vmovq %xmm0, %rax
-; X64-AVX-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0
-; X64-AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[8,9],zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; X64-AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
 ; X64-AVX-NEXT: retq
 %and = and <2 x i64> %a,
 %cvt = sitofp <2 x i64> %and to <2 x double>
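With SSE the narrowed <2 x i64> path in the updated checks is just a dword shuffle, the mask, and a cvtdq2pd. A hedged intrinsics sketch of that shape (the function name and the mask parameter are illustrative; the test's actual mask constant is not reproduced on this page):

```cpp
#include <emmintrin.h> // SSE2 intrinsics

// Sketch of the narrowed <2 x i64> -> <2 x double> shape the test now checks:
// pick the low dword of each 64-bit lane, apply the (already narrowed) mask,
// then convert with cvtdq2pd. "mask32" stands in for the constant-pool operand.
__m128d mask_sitofp_2i64_2f64_sketch(__m128i a, __m128i mask32) {
  __m128i lo = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 2, 2, 0)); // pshufd [0,2,2,3]
  lo = _mm_and_si128(lo, mask32);                             // pand
  return _mm_cvtepi32_pd(lo);                                 // cvtdq2pd (low two lanes)
}
```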
@@ -147,113 +108,43 @@
 define <4 x float> @mask_sitofp_4i64_4f32(<4 x i64> %a) nounwind {
 ; X32-SSE-LABEL: mask_sitofp_4i64_4f32:
 ; X32-SSE: # BB#0:
-; X32-SSE-NEXT: pushl %ebp
-; X32-SSE-NEXT: movl %esp, %ebp
-; X32-SSE-NEXT: andl $-8, %esp
-; X32-SSE-NEXT: subl $48, %esp
 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: movq %xmm1, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; X32-SSE-NEXT: movq %xmm1, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X32-SSE-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT: fildll {{[0-9]+}}(%esp)
-; X32-SSE-NEXT: fstps {{[0-9]+}}(%esp)
-; X32-SSE-NEXT: fildll {{[0-9]+}}(%esp)
-; X32-SSE-NEXT: fstps (%esp)
-; X32-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X32-SSE-NEXT: fildll {{[0-9]+}}(%esp)
-; X32-SSE-NEXT: fstps {{[0-9]+}}(%esp)
-; X32-SSE-NEXT: fildll {{[0-9]+}}(%esp)
-; X32-SSE-NEXT: fstps {{[0-9]+}}(%esp)
-; X32-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X32-SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X32-SSE-NEXT: movl %ebp, %esp
-; X32-SSE-NEXT: popl %ebp
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
 ; X32-SSE-NEXT: retl
 ;
 ; X32-AVX-LABEL: mask_sitofp_4i64_4f32:
 ; X32-AVX: # BB#0:
-; X32-AVX-NEXT: pushl %ebp
-; X32-AVX-NEXT: movl %esp, %ebp
-; X32-AVX-NEXT: andl $-8, %esp
-; X32-AVX-NEXT: subl $48, %esp
 ; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
-; X32-AVX-NEXT: vpextrd $1, %xmm0, %eax
-; X32-AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm1
-; X32-AVX-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp)
-; X32-AVX-NEXT: vpextrd $3, %xmm0, %eax
-; X32-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X32-AVX-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
-; X32-AVX-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp)
-; X32-AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; X32-AVX-NEXT: vpextrd $1, %xmm0, %eax
-; X32-AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm1
-; X32-AVX-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp)
-; X32-AVX-NEXT: vpextrd $3, %xmm0, %eax
-; X32-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X32-AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; X32-AVX-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp)
-; X32-AVX-NEXT: fildll {{[0-9]+}}(%esp)
-; X32-AVX-NEXT: fstps {{[0-9]+}}(%esp)
-; X32-AVX-NEXT: fildll {{[0-9]+}}(%esp)
-; X32-AVX-NEXT: fstps {{[0-9]+}}(%esp)
-; X32-AVX-NEXT: fildll {{[0-9]+}}(%esp)
-; X32-AVX-NEXT: fstps {{[0-9]+}}(%esp)
-; X32-AVX-NEXT: fildll {{[0-9]+}}(%esp)
-; X32-AVX-NEXT: fstps (%esp)
-; X32-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; X32-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; X32-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; X32-AVX-NEXT: movl %ebp, %esp
-; X32-AVX-NEXT: popl %ebp
+; X32-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X32-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
+; X32-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X32-AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; X32-AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
 ; X32-AVX-NEXT: vzeroupper
 ; X32-AVX-NEXT: retl
 ;
 ; X64-SSE-LABEL: mask_sitofp_4i64_4f32:
 ; X64-SSE: # BB#0:
 ; X64-SSE-NEXT: pand {{.*}}(%rip), %xmm0
 ; X64-SSE-NEXT: pand {{.*}}(%rip), %xmm1
-; X64-SSE-NEXT: movd %xmm1, %rax
-; X64-SSE-NEXT: cvtsi2ssq %rax, %xmm3
-; X64-SSE-NEXT: movd %xmm0, %rax
-; X64-SSE-NEXT: cvtsi2ssq %rax, %xmm2
-; X64-SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; X64-SSE-NEXT: movd %xmm1, %rax
-; X64-SSE-NEXT: xorps %xmm1, %xmm1
-; X64-SSE-NEXT: cvtsi2ssq %rax, %xmm1
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X64-SSE-NEXT: movd %xmm0, %rax
-; X64-SSE-NEXT: xorps %xmm0, %xmm0
-; X64-SSE-NEXT: cvtsi2ssq %rax, %xmm0
-; X64-SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; X64-SSE-NEXT: movaps %xmm2, %xmm0
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
 ; X64-SSE-NEXT: retq
 ;
 ; X64-AVX-LABEL: mask_sitofp_4i64_4f32:
 ; X64-AVX: # BB#0:
 ; X64-AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; X64-AVX-NEXT: vpextrq $1, %xmm0, %rax
-; X64-AVX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
-; X64-AVX-NEXT: vmovq %xmm0, %rax
-; X64-AVX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
-; X64-AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; X64-AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; X64-AVX-NEXT: vmovq %xmm0, %rax
-; X64-AVX-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
-; X64-AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; X64-AVX-NEXT: vpextrq $1, %xmm0, %rax
-; X64-AVX-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
-; X64-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; X64-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; X64-AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
 ; X64-AVX-NEXT: vzeroupper
 ; X64-AVX-NEXT: retq
 %and = and <4 x i64> %a,
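The <4 x i64> case packs the low dword of each lane from both halves before a single cvtdq2ps. An intrinsics sketch of that packing, assuming the two xmm halves of the masked input are already in hand (names are illustrative, and the preceding pand/vandps masking is omitted):

```cpp
#include <emmintrin.h> // SSE2 intrinsics

// Sketch of the narrowed <4 x i64> -> <4 x float> shape: take the low dword of
// each 64-bit lane from both xmm halves, pack them with punpcklqdq, then do a
// single cvtdq2ps.
__m128 sitofp_4i64_4f32_sketch(__m128i lo64, __m128i hi64) {
  __m128i lo = _mm_shuffle_epi32(lo64, _MM_SHUFFLE(3, 2, 2, 0)); // lanes 0,2
  __m128i hi = _mm_shuffle_epi32(hi64, _MM_SHUFFLE(3, 2, 2, 0)); // lanes 0,2
  __m128i packed = _mm_unpacklo_epi64(lo, hi);                   // punpcklqdq
  return _mm_cvtepi32_ps(packed);                                // cvtdq2ps
}
```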
@@ -38,16 +38,16 @@
 ; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT: vcmpltpd %xmm1, %xmm0, %xmm2
 ; AVX1-NEXT: vpextrq $1, %xmm2, %rax
-; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3
-; AVX1-NEXT: vmovq %xmm2, %rax
-; AVX1-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX1-NEXT: vmovq %xmm2, %rcx
+; AVX1-NEXT: vmovd %ecx, %xmm2
+; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
+; AVX1-NEXT: vcvtdq2pd %xmm2, %xmm2
 ; AVX1-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm1
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm0
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vmovq %xmm0, %rcx
+; AVX1-NEXT: vmovd %ecx, %xmm0
+; AVX1-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
+; AVX1-NEXT: vcvtdq2pd %xmm0, %xmm0
 ; AVX1-NEXT: vsubpd %xmm0, %xmm2, %xmm0
 ; AVX1-NEXT: vmovapd %xmm0, (%rdi)
 ; AVX1-NEXT: retq
@@ -58,16 +58,16 @@
 ; AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT: vcmpltpd %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT: vpextrq $1, %xmm2, %rax
-; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3
-; AVX2-NEXT: vmovq %xmm2, %rax
-; AVX2-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2
-; AVX2-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX2-NEXT: vmovq %xmm2, %rcx
+; AVX2-NEXT: vmovd %ecx, %xmm2
+; AVX2-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
+; AVX2-NEXT: vcvtdq2pd %xmm2, %xmm2
 ; AVX2-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm1
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm0
-; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vmovq %xmm0, %rcx
+; AVX2-NEXT: vmovd %ecx, %xmm0
+; AVX2-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vcvtdq2pd %xmm0, %xmm0
 ; AVX2-NEXT: vsubpd %xmm0, %xmm2, %xmm0
 ; AVX2-NEXT: vmovapd %xmm0, (%rdi)
 ; AVX2-NEXT: retq
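The AVX1/AVX2 diffs above exercise the same idea on a compare result: each vcmpltpd lane is all-zeros or all-ones, so every bit is a sign bit and the i64 to f64 conversion can legally go through i32. A conceptual sketch of that case (the actual codegen shown above routes the lanes through general-purpose registers with vmovq/vmovd/vpinsrd; this version uses a shuffle instead, and the name is illustrative):

```cpp
#include <emmintrin.h> // SSE2 intrinsics

// A vcmpltpd-style mask lane is all-zeros or all-ones, i.e. every bit is a
// sign bit, so converting it to double through its low 32 bits is exact.
__m128d cmp_mask_to_f64_sketch(__m128d a, __m128d b) {
  __m128i m = _mm_castpd_si128(_mm_cmplt_pd(a, b)); // 0 or -1 per 64-bit lane
  __m128i lo = _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 2, 0));
  return _mm_cvtepi32_pd(lo);
}
```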