llvm.org GIT mirror llvm / dfd0b9b
X86 SSE: update rsqrtss and rcpss to use two source operands, with the first source operand tied to the destination operand. This accurately models the corresponding instructions, which leave the upper bits of the destination unmodified. rdar://12558838 PR14221 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167064 91177308-0d34-0410-b5e6-96231b3b80d8 Manman Ren 7 years ago
2 changed file(s) with 75 addition(s) and 4 deletion(s). Raw diff Collapse all Expand all
32923292 sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTS>,
32933293 sse2_fp_unop_p_int<0x51, "sqrt", int_x86_sse2_sqrt_pd, SSE_SQRTS>;
32943294
3295 /// sse1_fp_unop_rw - SSE1 unops where vector form has a read-write operand.
3296 multiclass sse1_fp_unop_rw<bits<8> opc, string OpcodeStr, SDNode OpNode,
3297 Intrinsic F32Int, OpndItins itins> {
3298 def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
3299 !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
3300 [(set FR32:$dst, (OpNode FR32:$src))]>;
3301 // For scalar unary operations, fold a load into the operation
3302 // only in OptForSize mode. It eliminates an instruction, but it also
3303 // eliminates a whole-register clobber (the load), so it introduces a
3304 // partial register update condition.
3305 def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
3306 !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
3307 [(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS,
3308 Requires<[UseSSE1, OptForSize]>;
3309 let Constraints = "$src1 = $dst" in {
3310 def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst),
3311 (ins VR128:$src1, VR128:$src2),
3312 !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
3313 [], itins.rr>;
3314 def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
3315 (ins VR128:$src1, ssmem:$src2),
3316 !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
3317 [], itins.rm>;
3318 }
3319 }
3320
32953321 // Reciprocal approximations. Note that these typically require refinement
32963322 // in order to obtain suitable precision.
3297 defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, int_x86_sse_rsqrt_ss,
3298 SSE_SQRTS>,
3323 defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, int_x86_sse_rsqrt_ss,
3324 SSE_SQRTS>,
32993325 sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTS>,
33003326 sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps,
33013327 SSE_SQRTS>;
3302 defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, int_x86_sse_rcp_ss,
3303 SSE_RCPS>,
3328 let Predicates = [UseSSE1] in {
3329 def : Pat<(int_x86_sse_rsqrt_ss VR128:$src),
3330 (RSQRTSSr_Int VR128:$src, VR128:$src)>;
3331 }
3332
3333 defm RCP : sse1_fp_unop_rw<0x53, "rcp", X86frcp, int_x86_sse_rcp_ss,
3334 SSE_RCPS>,
33043335 sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPS>,
33053336 sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps, SSE_RCPS>;
3337 let Predicates = [UseSSE1] in {
3338 def : Pat<(int_x86_sse_rcp_ss VR128:$src),
3339 (RCPSSr_Int VR128:$src, VR128:$src)>;
3340 }
33063341
33073342 // There is no f64 version of the reciprocal approximation instructions.
33083343
0 ; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+sse2 -mcpu=nehalem | FileCheck %s
1
2 ; rdar: 12558838
3 ; PR14221
4 ; There is a mismatch between the intrinsic and the actual instruction.
5 ; The actual instruction has a partial update of dest, while the intrinsic
6 ; passes through the upper FP values. Here, we make sure the source and
7 ; destination are the same register for both rsqrtss and rcpss.
8 define void @t1(<4 x float> %a) nounwind uwtable ssp {
9 entry:
10 ; CHECK: t1:
11 ; CHECK: rsqrtss %xmm0, %xmm0
12 %0 = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a) nounwind
13 %a.addr.0.extract = extractelement <4 x float> %0, i32 0
14 %conv = fpext float %a.addr.0.extract to double
15 %a.addr.4.extract = extractelement <4 x float> %0, i32 1
16 %conv3 = fpext float %a.addr.4.extract to double
17 tail call void @callee(double %conv, double %conv3) nounwind
18 ret void
19 }
20 declare void @callee(double, double)
21 declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
22
23 define void @t2(<4 x float> %a) nounwind uwtable ssp {
24 entry:
25 ; CHECK: t2:
26 ; CHECK: rcpss %xmm0, %xmm0
27 %0 = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a) nounwind
28 %a.addr.0.extract = extractelement <4 x float> %0, i32 0
29 %conv = fpext float %a.addr.0.extract to double
30 %a.addr.4.extract = extractelement <4 x float> %0, i32 1
31 %conv3 = fpext float %a.addr.4.extract to double
32 tail call void @callee(double %conv, double %conv3) nounwind
33 ret void
34 }
35 declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone