[X86][SSE] Reimplement SSE fp2si conversion intrinsics instead of using generic IR

D20859 and D20860 attempted to replace the SSE (V)CVTTPS2DQ and VCVTTPD2DQ truncating conversions with generic IR. It turns out that the behaviour of these intrinsics is different enough from generic IR that this causes problems: INF/NaN/out-of-range values are guaranteed to result in a 0x80000000 value, which plays havoc with constant folding, which converts them to either zero or UNDEF. This is also an issue with the scalar implementations (which were already generic IR and what I was trying to match).

This patch changes both scalar and packed versions back to using x86-specific builtins. It also deals with the other scalar conversion cases that are runtime rounding-mode dependent and can have similar issues with constant folding.

A companion clang patch is at D22105.

Differential Revision: https://reviews.llvm.org/D22106

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@275981 91177308-0d34-0410-b5e6-96231b3b80d8

Author: Simon Pilgrim
14 changed files with 141 additions and 98 deletions.
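(Illustrative sketch, not part of the patch: the function names and constant values below are invented to show the semantic gap the commit message above describes. With the generic IR form, fptosi on a NaN or out-of-range lane has no defined result, so constant folding was free to turn those lanes into zero or undef; the intrinsic form is required to produce the indefinite integer value 0x80000000 for those lanes, matching the hardware.)

declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>)

; Generic IR form: the NaN and 2^32 lanes are outside the range of i32, so the
; fptosi result for them is undefined and the folder could pick zero or undef.
define <4 x i32> @cvtt_generic_ir_sketch() {
  %r = fptosi <4 x float> <float 1.0, float -2.0, float 0x7FF8000000000000, float 0x41F0000000000000> to <4 x i32>
  ret <4 x i32> %r
}

; Intrinsic form: (V)CVTTPS2DQ semantics apply, so the NaN and 2^32 lanes must
; yield 0x80000000 (INT_MIN), exactly as the instruction does at run time.
define <4 x i32> @cvtt_intrinsic_sketch() {
  %r = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> <float 1.0, float -2.0, float 0x7FF8000000000000, float 0x41F0000000000000>)
  ret <4 x i32> %r
}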
@@ -477,6 +477,8 @@
 def int_x86_sse2_cvtpd2ps : GCCBuiltin<"__builtin_ia32_cvtpd2ps">,
 Intrinsic<[llvm_v4f32_ty], [llvm_v2f64_ty], [IntrNoMem]>;
 def int_x86_sse2_cvtps2dq : GCCBuiltin<"__builtin_ia32_cvtps2dq">,
+Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
+def int_x86_sse2_cvttps2dq : GCCBuiltin<"__builtin_ia32_cvttps2dq">,
 Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
 def int_x86_sse2_cvtsd2si : GCCBuiltin<"__builtin_ia32_cvtsd2si">,
 Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty], [IntrNoMem]>;
@@ -1511,8 +1513,12 @@
 Intrinsic<[llvm_v4f32_ty], [llvm_v4f64_ty], [IntrNoMem]>;
 def int_x86_avx_cvt_ps2dq_256 : GCCBuiltin<"__builtin_ia32_cvtps2dq256">,
 Intrinsic<[llvm_v8i32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
+def int_x86_avx_cvtt_pd2dq_256 : GCCBuiltin<"__builtin_ia32_cvttpd2dq256">,
+Intrinsic<[llvm_v4i32_ty], [llvm_v4f64_ty], [IntrNoMem]>;
 def int_x86_avx_cvt_pd2dq_256 : GCCBuiltin<"__builtin_ia32_cvtpd2dq256">,
 Intrinsic<[llvm_v4i32_ty], [llvm_v4f64_ty], [IntrNoMem]>;
+def int_x86_avx_cvtt_ps2dq_256 : GCCBuiltin<"__builtin_ia32_cvttps2dq256">,
+Intrinsic<[llvm_v8i32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
 }
 
 // Vector bit test
@@ -1423,8 +1423,8 @@
 /// integer type Ty is used to select how many bits are available for the
 /// result. Returns null if the conversion cannot be performed, otherwise
 /// returns the Constant value resulting from the conversion.
-Constant *ConstantFoldConvertToInt(const APFloat &Val, bool roundTowardZero,
-Type *Ty) {
+Constant *ConstantFoldSSEConvertToInt(const APFloat &Val, bool roundTowardZero,
+Type *Ty) {
 // All of these conversion intrinsics form an integer of at most 64bits.
 unsigned ResultWidth = Ty->getIntegerBitWidth();
 assert(ResultWidth <= 64 &&
@@ -1437,7 +1437,8 @@
 APFloat::opStatus status = Val.convertToInteger(&UIntVal, ResultWidth,
 /*isSigned=*/true, mode,
 &isExact);
-if (status != APFloat::opOK && status != APFloat::opInexact)
+if (status != APFloat::opOK &&
+    (!roundTowardZero || status != APFloat::opInexact))
 return nullptr;
 return ConstantInt::get(Ty, UIntVal, /*isSigned=*/true);
 }
@@ -1675,17 +1676,17 @@
 case Intrinsic::x86_sse2_cvtsd2si:
 case Intrinsic::x86_sse2_cvtsd2si64:
 if (ConstantFP *FPOp =
-dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U)))
-return ConstantFoldConvertToInt(FPOp->getValueAPF(),
-/*roundTowardZero=*/false, Ty);
+dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U)))
+return ConstantFoldSSEConvertToInt(FPOp->getValueAPF(),
+/*roundTowardZero=*/false, Ty);
 case Intrinsic::x86_sse_cvttss2si:
 case Intrinsic::x86_sse_cvttss2si64:
 case Intrinsic::x86_sse2_cvttsd2si:
 case Intrinsic::x86_sse2_cvttsd2si64:
 if (ConstantFP *FPOp =
-dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U)))
-return ConstantFoldConvertToInt(FPOp->getValueAPF(),
-/*roundTowardZero=*/true, Ty);
+dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U)))
+return ConstantFoldSSEConvertToInt(FPOp->getValueAPF(),
+/*roundTowardZero=*/true, Ty);
 }
 }
 
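(Illustrative sketch of the ConstantFoldSSEConvertToInt change above; the function names and test values are invented. The truncating conversions always round toward zero, so an inexact conversion can still be folded, while the non-truncating conversions depend on the runtime rounding mode in MXCSR and are now left unfolded when the conversion is inexact, as the ConstProp test change at the end of this patch verifies.)

declare i32 @llvm.x86.sse.cvttss2si(<4 x float>)
declare i32 @llvm.x86.sse.cvtss2si(<4 x float>)

define i32 @fold_truncating_sketch() {
  ; 1.5 -> 1 is inexact, but truncation ignores the rounding mode, so with
  ; roundTowardZero=true the opInexact status is tolerated and this may fold to 1.
  %r = call i32 @llvm.x86.sse.cvttss2si(<4 x float> <float 1.5, float 0.0, float 0.0, float 0.0>)
  ret i32 %r
}

define i32 @keep_rounding_sketch() {
  ; 1.5 -> 1 or 2 depends on the rounding mode at run time, so with
  ; roundTowardZero=false an inexact conversion is not folded and the call stays.
  %r = call i32 @llvm.x86.sse.cvtss2si(<4 x float> <float 1.5, float 0.0, float 0.0, float 0.0>)
  ret i32 %r
}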
@@ -250,8 +250,6 @@
 Name == "sse2.cvtps2pd" ||
 Name == "avx.cvtdq2.pd.256" ||
 Name == "avx.cvt.ps2.pd.256" ||
-Name == "sse2.cvttps2dq" ||
-Name.startswith("avx.cvtt.") ||
 Name.startswith("avx.vinsertf128.") ||
 Name == "avx2.vinserti128" ||
 Name.startswith("avx.vextractf128.") ||
@@ -711,12 +709,6 @@
 Rep = Builder.CreateSIToFP(Rep, DstTy, "cvtdq2pd");
 else
 Rep = Builder.CreateFPExt(Rep, DstTy, "cvtps2pd");
-} else if (IsX86 && (Name == "sse2.cvttps2dq" ||
-Name.startswith("avx.cvtt."))) {
-// Truncation (round to zero) float/double to i32 vector conversion.
-Value *Src = CI->getArgOperand(0);
-VectorType *DstTy = cast<VectorType>(CI->getType());
-Rep = Builder.CreateFPToSI(Src, DstTy, "cvtt");
 } else if (IsX86 && Name.startswith("sse4a.movnt.")) {
 Module *M = F->getParent();
 SmallVector Elts;
@@ -2008,24 +2008,35 @@
 // SSE2 packed instructions with XS prefix
 def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
 "cvttps2dq\t{$src, $dst|$dst, $src}",
-[], IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
+[(set VR128:$dst,
+(int_x86_sse2_cvttps2dq VR128:$src))],
+IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
 def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
 "cvttps2dq\t{$src, $dst|$dst, $src}",
-[], IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
+[(set VR128:$dst, (int_x86_sse2_cvttps2dq
+(loadv4f32 addr:$src)))],
+IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
 def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
 "cvttps2dq\t{$src, $dst|$dst, $src}",
-[], IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
+[(set VR256:$dst,
+(int_x86_avx_cvtt_ps2dq_256 VR256:$src))],
+IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
 def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
 "cvttps2dq\t{$src, $dst|$dst, $src}",
-[], IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
+[(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256
+(loadv8f32 addr:$src)))],
+IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
 Sched<[WriteCvtF2ILd]>;
 
 def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
 "cvttps2dq\t{$src, $dst|$dst, $src}",
-[], IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
+[(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))],
+IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
 def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
 "cvttps2dq\t{$src, $dst|$dst, $src}",
-[], IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
+[(set VR128:$dst,
+(int_x86_sse2_cvttps2dq (memopv4f32 addr:$src)))],
+IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
 
 let Predicates = [HasAVX] in {
 def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
@@ -2095,10 +2106,14 @@
 // YMM only
 def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
 "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
-[], IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
+[(set VR128:$dst,
+(int_x86_avx_cvtt_pd2dq_256 VR256:$src))],
+IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
 def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
 "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
-[], IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
+[(set VR128:$dst,
+(int_x86_avx_cvtt_pd2dq_256 (loadv4f64 addr:$src)))],
+IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
 def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}",
 (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;
 
@@ -680,10 +680,11 @@
 ; X64-NEXT: vcvttpd2dqy %ymm0, %xmm0
 ; X64-NEXT: vzeroupper
 ; X64-NEXT: retq
-%cvt = fptosi <4 x double> %a0 to <4 x i32>
+%cvt = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0)
 %res = bitcast <4 x i32> %cvt to <2 x i64>
 ret <2 x i64> %res
 }
+declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone
 
 define <4 x i64> @test_mm256_cvttps_epi32(<8 x float> %a0) nounwind {
 ; X32-LABEL: test_mm256_cvttps_epi32:
@@ -695,10 +696,11 @@
 ; X64: # BB#0:
 ; X64-NEXT: vcvttps2dq %ymm0, %ymm0
 ; X64-NEXT: retq
-%cvt = fptosi <8 x float> %a0 to <8 x i32>
+%cvt = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0)
 %res = bitcast <8 x i32> %cvt to <4 x i64>
 ret <4 x i64> %res
 }
+declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone
 
 define <4 x double> @test_mm256_div_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
 ; X32-LABEL: test_mm256_div_pd:
@@ -358,35 +358,12 @@
 declare <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float>) nounwind readnone
 
 
-define <4 x i32> @test_x86_avx_cvtt_pd2dq_256(<4 x double> %a0) {
-; CHECK-LABEL: test_x86_avx_cvtt_pd2dq_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvttpd2dqy %ymm0, %xmm0
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
-%res = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1]
-ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone
-
-
-define <8 x i32> @test_x86_avx_cvtt_ps2dq_256(<8 x float> %a0) {
-; CHECK-LABEL: test_x86_avx_cvtt_ps2dq_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0
-; CHECK-NEXT: retl
-%res = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0) ; <<8 x i32>> [#uses=1]
-ret <8 x i32> %res
-}
-declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone
-
-
 define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
 ; add operation forces the execution domain.
 ; CHECK-LABEL: test_x86_sse2_storeu_dq:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vpaddb LCPI34_0, %xmm0, %xmm0
+; CHECK-NEXT: vpaddb LCPI32_0, %xmm0, %xmm0
 ; CHECK-NEXT: vmovdqu %xmm0, (%eax)
 ; CHECK-NEXT: retl
 %a2 = add <16 x i8> %a1,
@@ -3430,6 +3430,39 @@
 declare <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32>) nounwind readnone
 
 
+define <4 x i32> @test_x86_avx_cvtt_pd2dq_256(<4 x double> %a0) {
+; AVX-LABEL: test_x86_avx_cvtt_pd2dq_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vcvttpd2dqy %ymm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_cvtt_pd2dq_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcvttpd2dqy %ymm0, %xmm0
+; AVX512VL-NEXT: retl
+%res = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1]
+ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone
+
+
+define <8 x i32> @test_x86_avx_cvtt_ps2dq_256(<8 x float> %a0) {
+; AVX-LABEL: test_x86_avx_cvtt_ps2dq_256:
+; AVX: ## BB#0:
+; AVX-NEXT: vcvttps2dq %ymm0, %ymm0
+; AVX-NEXT: retl
+;
+; AVX512VL-LABEL: test_x86_avx_cvtt_ps2dq_256:
+; AVX512VL: ## BB#0:
+; AVX512VL-NEXT: vcvttps2dq %ymm0, %ymm0
+; AVX512VL-NEXT: retl
+%res = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0) ; <<8 x i32>> [#uses=1]
+ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone
+
+
 define <8 x float> @test_x86_avx_dp_ps_256(<8 x float> %a0, <8 x float> %a1) {
 ; AVX-LABEL: test_x86_avx_dp_ps_256:
 ; AVX: ## BB#0:
@@ -4551,7 +4584,7 @@
 ; AVX-LABEL: movnt_dq:
 ; AVX: ## BB#0:
 ; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; AVX-NEXT: vpaddq LCPI254_0, %xmm0, %xmm0
+; AVX-NEXT: vpaddq LCPI256_0, %xmm0, %xmm0
 ; AVX-NEXT: vmovntdq %ymm0, (%eax)
 ; AVX-NEXT: vzeroupper
 ; AVX-NEXT: retl
@@ -4559,7 +4592,7 @@
 ; AVX512VL-LABEL: movnt_dq:
 ; AVX512VL: ## BB#0:
 ; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
-; AVX512VL-NEXT: vpaddq LCPI254_0, %xmm0, %xmm0
+; AVX512VL-NEXT: vpaddq LCPI256_0, %xmm0, %xmm0
 ; AVX512VL-NEXT: vmovntdq %ymm0, (%eax)
 ; AVX512VL-NEXT: retl
 %a2 = add <2 x i64> %a1,
@@ -5,13 +5,12 @@
 define <4 x float> @test_mm_cvtsi64_ss(<4 x float> %a0, i64 %a1) nounwind {
 ; X64-LABEL: test_mm_cvtsi64_ss:
 ; X64: # BB#0:
-; X64-NEXT: cvtsi2ssq %rdi, %xmm1
-; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT: cvtsi2ssq %rdi, %xmm0
 ; X64-NEXT: retq
-%cvt = sitofp i64 %a1 to float
-%res = insertelement <4 x float> %a0, float %cvt, i32 0
+%res = call <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float> %a0, i64 %a1)
 ret <4 x float> %res
 }
+declare <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float>, i64) nounwind readnone
 
 define i64 @test_mm_cvtss_si64(<4 x float> %a0) nounwind {
 ; X64-LABEL: test_mm_cvtss_si64:
@@ -28,7 +27,7 @@
 ; X64: # BB#0:
 ; X64-NEXT: cvttss2si %xmm0, %rax
 ; X64-NEXT: retq
-%cvt = extractelement <4 x float> %a0, i32 0
-%res = fptosi float %cvt to i64
+%res = call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %a0)
 ret i64 %res
 }
+declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone
@@ -706,20 +706,17 @@
 define <4 x float> @test_mm_cvtsi32_ss(<4 x float> %a0, i32 %a1) nounwind {
 ; X32-LABEL: test_mm_cvtsi32_ss:
 ; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: cvtsi2ssl %eax, %xmm1
-; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT: cvtsi2ssl {{[0-9]+}}(%esp), %xmm0
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test_mm_cvtsi32_ss:
 ; X64: # BB#0:
-; X64-NEXT: cvtsi2ssl %edi, %xmm1
-; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; X64-NEXT: retq
-%cvt = sitofp i32 %a1 to float
-%res = insertelement <4 x float> %a0, float %cvt, i32 0
-ret <4 x float> %res
-}
+; X64-NEXT: cvtsi2ssl %edi, %xmm0
+; X64-NEXT: retq
+%res = call <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float> %a0, i32 %a1)
+ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float>, i32) nounwind readnone
 
 define float @test_mm_cvtss_f32(<4 x float> %a0) nounwind {
 ; X32-LABEL: test_mm_cvtss_f32:
@@ -761,10 +758,10 @@
 ; X64: # BB#0:
 ; X64-NEXT: cvttss2si %xmm0, %eax
 ; X64-NEXT: retq
-%cvt = extractelement <4 x float> %a0, i32 0
-%res = fptosi float %cvt to i32
+%res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
 ret i32 %res
 }
+declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone
 
 define i32 @test_mm_cvttss_si32(<4 x float> %a0) nounwind {
 ; X32-LABEL: test_mm_cvttss_si32:
@@ -776,8 +773,7 @@
 ; X64: # BB#0:
 ; X64-NEXT: cvttss2si %xmm0, %eax
 ; X64-NEXT: retq
-%cvt = extractelement <4 x float> %a0, i32 0
-%res = fptosi float %cvt to i32
+%res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
 ret i32 %res
 }
 
@@ -24,13 +24,12 @@
 define <2 x double> @test_mm_cvtsi64_sd(<2 x double> %a0, i64 %a1) nounwind {
 ; X64-LABEL: test_mm_cvtsi64_sd:
 ; X64: # BB#0:
-; X64-NEXT: cvtsi2sdq %rdi, %xmm1
-; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-NEXT: cvtsi2sdq %rdi, %xmm0
 ; X64-NEXT: retq
-%cvt = sitofp i64 %a1 to double
-%res = insertelement <2 x double> %a0, double %cvt, i32 0
+%res = call <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double> %a0, i64 %a1)
 ret <2 x double> %res
 }
+declare <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double>, i64) nounwind readnone
 
 define <2 x i64> @test_mm_cvtsi64_si128(i64 %a0) nounwind {
 ; X64-LABEL: test_mm_cvtsi64_si128:
@@ -47,10 +46,10 @@
 ; X64: # BB#0:
 ; X64-NEXT: cvttsd2si %xmm0, %rax
 ; X64-NEXT: retq
-%ext = extractelement <2 x double> %a0, i32 0
-%res = fptosi double %ext to i64
+%res = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %a0)
 ret i64 %res
 }
+declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone
 
 define <2 x i64> @test_mm_loadu_si64(i64* %a0) nounwind {
 ; X64-LABEL: test_mm_loadu_si64:
@@ -1207,6 +1207,21 @@
 }
 declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
 
+define <4 x float> @test_mm_cvtsd_ss(<4 x float> %a0, <2 x double> %a1) {
+; X32-LABEL: test_mm_cvtsd_ss:
+; X32: # BB#0:
+; X32-NEXT: cvtsd2ss %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_cvtsd_ss:
+; X64: # BB#0:
+; X64-NEXT: cvtsd2ss %xmm1, %xmm0
+; X64-NEXT: retq
+%res = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> %a0, <2 x double> %a1)
+ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float>, <2 x double>) nounwind readnone
+
 define i32 @test_mm_cvtsi128_si32(<2 x i64> %a0) nounwind {
 ; X32-LABEL: test_mm_cvtsi128_si32:
 ; X32: # BB#0:
@@ -1302,10 +1317,11 @@
 ; X64: # BB#0:
 ; X64-NEXT: cvttps2dq %xmm0, %xmm0
 ; X64-NEXT: retq
-%res = fptosi <4 x float> %a0 to <4 x i32>
+%res = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %a0)
 %bc = bitcast <4 x i32> %res to <2 x i64>
 ret <2 x i64> %bc
 }
+declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone
 
 define i32 @test_mm_cvttsd_si32(<2 x double> %a0) nounwind {
 ; X32-LABEL: test_mm_cvttsd_si32:
@@ -1317,10 +1333,10 @@
 ; X64: # BB#0:
 ; X64-NEXT: cvttsd2si %xmm0, %eax
 ; X64-NEXT: retq
-%ext = extractelement <2 x double> %a0, i32 0
-%res = fptosi double %ext to i32
+%res = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %a0)
 ret i32 %res
 }
+declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone
 
 define <2 x double> @test_mm_div_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
 ; X32-LABEL: test_mm_div_pd:
@@ -65,17 +65,6 @@
 declare <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float>) nounwind readnone
 
 
-define <4 x i32> @test_x86_sse2_cvttps2dq(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_sse2_cvttps2dq:
-; CHECK: ## BB#0:
-; CHECK-NEXT: cvttps2dq %xmm0, %xmm0
-; CHECK-NEXT: retl
-%res = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %a0) ; <<4 x i32>> [#uses=1]
-ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone
-
-
 define void @test_x86_sse2_storel_dq(i8* %a0, <4 x i32> %a1) {
 ; CHECK-LABEL: test_x86_sse2_storel_dq:
 ; CHECK: ## BB#0:
@@ -93,7 +82,7 @@
 ; CHECK-LABEL: test_x86_sse2_storeu_dq:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: paddb LCPI8_0, %xmm0
+; CHECK-NEXT: paddb LCPI7_0, %xmm0
 ; CHECK-NEXT: movdqu %xmm0, (%eax)
 ; CHECK-NEXT: retl
 %a2 = add <16 x i8> %a1,
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse2 | FileCheck %s --check-prefix=SSE
 ; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=KNL
 
@@ -321,6 +321,22 @@
 declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) nounwind readnone
 
 
+define <4 x i32> @test_x86_sse2_cvttps2dq(<4 x float> %a0) {
+; SSE-LABEL: test_x86_sse2_cvttps2dq:
+; SSE: ## BB#0:
+; SSE-NEXT: cvttps2dq %xmm0, %xmm0
+; SSE-NEXT: retl
+;
+; KNL-LABEL: test_x86_sse2_cvttps2dq:
+; KNL: ## BB#0:
+; KNL-NEXT: vcvttps2dq %xmm0, %xmm0
+; KNL-NEXT: retl
+%res = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %a0) ; <<4 x i32>> [#uses=1]
+ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone
+
+
 define i32 @test_x86_sse2_cvttsd2si(<2 x double> %a0) {
 ; SSE-LABEL: test_x86_sse2_cvttsd2si:
 ; SSE: ## BB#0:
@@ -192,11 +192,13 @@
 ret i1 %b
 }
 
-; TODO: Inexact values should not fold as they are dependent on rounding mode
+; Inexact values should not fold as they are dependent on rounding mode
 define i1 @test_sse_cvts_inexact() nounwind readnone {
 ; CHECK-LABEL: @test_sse_cvts_inexact(
-; CHECK-NOT: call
-; CHECK: ret i1 true
+; CHECK: call
+; CHECK: call
+; CHECK: call
+; CHECK: call
 entry:
 %i0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> ) nounwind
 %i1 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> ) nounwind