llvm.org GIT mirror llvm / 63ec90a
SSE 4.1 Intrinsics and detection git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@46681 91177308-0d34-0410-b5e6-96231b3b80d8 Nate Begeman 12 years ago
7 changed file(s) with 269 addition(s) and 1 deletion(s). Raw diff Collapse all Expand all
673673 }
674674
675675 //===----------------------------------------------------------------------===//
676 // SSE4.1
677
678 // FP rounding ops
679 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
680 def int_x86_sse41_round_ss : GCCBuiltin<"__builtin_ia32_roundss">,
681 Intrinsic<[llvm_v4f32_ty, llvm_v4f32_ty,
682 llvm_i32_ty], [IntrNoMem]>;
683 def int_x86_sse41_round_ps : GCCBuiltin<"__builtin_ia32_roundps">,
684 Intrinsic<[llvm_v4f32_ty, llvm_v4f32_ty,
685 llvm_i32_ty], [IntrNoMem]>;
686 def int_x86_sse41_round_sd : GCCBuiltin<"__builtin_ia32_roundsd">,
687 Intrinsic<[llvm_v2f64_ty, llvm_v2f64_ty,
688 llvm_i32_ty], [IntrNoMem]>;
689 def int_x86_sse41_round_pd : GCCBuiltin<"__builtin_ia32_roundpd">,
690 Intrinsic<[llvm_v2f64_ty, llvm_v2f64_ty,
691 llvm_i32_ty], [IntrNoMem]>;
692 }
693
694 // Vector sign and zero extend
695 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
696 def int_x86_sse41_pmovsxbd : GCCBuiltin<"__builtin_ia32_pmovsxbd128">,
697 Intrinsic<[llvm_v4i32_ty, llvm_v16i8_ty]>;
698 def int_x86_sse41_pmovsxbq : GCCBuiltin<"__builtin_ia32_pmovsxbq128">,
699 Intrinsic<[llvm_v2i64_ty, llvm_v16i8_ty]>;
700 def int_x86_sse41_pmovsxbw : GCCBuiltin<"__builtin_ia32_pmovsxbw128">,
701 Intrinsic<[llvm_v8i16_ty, llvm_v16i8_ty]>;
702 def int_x86_sse41_pmovsxdq : GCCBuiltin<"__builtin_ia32_pmovsxdq128">,
703 Intrinsic<[llvm_v2i64_ty, llvm_v4i32_ty]>;
704 def int_x86_sse41_pmovsxwd : GCCBuiltin<"__builtin_ia32_pmovsxwd128">,
705 Intrinsic<[llvm_v4i32_ty, llvm_v8i16_ty]>;
706 def int_x86_sse41_pmovsxwq : GCCBuiltin<"__builtin_ia32_pmovsxwq128">,
707 Intrinsic<[llvm_v2i64_ty, llvm_v8i16_ty]>;
708 def int_x86_sse41_pmovzxbd : GCCBuiltin<"__builtin_ia32_pmovzxbd128">,
709 Intrinsic<[llvm_v4i32_ty, llvm_v16i8_ty]>;
710 def int_x86_sse41_pmovzxbq : GCCBuiltin<"__builtin_ia32_pmovzxbq128">,
711 Intrinsic<[llvm_v2i64_ty, llvm_v16i8_ty]>;
712 def int_x86_sse41_pmovzxbw : GCCBuiltin<"__builtin_ia32_pmovzxbw128">,
713 Intrinsic<[llvm_v8i16_ty, llvm_v16i8_ty]>;
714 def int_x86_sse41_pmovzxdq : GCCBuiltin<"__builtin_ia32_pmovzxdq128">,
715 Intrinsic<[llvm_v2i64_ty, llvm_v4i32_ty]>;
716 def int_x86_sse41_pmovzxwd : GCCBuiltin<"__builtin_ia32_pmovzxwd128">,
717 Intrinsic<[llvm_v4i32_ty, llvm_v8i16_ty]>;
718 def int_x86_sse41_pmovzxwq : GCCBuiltin<"__builtin_ia32_pmovzxwq128">,
719 Intrinsic<[llvm_v2i64_ty, llvm_v8i16_ty]>;
720 }
721
722 // Vector min element
723 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
724 def int_x86_sse41_phminposuw : GCCBuiltin<"__builtin_ia32_phminposuw128">,
725 Intrinsic<[llvm_v8i16_ty, llvm_v8i16_ty]>;
726 }
727
728 // Vector compare, min, max
729 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
730 def int_x86_sse41_pcmpeqq : GCCBuiltin<"__builtin_ia32_pcmpeqq">,
731 Intrinsic<[llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty]>;
732 def int_x86_sse41_pmaxsb : GCCBuiltin<"__builtin_ia32_pmaxsb128">,
733 Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty]>;
734 def int_x86_sse41_pmaxsd : GCCBuiltin<"__builtin_ia32_pmaxsd128">,
735 Intrinsic<[llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty]>;
736 def int_x86_sse41_pmaxud : GCCBuiltin<"__builtin_ia32_pmaxud128">,
737 Intrinsic<[llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty]>;
738 def int_x86_sse41_pmaxuw : GCCBuiltin<"__builtin_ia32_pmaxuw128">,
739 Intrinsic<[llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty]>;
740 def int_x86_sse41_pminsb : GCCBuiltin<"__builtin_ia32_pminsb128">,
741 Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty]>;
742 def int_x86_sse41_pminsd : GCCBuiltin<"__builtin_ia32_pminsd128">,
743 Intrinsic<[llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty]>;
744 def int_x86_sse41_pminud : GCCBuiltin<"__builtin_ia32_pminud128">,
745 Intrinsic<[llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty]>;
746 def int_x86_sse41_pminuw : GCCBuiltin<"__builtin_ia32_pminuw128">,
747 Intrinsic<[llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty]>;
748 }
749
750 // Vector pack
751 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
752 def int_x86_sse41_packusdw : GCCBuiltin<"__builtin_ia32_packusdw128">,
753 Intrinsic<[llvm_v8i16_ty, llvm_v4i32_ty, llvm_v4i32_ty]>;
754 }
755
756 // Vector multiply
757 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
758 def int_x86_sse41_pmuldq : GCCBuiltin<"__builtin_ia32_pmuldq128">,
759 Intrinsic<[llvm_v2i64_ty, llvm_v4i32_ty, llvm_v4i32_ty]>;
760 def int_x86_sse41_pmulld : GCCBuiltin<"__builtin_ia32_pmulld128">,
761 Intrinsic<[llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty]>;
762 }
763
764 // Vector extract
765 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
766 def int_x86_sse41_pextrb : GCCBuiltin<"__builtin_ia32_vec_ext_v16qi">,
767 Intrinsic<[llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty]>;
768 def int_x86_sse41_pextrd : GCCBuiltin<"__builtin_ia32_vec_ext_v4si">,
769 Intrinsic<[llvm_i32_ty, llvm_v4i32_ty, llvm_i32_ty]>;
770 def int_x86_sse41_pextrq : GCCBuiltin<"__builtin_ia32_vec_ext_v2di">,
771 Intrinsic<[llvm_i64_ty, llvm_v2i64_ty, llvm_i32_ty]>;
772 def int_x86_sse41_extractps : GCCBuiltin<"__builtin_ia32_extractps128">,
773 Intrinsic<[llvm_i32_ty, llvm_v4f32_ty, llvm_i32_ty]>;
774 }
775
776 // Vector insert
777 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
778 def int_x86_sse41_pinsrb : GCCBuiltin<"__builtin_ia32_vec_set_v16qi">,
779 Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty, llvm_i32_ty]>;
780 def int_x86_sse41_pinsrd : GCCBuiltin<"__builtin_ia32_vec_set_v4si">,
781 Intrinsic<[llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty, llvm_i32_ty]>;
782 def int_x86_sse41_pinsrq : GCCBuiltin<"__builtin_ia32_vec_set_v2di">,
783 Intrinsic<[llvm_v2i64_ty, llvm_v2i64_ty, llvm_i64_ty, llvm_i32_ty]>;
784 def int_x86_sse41_insertps : GCCBuiltin<"__builtin_ia32_insertps128">,
785 Intrinsic<[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty]>;
786 }
787
788 // Vector blend
789 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
790 def int_x86_sse41_pblendvb : GCCBuiltin<"__builtin_ia32_pblendvb128">,
791 Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty]>;
792 def int_x86_sse41_pblendw : GCCBuiltin<"__builtin_ia32_pblendw128">,
793 Intrinsic<[llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty, llvm_i32_ty]>;
794 def int_x86_sse41_blendpd : GCCBuiltin<"__builtin_ia32_blendpd">,
795 Intrinsic<[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i32_ty]>;
796 def int_x86_sse41_blendps : GCCBuiltin<"__builtin_ia32_blendps">,
797 Intrinsic<[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty]>;
798 def int_x86_sse41_blendvpd : GCCBuiltin<"__builtin_ia32_blendvpd">,
799 Intrinsic<[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty]>;
800 def int_x86_sse41_blendvps : GCCBuiltin<"__builtin_ia32_blendvps">,
801 Intrinsic<[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty]>;
802 }
803
804 // Vector dot product
805 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
806 def int_x86_sse41_dppd : GCCBuiltin<"__builtin_ia32_dppd">,
807 Intrinsic<[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i32_ty]>;
808 def int_x86_sse41_dpps : GCCBuiltin<"__builtin_ia32_dpps">,
809 Intrinsic<[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty]>;
810 }
811
812 // Vector sum of absolute differences
813 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
814 def int_x86_sse41_mpsadbw : GCCBuiltin<"__builtin_ia32_mpsadbw128">,
815 Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty]>;
816 }
817
818 // Non-temporal (streaming) aligned load
819 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
820 def int_x86_sse41_movntdqa : GCCBuiltin<"__builtin_ia32_movntdqa">,
821 Intrinsic<[llvm_v2i64_ty, llvm_ptr_ty]>;
822 }
823
824
825 //===----------------------------------------------------------------------===//
676826 // MMX
677827
678828 // Empty MMX state op.
3333 def FeatureSSSE3 : SubtargetFeature<"ssse3", "X86SSELevel", "SSSE3",
3434 "Enable SSSE3 instructions",
3535 [FeatureSSE3]>;
36 def FeatureSSE41 : SubtargetFeature<"sse41", "X86SSELevel", "SSE41",
37 "Enable SSE 4.1 instructions",
38 [FeatureSSSE3]>;
39 def FeatureSSE42 : SubtargetFeature<"sse42", "X86SSELevel", "SSE42",
40 "Enable SSE 4.2 instructions",
41 [FeatureSSE41]>;
3642 def Feature3DNow : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow",
3743 "Enable 3DNow! instructions">;
3844 def Feature3DNowA : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA",
6571 def : Proc<"prescott", [FeatureSSE3]>;
6672 def : Proc<"nocona", [FeatureSSE3]>;
6773 def : Proc<"core2", [FeatureSSSE3]>;
74 def : Proc<"penryn", [FeatureSSE41]>;
6875
6976 def : Proc<"k6", [FeatureMMX]>;
7077 def : Proc<"k6-2", [FeatureMMX, Feature3DNow]>;
12651265 def MOVSDto64mr : RPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
12661266 "mov{d|q}\t{$src, $dst|$dst, $src}",
12671267 [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>;
1268
1269 //===----------------------------------------------------------------------===//
1270 // X86-64 SSE4.1 Instructions
1271 //===----------------------------------------------------------------------===//
1272
1273 // PEXTRB, unary, TA, 0x14, REX.W
1274 // PEXTRW, unary, TA, 0x15, REX.W
1275 // PEXTRQ, unary, TA, 0x16, REX.W
1276 // EXTRACTPS, unary, TA, 0x17, REX.W
1277 // PINSRQ, 2addr, binary, TA, 0x22, REX.W
165165 def HasSSE2 : Predicate<"Subtarget->hasSSE2()">;
166166 def HasSSE3 : Predicate<"Subtarget->hasSSE3()">;
167167 def HasSSSE3 : Predicate<"Subtarget->hasSSSE3()">;
168 def HasSSE41 : Predicate<"Subtarget->hasSSE41()">;
169 def HasSSE42 : Predicate<"Subtarget->hasSSE42()">;
168170 def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">;
169171 def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">;
170172 def In32BitMode : Predicate<"!Subtarget->is64Bit()">;
30373037 (MOVUPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
30383038 def : Pat<(store (v16i8 VR128:$src), addr:$dst),
30393039 (MOVUPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
3040
3041 //===----------------------------------------------------------------------===//
3042 // SSE4.1 Instructions
3043 //===----------------------------------------------------------------------===//
3044
3045 // SSE4.1 Instruction Templates:
3046 //
3047 // SS418I - SSE 4.1 instructions with T8 prefix.
3048 // SS41AI - SSE 4.1 instructions with TA prefix.
3049 //
3050 class SS418I<bits<8> o, Format F, dag outs, dag ins, string asm,
3051 list<dag> pattern>
3052 : I<o, F, outs, ins, asm, pattern>, T8, Requires<[HasSSE41]>;
3053 class SS41AI<bits<8> o, Format F, dag outs, dag ins, string asm,
3054 list<dag> pattern>
3055 : I<o, F, outs, ins, asm, pattern>, TA, Requires<[HasSSE41]>;
3056
3057
3058 multiclass sse41_fp_unop_rm<bits<8> opcss, bits<8> opcps,
3059 bits<8> opcsd, bits<8> opcpd,
3060 string OpcodeStr,
3061 Intrinsic F32Int,
3062 Intrinsic V4F32Int,
3063 Intrinsic F64Int,
3064 Intrinsic V2F64Int,
3065 bit Commutable = 0> {
3066 // Intrinsic operation, reg.
3067 def SSr_Int : SS41AI<opcss, MRMSrcReg,
3068 (outs VR128:$dst), (ins VR128:$src1, i32imm:$src2),
3069 !strconcat(OpcodeStr,
3070 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3071 [(set VR128:$dst, (F32Int VR128:$src1, imm:$src2))]> {
3072 let isCommutable = Commutable;
3073 }
3074
3075 // Intrinsic operation, mem.
3076 def SSm_Int : SS41AI<opcss, MRMSrcMem,
3077 (outs VR128:$dst), (ins ssmem:$src1, i32imm:$src2),
3078 !strconcat(OpcodeStr,
3079 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3080 [(set VR128:$dst, (F32Int sse_load_f32:$src1, imm:$src2))]>;
3081
3082 // Vector intrinsic operation, reg
3083 def PSr_Int : SS41AI<opcps, MRMSrcReg,
3084 (outs VR128:$dst), (ins VR128:$src1, i32imm:$src2),
3085 !strconcat(OpcodeStr,
3086 "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3087 [(set VR128:$dst, (V4F32Int VR128:$src1, imm:$src2))]> {
3088 let isCommutable = Commutable;
3089 }
3090
3091 // Vector intrinsic operation, mem
3092 def PSm_Int : SS41AI<opcps, MRMSrcMem,
3093 (outs VR128:$dst), (ins f128mem:$src1, i32imm:$src2),
3094 !strconcat(OpcodeStr,
3095 "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3096 [(set VR128:$dst, (V4F32Int (load addr:$src1), imm:$src2))]>;
3097
3098 // Intrinsic operation, reg.
3099 def SDr_Int : SS41AI<opcsd, MRMSrcReg,
3100 (outs VR128:$dst), (ins VR128:$src1, i32imm:$src2),
3101 !strconcat(OpcodeStr,
3102 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3103 [(set VR128:$dst, (F64Int VR128:$src1, imm:$src2))]> {
3104 let isCommutable = Commutable;
3105 }
3106
3107 // Intrinsic operation, mem.
3108 def SDm_Int : SS41AI<opcsd, MRMSrcMem,
3109 (outs VR128:$dst), (ins sdmem:$src1, i32imm:$src2),
3110 !strconcat(OpcodeStr,
3111 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3112 [(set VR128:$dst, (F64Int sse_load_f64:$src1, imm:$src2))]>;
3113
3114 // Vector intrinsic operation, reg
3115 def PDr_Int : SS41AI<opcpd, MRMSrcReg,
3116 (outs VR128:$dst), (ins VR128:$src1, i32imm:$src2),
3117 !strconcat(OpcodeStr,
3118 "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3119 [(set VR128:$dst, (V2F64Int VR128:$src1, imm:$src2))]> {
3120 let isCommutable = Commutable;
3121 }
3122
3123 // Vector intrinsic operation, mem
3124 def PDm_Int : SS41AI<opcpd, MRMSrcMem,
3125 (outs VR128:$dst), (ins f128mem:$src1, i32imm:$src2),
3126 !strconcat(OpcodeStr,
3127 "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3128 [(set VR128:$dst, (V2F64Int (load addr:$src1), imm:$src2))]>;
3129 }
3130
3131 // FP round - roundss, roundps, roundsd, roundpd
3132 defm ROUND : sse41_fp_unop_rm<0x0A, 0x08, 0x0B, 0x09, "round",
3133 int_x86_sse41_round_ss, int_x86_sse41_round_ps,
3134 int_x86_sse41_round_sd, int_x86_sse41_round_pd>;
113113 if ((EDX >> 26) & 0x1) X86SSELevel = SSE2;
114114 if (ECX & 0x1) X86SSELevel = SSE3;
115115 if ((ECX >> 9) & 0x1) X86SSELevel = SSSE3;
116 if ((ECX >> 19) & 0x1) X86SSELevel = SSE41;
117 if ((ECX >> 20) & 0x1) X86SSELevel = SSE42;
116118
117119 if (memcmp(text.c, "GenuineIntel", 12) == 0 ||
118120 memcmp(text.c, "AuthenticAMD", 12) == 0) {
3737 };
3838 protected:
3939 enum X86SSEEnum {
40 NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3
40 NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42
4141 };
4242
4343 enum X863DNowEnum {
126126 bool hasSSE2() const { return X86SSELevel >= SSE2; }
127127 bool hasSSE3() const { return X86SSELevel >= SSE3; }
128128 bool hasSSSE3() const { return X86SSELevel >= SSSE3; }
129 bool hasSSE41() const { return X86SSELevel >= SSE41; }
130 bool hasSSE42() const { return X86SSELevel >= SSE42; }
129131 bool has3DNow() const { return X863DNowLevel >= ThreeDNow; }
130132 bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; }
131133