llvm.org GIT mirror llvm / 0e258a0
Enable AVX512_BF16 instructions, which are supported for BFLOAT16 in Cooper Lake Summary: 1. Enable infrastructure of AVX512_BF16, which is supported for BFLOAT16 in Cooper Lake; 2. Enable VCVTNE2PS2BF16, VCVTNEPS2BF16 and DPBF16PS instructions, which are Vector Neural Network Instructions supporting BFLOAT16 inputs and conversion instructions from IEEE single precision. VCVTNE2PS2BF16: Convert Two Packed Single Data to One Packed BF16 Data. VCVTNEPS2BF16: Convert Packed Single Data to Packed BF16 Data. VDPBF16PS: Dot Product of BF16 Pairs Accumulated into Packed Single Precision. For more details about BF16 isa, please refer to the latest ISE document: https://software.intel.com/en-us/download/intel-architecture-instruction-set-extensions-programming-reference Author: LiuTianle Reviewers: craig.topper, smaslov, LuoYuanke, wxiao3, annita.zhang, RKSimon, spatel Reviewed By: craig.topper Subscribers: kristina, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D60550 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@360017 91177308-0d34-0410-b5e6-96231b3b80d8 Luo, Yuanke 1 year, 5 months ago
10 changed file(s) with 251 addition(s) and 0 deletion(s). Raw diff Collapse all Expand all
48334833 def int_x86_invpcid : GCCBuiltin<"__builtin_ia32_invpcid">,
48344834 Intrinsic<[], [llvm_i32_ty, llvm_ptr_ty], []>;
48354835 }
4836
4837 let TargetPrefix = "x86" in {
4838 def int_x86_avx512bf16_cvtne2ps2bf16_128:
4839 GCCBuiltin<"__builtin_ia32_cvtne2ps2bf16_128">,
4840 Intrinsic<[llvm_v8i16_ty], [llvm_v4f32_ty, llvm_v4f32_ty],
4841 [IntrNoMem]>;
4842 def int_x86_avx512bf16_cvtne2ps2bf16_256:
4843 GCCBuiltin<"__builtin_ia32_cvtne2ps2bf16_256">,
4844 Intrinsic<[llvm_v16i16_ty], [llvm_v8f32_ty, llvm_v8f32_ty],
4845 [IntrNoMem]>;
4846 def int_x86_avx512bf16_cvtne2ps2bf16_512:
4847 GCCBuiltin<"__builtin_ia32_cvtne2ps2bf16_512">,
4848 Intrinsic<[llvm_v32i16_ty], [llvm_v16f32_ty, llvm_v16f32_ty],
4849 [IntrNoMem]>;
4850 // This intrinsic must be masked because it produces fewer than 128 bits of result.
4851 def int_x86_avx512bf16_mask_cvtneps2bf16_128:
4852 Intrinsic<[llvm_v8i16_ty],
4853 [llvm_v4f32_ty, llvm_v8i16_ty, llvm_v4i1_ty],
4854 [IntrNoMem]>;
4855 def int_x86_avx512bf16_cvtneps2bf16_256:
4856 GCCBuiltin<"__builtin_ia32_cvtneps2bf16_256">,
4857 Intrinsic<[llvm_v8i16_ty], [llvm_v8f32_ty], [IntrNoMem]>;
4858 def int_x86_avx512bf16_cvtneps2bf16_512:
4859 GCCBuiltin<"__builtin_ia32_cvtneps2bf16_512">,
4860 Intrinsic<[llvm_v16i16_ty], [llvm_v16f32_ty], [IntrNoMem]>;
4861 def int_x86_avx512bf16_dpbf16ps_128:
4862 GCCBuiltin<"__builtin_ia32_dpbf16ps_128">,
4863 Intrinsic<[llvm_v4f32_ty],
4864 [llvm_v4f32_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
4865 def int_x86_avx512bf16_dpbf16ps_256:
4866 GCCBuiltin<"__builtin_ia32_dpbf16ps_256">,
4867 Intrinsic<[llvm_v8f32_ty],
4868 [llvm_v8f32_ty, llvm_v8i32_ty, llvm_v8i32_ty], [IntrNoMem]>;
4869 def int_x86_avx512bf16_dpbf16ps_512:
4870 GCCBuiltin<"__builtin_ia32_dpbf16ps_512">,
4871 Intrinsic<[llvm_v16f32_ty],
4872 [llvm_v16f32_ty, llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>;
4873 }
13741374 // detecting features using the "-march=native" flag.
13751375 // For more info, see X86 ISA docs.
13761376 Features["pconfig"] = HasLeaf7 && ((EDX >> 18) & 1);
1377 bool HasLeaf7Subleaf1 =
1378 MaxLevel >= 7 && !getX86CpuIDAndInfoEx(0x7, 0x1, &EAX, &EBX, &ECX, &EDX);
1379 Features["avx512bf16"] = HasLeaf7Subleaf1 && ((EAX >> 5) & 1) && HasAVX512Save;
13771380
13781381 bool HasLeafD = MaxLevel >= 0xd &&
13791382 !getX86CpuIDAndInfoEx(0xd, 0x1, &EAX, &EBX, &ECX, &EDX);
166166 def FeatureVNNI : SubtargetFeature<"avx512vnni", "HasVNNI", "true",
167167 "Enable AVX-512 Vector Neural Network Instructions",
168168 [FeatureAVX512]>;
169 def FeatureBF16 : SubtargetFeature<"avx512bf16", "HasBF16", "true",
170 "Support bfloat16 floating point",
171 [FeatureBWI]>;
169172 def FeatureBITALG : SubtargetFeature<"avx512bitalg", "HasBITALG", "true",
170173 "Enable AVX-512 Bit Algorithms",
171174 [FeatureBWI]>;
2262222622 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, Rnd,
2262322623 PassThru, Mask);
2262422624
22625 }
22626 case CVTNEPS2BF16_MASK: {
22627 SDValue Src = Op.getOperand(1);
22628 SDValue PassThru = Op.getOperand(2);
22629 SDValue Mask = Op.getOperand(3);
22630
22631 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
22632 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
22633
22634 // Break false dependency.
22635 if (PassThru.isUndef())
22636 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
22637
22638 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
22639 Mask);
2262522640 }
2262622641 default:
2262722642 break;
2807228087 case X86ISD::CVTS2UI: return "X86ISD::CVTS2UI";
2807328088 case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
2807428089 case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
28090 case X86ISD::CVTNE2PS2BF16: return "X86ISD::CVTNE2PS2BF16";
28091 case X86ISD::CVTNEPS2BF16: return "X86ISD::CVTNEPS2BF16";
28092 case X86ISD::MCVTNEPS2BF16: return "X86ISD::MCVTNEPS2BF16";
28093 case X86ISD::DPBF16PS: return "X86ISD::DPBF16PS";
2807528094 case X86ISD::LWPINS: return "X86ISD::LWPINS";
2807628095 case X86ISD::MGATHER: return "X86ISD::MGATHER";
2807728096 case X86ISD::MSCATTER: return "X86ISD::MSCATTER";
508508 MCVTP2SI, MCVTP2UI, MCVTTP2SI, MCVTTP2UI,
509509 MCVTSI2P, MCVTUI2P,
510510
511 // Vector float to bfloat16.
512 // Convert TWO packed single data to one packed BF16 data
513 CVTNE2PS2BF16,
514 // Convert packed single data to packed BF16 data
515 CVTNEPS2BF16,
516 // Masked version of above.
517 // SRC, PASSTHRU, MASK
518 MCVTNEPS2BF16,
519
520 // Dot product of BF16 pairs accumulated into
521 // packed single precision.
522 DPBF16PS,
523
511524 // Save xmm argument registers to the stack, according to %al. An operator
512525 // is needed so that this can be expanded with control flow.
513526 VASTART_SAVE_XMM_REGS,
1264612646 Sched<[SchedWriteFMA.ZMM.Folded]>;
1264712647 }
1264812648
12649 multiclass avx512_binop_all2 opc, string OpcodeStr,
12650 X86SchedWriteWidths sched,
12651 AVX512VLVectorVTInfo _SrcVTInfo,
12652 AVX512VLVectorVTInfo _DstVTInfo,
12653 SDNode OpNode, Predicate prd,
12654 bit IsCommutable = 0> {
12655 let Predicates = [prd] in
12656 defm NAME#Z : avx512_binop_rm2
12657 _SrcVTInfo.info512, _DstVTInfo.info512,
12658 _SrcVTInfo.info512, IsCommutable>,
12659 EVEX_V512, EVEX_CD8<32, CD8VF>;
12660 let Predicates = [HasVLX, prd] in {
12661 defm NAME#Z256 : avx512_binop_rm2
12662 _SrcVTInfo.info256, _DstVTInfo.info256,
12663 _SrcVTInfo.info256, IsCommutable>,
12664 EVEX_V256, EVEX_CD8<32, CD8VF>;
12665 defm NAME#Z128 : avx512_binop_rm2
12666 _SrcVTInfo.info128, _DstVTInfo.info128,
12667 _SrcVTInfo.info128, IsCommutable>,
12668 EVEX_V128, EVEX_CD8<32, CD8VF>;
12669 }
12670 }
12671
12672 defm VCVTNE2PS2BF16 : avx512_binop_all2<0x72, "vcvtne2ps2bf16",
12673 SchedWriteCvtPD2PS, //FIXME: Should be SchedWriteCvtPS2BF
12674 avx512vl_f32_info, avx512vl_i16_info,
12675 X86cvtne2ps2bf16, HasBF16, 0>, T8XD;
12676
12677 // Truncate Float to BFloat16
12678 multiclass avx512_cvtps2bf16 opc, string OpcodeStr,
12679 X86SchedWriteWidths sched> {
12680 let Predicates = [HasBF16] in {
12681 defm Z : avx512_vcvt_fp
12682 X86cvtneps2bf16, sched.ZMM>, EVEX_V512;
12683 }
12684 let Predicates = [HasBF16, HasVLX] in {
12685 defm Z128 : avx512_vcvt_fp
12686 null_frag, sched.XMM, "{1to4}", "{x}", f128mem,
12687 VK4WM>, EVEX_V128;
12688 defm Z256 : avx512_vcvt_fp
12689 X86cvtneps2bf16,
12690 sched.YMM, "{1to8}", "{y}">, EVEX_V256;
12691
12692 def : InstAlias
12693 (!cast(NAME # "Z128rr") VR128X:$dst,
12694 VR128X:$src), 0>;
12695 def : InstAlias
12696 (!cast(NAME # "Z128rm") VR128X:$dst,
12697 f128mem:$src), 0, "intel">;
12698 def : InstAlias
12699 (!cast(NAME # "Z256rr") VR128X:$dst,
12700 VR256X:$src), 0>;
12701 def : InstAlias
12702 (!cast(NAME # "Z256rm") VR128X:$dst,
12703 f256mem:$src), 0, "intel">;
12704 }
12705 }
12706
12707 defm VCVTNEPS2BF16 : avx512_cvtps2bf16<0x72, "vcvtneps2bf16",
12708 SchedWriteCvtPD2PS>, T8XS,
12709 EVEX_CD8<32, CD8VF>;
12710
12711 let Predicates = [HasBF16, HasVLX] in {
12712 // Special patterns to allow use of X86mcvtneps2bf16 for masking. Instruction
12713 // patterns have been disabled with null_frag.
12714 def : Pat<(v8i16 (X86cvtneps2bf16 (v4f32 VR128X:$src))),
12715 (VCVTNEPS2BF16Z128rr VR128X:$src)>;
12716 def : Pat<(X86mcvtneps2bf16 (v4f32 VR128X:$src), (v8i16 VR128X:$src0),
12717 VK4WM:$mask),
12718 (VCVTNEPS2BF16Z128rrk VR128X:$src0, VK4WM:$mask, VR128X:$src)>;
12719 def : Pat<(X86mcvtneps2bf16 (v4f32 VR128X:$src), v8i16x_info.ImmAllZerosV,
12720 VK4WM:$mask),
12721 (VCVTNEPS2BF16Z128rrkz VK4WM:$mask, VR128X:$src)>;
12722
12723 def : Pat<(v8i16 (X86cvtneps2bf16 (loadv4f32 addr:$src))),
12724 (VCVTNEPS2BF16Z128rm addr:$src)>;
12725 def : Pat<(X86mcvtneps2bf16 (loadv4f32 addr:$src), (v8i16 VR128X:$src0),
12726 VK4WM:$mask),
12727 (VCVTNEPS2BF16Z128rmk VR128X:$src0, VK4WM:$mask, addr:$src)>;
12728 def : Pat<(X86mcvtneps2bf16 (loadv4f32 addr:$src), v8i16x_info.ImmAllZerosV,
12729 VK4WM:$mask),
12730 (VCVTNEPS2BF16Z128rmkz VK4WM:$mask, addr:$src)>;
12731
12732 def : Pat<(v8i16 (X86cvtneps2bf16 (v4f32
12733 (X86VBroadcast (loadf32 addr:$src))))),
12734 (VCVTNEPS2BF16Z128rmb addr:$src)>;
12735 def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcast (loadf32 addr:$src))),
12736 (v8i16 VR128X:$src0), VK4WM:$mask),
12737 (VCVTNEPS2BF16Z128rmbk VR128X:$src0, VK4WM:$mask, addr:$src)>;
12738 def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcast (loadf32 addr:$src))),
12739 v8i16x_info.ImmAllZerosV, VK4WM:$mask),
12740 (VCVTNEPS2BF16Z128rmbkz VK4WM:$mask, addr:$src)>;
12741 }
12742
12743 let Constraints = "$src1 = $dst" in {
12744 multiclass avx512_dpbf16ps_rm opc, string OpcodeStr, SDNode OpNode,
12745 X86VectorVTInfo _, X86VectorVTInfo src_v> {
12746 defm r: AVX512_maskable_3src
12747 (ins _.RC:$src2, _.RC:$src3),
12748 OpcodeStr, "$src3, $src2", "$src2, $src3",
12749 (_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>,
12750 EVEX_4V;
12751
12752 defm m: AVX512_maskable_3src
12753 (ins _.RC:$src2, _.MemOp:$src3),
12754 OpcodeStr, "$src3, $src2", "$src2, $src3",
12755 (_.VT (OpNode _.RC:$src1, _.RC:$src2,
12756 (src_v.VT (bitconvert
12757 (src_v.LdFrag addr:$src3)))))>, EVEX_4V;
12758
12759 defm mb: AVX512_maskable_3src
12760 (ins _.RC:$src2, _.ScalarMemOp:$src3),
12761 OpcodeStr,
12762 !strconcat("${src3}", _.BroadcastStr,", $src2"),
12763 !strconcat("$src2, ${src3}", _.BroadcastStr),
12764 (_.VT (OpNode _.RC:$src1, _.RC:$src2,
12765 (src_v.VT (X86VBroadcast(src_v.ScalarLdFrag addr:$src3)))))>,
12766 EVEX_B, EVEX_4V;
12767
12768 }
12769 } // Constraints = "$src1 = $dst"
12770
12771 multiclass avx512_dpbf16ps_sizes opc, string OpcodeStr, SDNode OpNode,
12772 AVX512VLVectorVTInfo _,
12773 AVX512VLVectorVTInfo src_v, Predicate prd> {
12774 let Predicates = [prd] in {
12775 defm Z : avx512_dpbf16ps_rm
12776 src_v.info512>, EVEX_V512;
12777 }
12778 let Predicates = [HasVLX, prd] in {
12779 defm Z256 : avx512_dpbf16ps_rm
12780 src_v.info256>, EVEX_V256;
12781 defm Z128 : avx512_dpbf16ps_rm
12782 src_v.info128>, EVEX_V128;
12783 }
12784 }
12785
12786 defm VDPBF16PS : avx512_dpbf16ps_sizes<0x52, "vdpbf16ps", X86dpbf16ps,
12787 avx512vl_f32_info, avx512vl_i32_info,
12788 HasBF16>, T8XS, EVEX_CD8<32, CD8VF>;
663663 SDTCisOpSmallerThanOp<0, 1>,
664664 SDTCisVT<2, i32>]>>;
665665
666 // cvt fp to bfloat16
667 def X86cvtne2ps2bf16 : SDNode<"X86ISD::CVTNE2PS2BF16",
668 SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
669 SDTCisSameAs<1,2>]>>;
670 def X86mcvtneps2bf16 : SDNode<"X86ISD::MCVTNEPS2BF16",
671 SDTypeProfile<1, 3, [SDTCVecEltisVT<0, i16>,
672 SDTCVecEltisVT<1, f32>,
673 SDTCisSameAs<0, 2>,
674 SDTCVecEltisVT<3, i1>,
675 SDTCisSameNumEltsAs<1, 3>]>>;
676 def X86cvtneps2bf16 : SDNode<"X86ISD::CVTNEPS2BF16",
677 SDTypeProfile<1, 1, [SDTCVecEltisVT<0, i16>,
678 SDTCVecEltisVT<1, f32>]>>;
679 def X86dpbf16ps : SDNode<"X86ISD::DPBF16PS",
680 SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f32>,
681 SDTCisSameAs<0,1>,
682 SDTCVecEltisVT<2, i32>,
683 SDTCisSameAs<2,3>]>>;
684
666685 // galois field arithmetic
667686 def X86GF2P8affineinvqb : SDNode<"X86ISD::GF2P8AFFINEINVQB", SDTBlend>;
668687 def X86GF2P8affineqb : SDNode<"X86ISD::GF2P8AFFINEQB", SDTBlend>;
834834 def NoVLX_Or_NoDQI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasDQI()">;
835835 def PKU : Predicate<"Subtarget->hasPKU()">;
836836 def HasVNNI : Predicate<"Subtarget->hasVNNI()">;
837 def HasBF16 : Predicate<"Subtarget->hasBF16()">;
837838
838839 def HasBITALG : Predicate<"Subtarget->hasBITALG()">;
839840 def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">;
1818 namespace llvm {
1919
2020 enum IntrinsicType : uint16_t {
21 CVTNEPS2BF16_MASK,
2122 GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, XGETBV, ADX, FPCLASSS,
2223 INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP,
2324 INTR_TYPE_3OP_IMM8,
980981 X86_INTRINSIC_DATA(avx512_vpshufbitqmb_128, INTR_TYPE_2OP, X86ISD::VPSHUFBITQMB, 0),
981982 X86_INTRINSIC_DATA(avx512_vpshufbitqmb_256, INTR_TYPE_2OP, X86ISD::VPSHUFBITQMB, 0),
982983 X86_INTRINSIC_DATA(avx512_vpshufbitqmb_512, INTR_TYPE_2OP, X86ISD::VPSHUFBITQMB, 0),
984 // bfloat16
985 X86_INTRINSIC_DATA(avx512bf16_cvtne2ps2bf16_128, INTR_TYPE_2OP, X86ISD::CVTNE2PS2BF16, 0),
986 X86_INTRINSIC_DATA(avx512bf16_cvtne2ps2bf16_256, INTR_TYPE_2OP, X86ISD::CVTNE2PS2BF16, 0),
987 X86_INTRINSIC_DATA(avx512bf16_cvtne2ps2bf16_512, INTR_TYPE_2OP, X86ISD::CVTNE2PS2BF16, 0),
988 X86_INTRINSIC_DATA(avx512bf16_cvtneps2bf16_256, INTR_TYPE_1OP, X86ISD::CVTNEPS2BF16, 0),
989 X86_INTRINSIC_DATA(avx512bf16_cvtneps2bf16_512, INTR_TYPE_1OP, X86ISD::CVTNEPS2BF16, 0),
990 X86_INTRINSIC_DATA(avx512bf16_dpbf16ps_128, INTR_TYPE_3OP, X86ISD::DPBF16PS, 0),
991 X86_INTRINSIC_DATA(avx512bf16_dpbf16ps_256, INTR_TYPE_3OP, X86ISD::DPBF16PS, 0),
992 X86_INTRINSIC_DATA(avx512bf16_dpbf16ps_512, INTR_TYPE_3OP, X86ISD::DPBF16PS, 0),
993 X86_INTRINSIC_DATA(avx512bf16_mask_cvtneps2bf16_128, CVTNEPS2BF16_MASK, X86ISD::CVTNEPS2BF16, X86ISD::MCVTNEPS2BF16),
983994 X86_INTRINSIC_DATA(bmi_bextr_32, INTR_TYPE_2OP, X86ISD::BEXTR, 0),
984995 X86_INTRINSIC_DATA(bmi_bextr_64, INTR_TYPE_2OP, X86ISD::BEXTR, 0),
985996 X86_INTRINSIC_DATA(bmi_bzhi_32, INTR_TYPE_2OP, X86ISD::BZHI, 0),
351351
352352 /// Processor has AVX-512 Vector Neural Network Instructions
353353 bool HasVNNI = false;
354
355 /// Processor has AVX-512 bfloat16 floating-point extensions
356 bool HasBF16 = false;
354357
355358 /// Processor has AVX-512 Bit Algorithms instructions
356359 bool HasBITALG = false;
667670 bool hasVLX() const { return HasVLX; }
668671 bool hasPKU() const { return HasPKU; }
669672 bool hasVNNI() const { return HasVNNI; }
673 bool hasBF16() const { return HasBF16; }
670674 bool hasBITALG() const { return HasBITALG; }
671675 bool hasMPX() const { return HasMPX; }
672676 bool hasSHSTK() const { return HasSHSTK; }