llvm.org GIT mirror llvm / 1f7a1b6
X86: add GATHER intrinsics (AVX2) in LLVM Support the following intrinsics: llvm.x86.avx2.gather.d.pd, llvm.x86.avx2.gather.q.pd llvm.x86.avx2.gather.d.pd.256, llvm.x86.avx2.gather.q.pd.256 llvm.x86.avx2.gather.d.ps, llvm.x86.avx2.gather.q.ps llvm.x86.avx2.gather.d.ps.256, llvm.x86.avx2.gather.q.ps.256 Modified Disassembler to handle VSIB addressing mode. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@159221 91177308-0d34-0410-b5e6-96231b3b80d8 Manman Ren 7 years ago
13 changed file(s) with 268 addition(s) and 4 deletion(s). Raw diff Collapse all Expand all
17431743 [IntrNoMem]>;
17441744 }
17451745
// Gather ops
//
// AVX2 VGATHER intrinsics.  Operand order for each intrinsic is:
//   pass-through source vector, base pointer, vector index, mask vector,
//   scale (i8 immediate).
// NOTE(review): marked IntrReadMem since gathers only read memory; the
// mask-writeback side effect is not modeled here — confirm against ISel use.
let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
  def int_x86_avx2_gather_d_pd : GCCBuiltin<"__builtin_ia32_gatherd_pd">,
      Intrinsic<[llvm_v2f64_ty],
        [llvm_v2f64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v2f64_ty, llvm_i8_ty],
        [IntrReadMem]>;
  def int_x86_avx2_gather_d_pd_256 : GCCBuiltin<"__builtin_ia32_gatherd_pd256">,
      Intrinsic<[llvm_v4f64_ty],
        [llvm_v4f64_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_v4f64_ty, llvm_i8_ty],
        [IntrReadMem]>;
  def int_x86_avx2_gather_q_pd : GCCBuiltin<"__builtin_ia32_gatherq_pd">,
      Intrinsic<[llvm_v2f64_ty],
        [llvm_v2f64_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_v2f64_ty, llvm_i8_ty],
        [IntrReadMem]>;
  def int_x86_avx2_gather_q_pd_256 : GCCBuiltin<"__builtin_ia32_gatherq_pd256">,
      Intrinsic<[llvm_v4f64_ty],
        [llvm_v4f64_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_v4f64_ty, llvm_i8_ty],
        [IntrReadMem]>;
  def int_x86_avx2_gather_d_ps : GCCBuiltin<"__builtin_ia32_gatherd_ps">,
      Intrinsic<[llvm_v4f32_ty],
        [llvm_v4f32_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v4f32_ty, llvm_i8_ty],
        [IntrReadMem]>;
  def int_x86_avx2_gather_d_ps_256 : GCCBuiltin<"__builtin_ia32_gatherd_ps256">,
      Intrinsic<[llvm_v8f32_ty],
        [llvm_v8f32_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_v8f32_ty, llvm_i8_ty],
        [IntrReadMem]>;
  def int_x86_avx2_gather_q_ps : GCCBuiltin<"__builtin_ia32_gatherq_ps">,
      Intrinsic<[llvm_v4f32_ty],
        [llvm_v4f32_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_v4f32_ty, llvm_i8_ty],
        [IntrReadMem]>;
  def int_x86_avx2_gather_q_ps_256 : GCCBuiltin<"__builtin_ia32_gatherq_ps256">,
      Intrinsic<[llvm_v8f32_ty],
        [llvm_v8f32_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_v8f32_ty, llvm_i8_ty],
        [IntrReadMem]>;
}
1781
17461782 // Misc.
17471783 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
17481784 def int_x86_avx2_pmovmskb : GCCBuiltin<"__builtin_ia32_pmovmskb256">,
915915
916916 // If we have both a base register and an index register make sure they are
917917 // both 64-bit or 32-bit registers.
918 // To support VSIB, IndexReg can be 128-bit or 256-bit registers.
918919 if (BaseReg != 0 && IndexReg != 0) {
919920 if (X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg) &&
920 !X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg) &&
921 (X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) ||
922 X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg)) &&
921923 IndexReg != X86::RIZ) {
922924 Error(IndexLoc, "index register is 32-bit, but base register is 64-bit");
923925 return 0;
924926 }
925927 if (X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg) &&
926 !X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg) &&
928 (X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) ||
929 X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg)) &&
927930 IndexReg != X86::EIZ){
928931 Error(IndexLoc, "index register is 64-bit, but base register is 32-bit");
929932 return 0;
497497 } else {
498498 baseReg = MCOperand::CreateReg(0);
499499 }
500
500
501 // Check whether we are handling VSIB addressing mode for GATHER.
502 // If sibIndex was set to SIB_INDEX_NONE, index offset is 4 and
503 // we should use SIB_INDEX_XMM4|YMM4 for VSIB.
504 // I don't see a way to get the correct IndexReg in readSIB:
505 // We can tell whether it is VSIB or SIB after instruction ID is decoded,
506 // but instruction ID may not be decoded yet when calling readSIB.
507 uint32_t Opcode = mcInst.getOpcode();
508 bool IsGather = (Opcode == X86::VGATHERDPDrm ||
509 Opcode == X86::VGATHERQPDrm ||
510 Opcode == X86::VGATHERDPSrm ||
511 Opcode == X86::VGATHERQPSrm);
512 bool IsGatherY = (Opcode == X86::VGATHERDPDYrm ||
513 Opcode == X86::VGATHERQPDYrm ||
514 Opcode == X86::VGATHERDPSYrm ||
515 Opcode == X86::VGATHERQPSYrm);
516 if (IsGather || IsGatherY) {
517 unsigned IndexOffset = insn.sibIndex -
518 (insn.addressSize == 8 ? SIB_INDEX_RAX:SIB_INDEX_EAX);
519 SIBIndex IndexBase = IsGatherY ? SIB_INDEX_YMM0 : SIB_INDEX_XMM0;
520 insn.sibIndex = (SIBIndex)(IndexBase +
521 (insn.sibIndex == SIB_INDEX_NONE ? 4 : IndexOffset));
522 }
523
501524 if (insn.sibIndex != SIB_INDEX_NONE) {
502525 switch (insn.sibIndex) {
503526 default:
508531 indexReg = MCOperand::CreateReg(X86::x); break;
509532 EA_BASES_32BIT
510533 EA_BASES_64BIT
534 REGS_XMM
535 REGS_YMM
511536 #undef ENTRY
512537 }
513538 } else {
309309 * SIBIndex - All possible values of the SIB index field.
310310 * Borrows entries from ALL_EA_BASES with the special case that
311311 * sib is synonymous with NONE.
312 * Vector SIB: index can be XMM or YMM.
312313 */
313314 typedef enum {
314315 SIB_INDEX_NONE,
315316 #define ENTRY(x) SIB_INDEX_##x,
316317 ALL_EA_BASES
318 REGS_XMM
319 REGS_YMM
317320 #undef ENTRY
318321 SIB_INDEX_max
319322 } SIBIndex;
620620 VEX_X = 0x0;
621621
622622 if (HasVEX_4VOp3)
623 VEX_4V = getVEXRegisterEncoding(MI, X86::AddrNumOperands+1);
623 // Instruction format for 4VOp3:
624 // src1(ModR/M), MemAddr, src3(VEX_4V)
625 // CurOp points to start of the MemoryOperand,
626 // it skips TIED_TO operands if exist, then increments past src1.
627 // CurOp + X86::AddrNumOperands will point to src3.
628 VEX_4V = getVEXRegisterEncoding(MI, CurOp+X86::AddrNumOperands);
624629 break;
625630 case X86II::MRM0m: case X86II::MRM1m:
626631 case X86II::MRM2m: case X86II::MRM3m:
186186
187187 private:
188188 SDNode *Select(SDNode *N);
189 SDNode *SelectGather(SDNode *N, unsigned Opc);
189190 SDNode *SelectAtomic64(SDNode *Node, unsigned Opc);
190191 SDNode *SelectAtomicLoadAdd(SDNode *Node, EVT NVT);
191192 SDNode *SelectAtomicLoadArith(SDNode *Node, EVT NVT);
19511952 llvm_unreachable("unrecognized size for LdVT");
19521953 }
19531954
1955 /// SelectGather - Customized ISel for GATHER operations.
1956 ///
1957 SDNode *X86DAGToDAGISel::SelectGather(SDNode *Node, unsigned Opc) {
1958 // Operands of Gather: VSrc, Base, VIdx, VMask, Scale
1959 SDValue Chain = Node->getOperand(0);
1960 SDValue VSrc = Node->getOperand(2);
1961 SDValue Base = Node->getOperand(3);
1962 SDValue VIdx = Node->getOperand(4);
1963 SDValue VMask = Node->getOperand(5);
1964 ConstantSDNode *Scale = dyn_cast(Node->getOperand(6));
1965 assert(Scale && "Scale should be a constant for GATHER operations");
1966
1967 // Memory Operands: Base, Scale, Index, Disp, Segment
1968 SDValue Disp = CurDAG->getTargetConstant(0, MVT::i32);
1969 SDValue Segment = CurDAG->getRegister(0, MVT::i32);
1970 const SDValue Ops[] = { VSrc, Base, getI8Imm(Scale->getSExtValue()), VIdx,
1971 Disp, Segment, VMask, Chain};
1972 SDNode *ResNode = CurDAG->getMachineNode(Opc, Node->getDebugLoc(),
1973 VSrc.getValueType(), MVT::Other,
1974 Ops, array_lengthof(Ops));
1975 return ResNode;
1976 }
1977
19541978 SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
19551979 EVT NVT = Node->getValueType(0);
19561980 unsigned Opc, MOpc;
19661990
19671991 switch (Opcode) {
19681992 default: break;
1993 case ISD::INTRINSIC_W_CHAIN: {
1994 unsigned IntNo = cast(Node->getOperand(1))->getZExtValue();
1995 switch (IntNo) {
1996 default: break;
1997 case Intrinsic::x86_avx2_gather_d_pd:
1998 return SelectGather(Node, X86::VGATHERDPDrm);
1999 case Intrinsic::x86_avx2_gather_d_pd_256:
2000 return SelectGather(Node, X86::VGATHERDPDYrm);
2001 case Intrinsic::x86_avx2_gather_q_pd:
2002 return SelectGather(Node, X86::VGATHERQPDrm);
2003 case Intrinsic::x86_avx2_gather_q_pd_256:
2004 return SelectGather(Node, X86::VGATHERQPDYrm);
2005 case Intrinsic::x86_avx2_gather_d_ps:
2006 return SelectGather(Node, X86::VGATHERDPSrm);
2007 case Intrinsic::x86_avx2_gather_d_ps_256:
2008 return SelectGather(Node, X86::VGATHERDPSYrm);
2009 case Intrinsic::x86_avx2_gather_q_ps:
2010 return SelectGather(Node, X86::VGATHERQPSrm);
2011 case Intrinsic::x86_avx2_gather_q_ps_256:
2012 return SelectGather(Node, X86::VGATHERQPSYrm);
2013 }
2014 break;
2015 }
19692016 case X86ISD::GlobalBaseReg:
19702017 return getGlobalBaseReg();
19712018
323323 def f128mem : X86MemOperand<"printf128mem"> {
324324 let ParserMatchClass = X86Mem128AsmOperand; }
325325 def f256mem : X86MemOperand<"printf256mem">{
326 let ParserMatchClass = X86Mem256AsmOperand; }
// VSIB memory operands for AVX2 gathers: like f128mem/f256mem, but the
// index register in MIOperandInfo is a vector register (VR128/VR256).
def v128mem : Operand<iPTR> {
  let PrintMethod = "printf128mem";
  let MIOperandInfo = (ops ptr_rc, i8imm, VR128, i32imm, i8imm);
  let ParserMatchClass = X86Mem128AsmOperand; }
def v256mem : Operand<iPTR> {
  let PrintMethod = "printf256mem";
  let MIOperandInfo = (ops ptr_rc, i8imm, VR256, i32imm, i8imm);
  let ParserMatchClass = X86Mem256AsmOperand; }
327335 }
328336
79937993 defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>;
79947994 defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W;
79957995 defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>;
7996
//===----------------------------------------------------------------------===//
// VGATHER - GATHER Operations
//
// Patterns are intentionally empty; selection is done manually in
// X86DAGToDAGISel::SelectGather.  They would look like:
// [(set VR128:$dst, (IntGather128 VR128:$src1, addr:$src2, VR128:$idx,
//                    VR128:$mask, (i8 imm:$sc)))]>, VEX_4VOp3;
// [(set VR256:$dst, (IntGather256 VR256:$src1, addr:$src2, VR256:$idx,
//                    VR256:$mask, (i8 imm:$sc)))]>, VEX_4VOp3;
multiclass avx2_gather<bits<8> opc, string OpcodeStr,
                       Intrinsic IntGather128, Intrinsic IntGather256> {
  def rm  : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
            (ins VR128:$src1, v128mem:$src2, VR128:$mask),
            !strconcat(OpcodeStr,
              "\t{$src1, $src2, $mask|$mask, $src2, $src1}"),
            []>, VEX_4VOp3;
  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
            (ins VR256:$src1, v256mem:$src2, VR256:$mask),
            !strconcat(OpcodeStr,
              "\t{$src1, $src2, $mask|$mask, $src2, $src1}"),
            []>, VEX_4VOp3;
}
8017
// Mask writeback not modeled yet:
//let Constraints = "$src1 = $dst, $mask = $mask_wb" in {
let Constraints = "$src1 = $dst" in {
  defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd",
                                int_x86_avx2_gather_d_pd,
                                int_x86_avx2_gather_d_pd_256>, VEX_W;
  defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd",
                                int_x86_avx2_gather_q_pd,
                                int_x86_avx2_gather_q_pd_256>, VEX_W;
  defm VGATHERDPS : avx2_gather<0x92, "vgatherdps",
                                int_x86_avx2_gather_d_ps,
                                int_x86_avx2_gather_d_ps_256>;
  defm VGATHERQPS : avx2_gather<0x93, "vgatherqps",
                                int_x86_avx2_gather_q_ps,
                                int_x86_avx2_gather_q_ps_256>;
}
975975 ret void
976976 }
977977 declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
978
; CodeGen tests for the eight AVX2 gather intrinsics: each should select the
; corresponding vgather* instruction.
define <2 x double> @test_x86_avx2_gather_d_pd(<2 x double> %a0, i8* %a1,
                      <4 x i32> %idx, <2 x double> %mask) {
  ; CHECK: vgatherdpd
  %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %a0,
                            i8* %a1, <4 x i32> %idx, <2 x double> %mask, i8 2) ;
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, i8*,
                      <4 x i32>, <2 x double>, i8) nounwind readonly

define <4 x double> @test_x86_avx2_gather_d_pd_256(<4 x double> %a0, i8* %a1,
                      <8 x i32> %idx, <4 x double> %mask) {
  ; CHECK: vgatherdpd
  %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %a0,
                            i8* %a1, <8 x i32> %idx, <4 x double> %mask, i8 2) ;
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, i8*,
                      <8 x i32>, <4 x double>, i8) nounwind readonly

define <2 x double> @test_x86_avx2_gather_q_pd(<2 x double> %a0, i8* %a1,
                      <2 x i64> %idx, <2 x double> %mask) {
  ; CHECK: vgatherqpd
  %res = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %a0,
                            i8* %a1, <2 x i64> %idx, <2 x double> %mask, i8 2) ;
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double>, i8*,
                      <2 x i64>, <2 x double>, i8) nounwind readonly

define <4 x double> @test_x86_avx2_gather_q_pd_256(<4 x double> %a0, i8* %a1,
                      <4 x i64> %idx, <4 x double> %mask) {
  ; CHECK: vgatherqpd
  %res = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %a0,
                            i8* %a1, <4 x i64> %idx, <4 x double> %mask, i8 2) ;
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double>, i8*,
                      <4 x i64>, <4 x double>, i8) nounwind readonly

define <4 x float> @test_x86_avx2_gather_d_ps(<4 x float> %a0, i8* %a1,
                      <4 x i32> %idx, <4 x float> %mask) {
  ; CHECK: vgatherdps
  %res = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %a0,
                            i8* %a1, <4 x i32> %idx, <4 x float> %mask, i8 2) ;
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, i8*,
                      <4 x i32>, <4 x float>, i8) nounwind readonly

define <8 x float> @test_x86_avx2_gather_d_ps_256(<8 x float> %a0, i8* %a1,
                      <8 x i32> %idx, <8 x float> %mask) {
  ; CHECK: vgatherdps
  %res = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0,
                            i8* %a1, <8 x i32> %idx, <8 x float> %mask, i8 2) ;
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, i8*,
                      <8 x i32>, <8 x float>, i8) nounwind readonly

define <4 x float> @test_x86_avx2_gather_q_ps(<4 x float> %a0, i8* %a1,
                      <2 x i64> %idx, <4 x float> %mask) {
  ; CHECK: vgatherqps
  %res = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %a0,
                            i8* %a1, <2 x i64> %idx, <4 x float> %mask, i8 2) ;
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float>, i8*,
                      <2 x i64>, <4 x float>, i8) nounwind readonly

define <8 x float> @test_x86_avx2_gather_q_ps_256(<8 x float> %a0, i8* %a1,
                      <4 x i64> %idx, <8 x float> %mask) {
  ; CHECK: vgatherqps
  %res = call <8 x float> @llvm.x86.avx2.gather.q.ps.256(<8 x float> %a0,
                            i8* %a1, <4 x i64> %idx, <8 x float> %mask, i8 2) ;
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx2.gather.q.ps.256(<8 x float>, i8*,
                      <4 x i64>, <8 x float>, i8) nounwind readonly
723723
724724 # CHECK: vpermil2ps $1, 4(%rax), %xmm2, %xmm3, %xmm0
725725 0xc4 0xe3 0xe1 0x48 0x40 0x04 0x21
726
727 # CHECK: vgatherdpd %xmm0, (%rdi,%xmm1,2), %xmm2
728 0xc4 0xe2 0xe9 0x92 0x04 0x4f
729
730 # CHECK: vgatherqps %ymm8, (%r15,%ymm9,2), %ymm10
731 0xc4 0x02 0x2d 0x93 0x04 0x4f
726732
727733 # rdar://8812056 lldb doesn't print the x86 lock prefix when disassembling
728734 # CHECK: lock
41204120 _foo2:
41214121 nop
41224122 vblendvps %ymm1, _foo2(%rip), %ymm0, %ymm0
4123
4124 // CHECK: vgatherdpd %xmm0, (%rdi,%xmm1,2), %xmm2
4125 // CHECK: encoding: [0xc4,0xe2,0xe9,0x92,0x04,0x4f]
4126 vgatherdpd %xmm0, (%rdi,%xmm1,2), %xmm2
4127
4128 // CHECK: vgatherqps %ymm8, (%r15,%ymm9,2), %ymm10
4129 // CHECK: encoding: [0xc4,0x02,0x2d,0x93,0x04,0x4f]
4130 vgatherqps %ymm8, (%r15,%ymm9,2), %ymm10
315315 MEM("i256mem");
316316 MEM("f128mem");
317317 MEM("f256mem");
318 MEM("v128mem");
319 MEM("v256mem");
318320 MEM("opaque512mem");
319321
320322 // all R, I, R, I
11051105 TYPE("VR128", TYPE_XMM128)
11061106 TYPE("f128mem", TYPE_M128)
11071107 TYPE("f256mem", TYPE_M256)
1108 TYPE("v128mem", TYPE_M128)
1109 TYPE("v256mem", TYPE_M256)
11081110 TYPE("FR64", TYPE_XMM64)
11091111 TYPE("f64mem", TYPE_M64FP)
11101112 TYPE("sdmem", TYPE_M64FP)
12341236 ENCODING("sdmem", ENCODING_RM)
12351237 ENCODING("f128mem", ENCODING_RM)
12361238 ENCODING("f256mem", ENCODING_RM)
1239 ENCODING("v128mem", ENCODING_RM)
1240 ENCODING("v256mem", ENCODING_RM)
12371241 ENCODING("f64mem", ENCODING_RM)
12381242 ENCODING("f32mem", ENCODING_RM)
12391243 ENCODING("i128mem", ENCODING_RM)