llvm.org GIT mirror llvm / 9139e91
[X86][AVX] createVariablePermute - widen permutes for cases where the source vector is wider than the destination type. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@327244 91177308-0d34-0410-b5e6-96231b3b80d8 (Simon Pilgrim, 2 years ago)
3 changed file(s) with 119 addition(s) and 88 deletion(s). Raw diff Collapse all Expand all
79407940 MVT ShuffleVT = VT;
79417941 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
79427942 unsigned NumElts = VT.getVectorNumElements();
7943 unsigned SizeInBits = VT.getSizeInBits();
79437944
79447945 // Adjust IndicesVec to match VT size.
79457946 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
79497950 NumElts * VT.getScalarSizeInBits());
79507951 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
79517952
7952 // Adjust SrcVec to match VT type.
7953 if (SrcVec.getValueSizeInBits() > VT.getSizeInBits())
7954 return SDValue();
7955 else if (SrcVec.getValueSizeInBits() < VT.getSizeInBits())
7956 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
7953 // Handle SrcVec that don't match VT type.
7954 if (SrcVec.getValueSizeInBits() != SizeInBits) {
7955 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
7956 // Handle larger SrcVec by treating it as a larger permute.
7957 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
7958 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
7959 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
7960 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
7961 Subtarget, DAG, SDLoc(IndicesVec));
7962 return extractSubVector(
7963 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget), 0,
7964 DAG, DL, SizeInBits);
7965 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
7966 // Widen smaller SrcVec to match VT.
7967 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
7968 } else
7969 return SDValue();
7970 }
79577971
79587972 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
79597973 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
565565 ; SSE41-NEXT: popq %rbp
566566 ; SSE41-NEXT: retq
567567 ;
568 ; AVX-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
569 ; AVX: # %bb.0:
570 ; AVX-NEXT: pushq %rbp
571 ; AVX-NEXT: movq %rsp, %rbp
572 ; AVX-NEXT: andq $-32, %rsp
573 ; AVX-NEXT: subq $64, %rsp
574 ; AVX-NEXT: vpextrb $0, %xmm1, %eax
575 ; AVX-NEXT: vmovaps %ymm0, (%rsp)
576 ; AVX-NEXT: andl $31, %eax
577 ; AVX-NEXT: movzbl (%rsp,%rax), %eax
578 ; AVX-NEXT: vmovd %eax, %xmm0
579 ; AVX-NEXT: vpextrb $1, %xmm1, %eax
580 ; AVX-NEXT: andl $31, %eax
581 ; AVX-NEXT: vpinsrb $1, (%rsp,%rax), %xmm0, %xmm0
582 ; AVX-NEXT: vpextrb $2, %xmm1, %eax
583 ; AVX-NEXT: andl $31, %eax
584 ; AVX-NEXT: vpinsrb $2, (%rsp,%rax), %xmm0, %xmm0
585 ; AVX-NEXT: vpextrb $3, %xmm1, %eax
586 ; AVX-NEXT: andl $31, %eax
587 ; AVX-NEXT: vpinsrb $3, (%rsp,%rax), %xmm0, %xmm0
588 ; AVX-NEXT: vpextrb $4, %xmm1, %eax
589 ; AVX-NEXT: andl $31, %eax
590 ; AVX-NEXT: vpinsrb $4, (%rsp,%rax), %xmm0, %xmm0
591 ; AVX-NEXT: vpextrb $5, %xmm1, %eax
592 ; AVX-NEXT: andl $31, %eax
593 ; AVX-NEXT: vpinsrb $5, (%rsp,%rax), %xmm0, %xmm0
594 ; AVX-NEXT: vpextrb $6, %xmm1, %eax
595 ; AVX-NEXT: andl $31, %eax
596 ; AVX-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0
597 ; AVX-NEXT: vpextrb $7, %xmm1, %eax
598 ; AVX-NEXT: andl $31, %eax
599 ; AVX-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0
600 ; AVX-NEXT: vpextrb $8, %xmm1, %eax
601 ; AVX-NEXT: andl $31, %eax
602 ; AVX-NEXT: vpinsrb $8, (%rsp,%rax), %xmm0, %xmm0
603 ; AVX-NEXT: vpextrb $9, %xmm1, %eax
604 ; AVX-NEXT: andl $31, %eax
605 ; AVX-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0
606 ; AVX-NEXT: vpextrb $10, %xmm1, %eax
607 ; AVX-NEXT: andl $31, %eax
608 ; AVX-NEXT: vpinsrb $10, (%rsp,%rax), %xmm0, %xmm0
609 ; AVX-NEXT: vpextrb $11, %xmm1, %eax
610 ; AVX-NEXT: andl $31, %eax
611 ; AVX-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0
612 ; AVX-NEXT: vpextrb $12, %xmm1, %eax
613 ; AVX-NEXT: andl $31, %eax
614 ; AVX-NEXT: vpinsrb $12, (%rsp,%rax), %xmm0, %xmm0
615 ; AVX-NEXT: vpextrb $13, %xmm1, %eax
616 ; AVX-NEXT: andl $31, %eax
617 ; AVX-NEXT: vpinsrb $13, (%rsp,%rax), %xmm0, %xmm0
618 ; AVX-NEXT: vpextrb $14, %xmm1, %eax
619 ; AVX-NEXT: andl $31, %eax
620 ; AVX-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0
621 ; AVX-NEXT: vpextrb $15, %xmm1, %eax
622 ; AVX-NEXT: andl $31, %eax
623 ; AVX-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0
624 ; AVX-NEXT: movq %rbp, %rsp
625 ; AVX-NEXT: popq %rbp
626 ; AVX-NEXT: vzeroupper
627 ; AVX-NEXT: retq
568 ; XOP-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
569 ; XOP: # %bb.0:
570 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2
571 ; XOP-NEXT: vpperm %xmm1, %xmm2, %xmm0, %xmm0
572 ; XOP-NEXT: vzeroupper
573 ; XOP-NEXT: retq
574 ;
575 ; AVX1-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
576 ; AVX1: # %bb.0:
577 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
578 ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm2
579 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
580 ; AVX1-NEXT: vpcmpgtb {{.*}}(%rip), %xmm1, %xmm1
581 ; AVX1-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
582 ; AVX1-NEXT: vzeroupper
583 ; AVX1-NEXT: retq
584 ;
585 ; AVX2-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
586 ; AVX2: # %bb.0:
587 ; AVX2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
588 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
589 ; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm2
590 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
591 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
592 ; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
593 ; AVX2-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
594 ; AVX2-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
595 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
596 ; AVX2-NEXT: vzeroupper
597 ; AVX2-NEXT: retq
598 ;
599 ; AVX512F-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
600 ; AVX512F: # %bb.0:
601 ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
602 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
603 ; AVX512F-NEXT: vpshufb %ymm1, %ymm2, %ymm2
604 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3
605 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
606 ; AVX512F-NEXT: vpshufb %ymm1, %ymm0, %ymm0
607 ; AVX512F-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
608 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
609 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
610 ; AVX512F-NEXT: vzeroupper
611 ; AVX512F-NEXT: retq
612 ;
613 ; AVX512VL-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
614 ; AVX512VL: # %bb.0:
615 ; AVX512VL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
616 ; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
617 ; AVX512VL-NEXT: vpshufb %ymm1, %ymm2, %ymm2
618 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm3
619 ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
620 ; AVX512VL-NEXT: vpshufb %ymm1, %ymm0, %ymm0
621 ; AVX512VL-NEXT: vpcmpgtb {{.*}}(%rip), %ymm1, %ymm1
622 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
623 ; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
624 ; AVX512VL-NEXT: vzeroupper
625 ; AVX512VL-NEXT: retq
626 ;
627 ; VBMI-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
628 ; VBMI: # %bb.0:
629 ; VBMI-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
630 ; VBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
631 ; VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
632 ; VBMI-NEXT: vzeroupper
633 ; VBMI-NEXT: retq
628634 %index0 = extractelement <16 x i8> %indices, i32 0
629635 %index1 = extractelement <16 x i8> %indices, i32 1
630636 %index2 = extractelement <16 x i8> %indices, i32 2
16541654 }
16551655
16561656 define <4 x i32> @var_shuffle_v4i32_from_v8i32(<8 x i32> %v, <4 x i32> %indices) unnamed_addr nounwind {
1657 ; AVX-LABEL: var_shuffle_v4i32_from_v8i32:
1658 ; AVX: # %bb.0: # %entry
1659 ; AVX-NEXT: pushq %rbp
1660 ; AVX-NEXT: movq %rsp, %rbp
1661 ; AVX-NEXT: andq $-32, %rsp
1662 ; AVX-NEXT: subq $64, %rsp
1663 ; AVX-NEXT: vmovd %xmm1, %eax
1664 ; AVX-NEXT: vmovaps %ymm0, (%rsp)
1665 ; AVX-NEXT: andl $7, %eax
1666 ; AVX-NEXT: vpextrd $1, %xmm1, %ecx
1667 ; AVX-NEXT: andl $7, %ecx
1668 ; AVX-NEXT: vpextrd $2, %xmm1, %edx
1669 ; AVX-NEXT: andl $7, %edx
1670 ; AVX-NEXT: vpextrd $3, %xmm1, %esi
1671 ; AVX-NEXT: andl $7, %esi
1672 ; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1673 ; AVX-NEXT: vpinsrd $1, (%rsp,%rcx,4), %xmm0, %xmm0
1674 ; AVX-NEXT: vpinsrd $2, (%rsp,%rdx,4), %xmm0, %xmm0
1675 ; AVX-NEXT: vpinsrd $3, (%rsp,%rsi,4), %xmm0, %xmm0
1676 ; AVX-NEXT: movq %rbp, %rsp
1677 ; AVX-NEXT: popq %rbp
1678 ; AVX-NEXT: vzeroupper
1679 ; AVX-NEXT: retq
1657 ; XOP-LABEL: var_shuffle_v4i32_from_v8i32:
1658 ; XOP: # %bb.0: # %entry
1659 ; XOP-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
1660 ; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
1661 ; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1662 ; XOP-NEXT: vpermil2ps $0, %ymm1, %ymm2, %ymm0, %ymm0
1663 ; XOP-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1664 ; XOP-NEXT: vzeroupper
1665 ; XOP-NEXT: retq
1666 ;
1667 ; AVX1-LABEL: var_shuffle_v4i32_from_v8i32:
1668 ; AVX1: # %bb.0: # %entry
1669 ; AVX1-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
1670 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3]
1671 ; AVX1-NEXT: vpermilps %ymm1, %ymm2, %ymm2
1672 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1673 ; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0
1674 ; AVX1-NEXT: vmovdqa {{.*#+}} ymm3 = [3,3,3,3,3,3,3,3]
1675 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
1676 ; AVX1-NEXT: vpcmpgtd %xmm4, %xmm0, %xmm4
1677 ; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1
1678 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
1679 ; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0
1680 ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1681 ; AVX1-NEXT: vzeroupper
1682 ; AVX1-NEXT: retq
1683 ;
1684 ; INT256-LABEL: var_shuffle_v4i32_from_v8i32:
1685 ; INT256: # %bb.0: # %entry
1686 ; INT256-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
1687 ; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0
1688 ; INT256-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
1689 ; INT256-NEXT: vzeroupper
1690 ; INT256-NEXT: retq
16801691 entry:
16811692 %tmp1 = extractelement <4 x i32> %indices, i32 0
16821693 %vecext2.8 = extractelement <8 x i32> %v, i32 %tmp1