llvm.org GIT mirror: llvm, commit 84d4a7a
[X86][AVX] Enable extraction of upper 128-bit subvectors for 'half undef' shuffle lowering

Added support for the extraction of the upper 128-bit subvectors for lower/upper half undef shuffles if it would reduce the number of extractions/insertions or avoid loads of AVX2 permps/permd shuffle masks.

Minor follow up to D15477.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@258000 91177308-0d34-0410-b5e6-96231b3b80d8

Author: Simon Pilgrim

5 changed files with 41 additions and 28 deletions.
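To make the effect concrete, here is a minimal sketch of the kind of shuffle the patch targets, modelled on the shuffle_v8f32_5555uuuu test updated further down. The shuffle-mask constants are reconstructed from the test name (the diff rendering below drops the angle-bracketed constants), and the expected instruction sequence is taken from the updated CHECK lines; treat it as an illustration rather than an exact copy of the test file.

; A 'half undef' 256-bit shuffle: only the upper 128-bit half of %a is read
; and the upper half of the result is undef.
define <8 x float> @shuffle_v8f32_5555uuuu(<8 x float> %a, <8 x float> %b) {
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b,
      <8 x i32> <i32 5, i32 5, i32 5, i32 5,
                 i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x float> %shuffle
}
; With this patch (AVX1 and AVX2 alike, per the updated checks below) this
; lowers to an upper-subvector extract plus an in-lane shuffle:
;   vextractf128 $1, %ymm0, %xmm0
;   vpermilps $85, %xmm0, %xmm0    ; xmm0 = xmm0[1,1,1,1]
;   retq
; instead of loading a constant index vector and using vpermps.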
              SDValue V2, ArrayRef<int> Mask,
              const X86Subtarget *Subtarget,
              SelectionDAG &DAG) {
-  assert(VT.getSizeInBits() == 256 && "Expected 256-bit vector");
+  assert(VT.is256BitVector() && "Expected 256-bit vector");
 
   unsigned NumElts = VT.getVectorNumElements();
   unsigned HalfNumElts = NumElts / 2;

     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
                        DAG.getIntPtrConstant(HalfNumElts, DL));
   }
-
-  // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
-  if (UndefLower && Subtarget->hasAVX2() &&
-      (VT == MVT::v4f64 || VT == MVT::v4i64))
-    return SDValue();
 
   // If the shuffle only uses the lower halves of the input operands,
   // then extract them and perform the 'half' shuffle at half width.

     // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
     int HalfIdx = M / HalfNumElts;
 
-    // Only shuffle using the lower halves of the inputs.
-    // TODO: Investigate usefulness of shuffling with upper halves.
-    if (HalfIdx != 0 && HalfIdx != 2)
-      return SDValue();
-
     // Determine the element index into its half vector source.
     int HalfElt = M % HalfNumElts;

     return SDValue();
   }
   assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
+
+  // Only shuffle the halves of the inputs when useful.
+  int NumLowerHalves =
+      (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
+  int NumUpperHalves =
+      (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
+
+  // uuuuXXXX - don't extract uppers just to insert again.
+  if (UndefLower && NumUpperHalves != 0)
+    return SDValue();
+
+  // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
+  if (UndefUpper && NumUpperHalves == 2)
+    return SDValue();
+
+  // AVX2 - XXXXuuuu - always extract lowers.
+  if (Subtarget->hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
+    // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
+    if (VT == MVT::v4f64 || VT == MVT::v4i64)
+      return SDValue();
+    // AVX2 supports variable 32-bit element cross-lane shuffles.
+    if (VT == MVT::v8f32 || VT == MVT::v8i32) {
+      // XXXXuuuu - don't extract lowers and uppers.
+      if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
+        return SDValue();
+    }
+  }
 
   auto GetHalfVector = [&](int HalfIdx) {
     if (HalfIdx < 0)
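The new heuristic above hinges on classifying each mask element into one of four source halves and then counting how many of the referenced halves are upper ones. The following standalone C++ sketch (illustration only, not LLVM code; the names mirror the variables in the diff and the <5,5,5,5,u,u,u,u> mask comes from the v8f32 test below) walks through that arithmetic for one concrete mask:

// Standalone sketch of the half-index arithmetic used above (illustration
// only, not LLVM code). -1 marks an undef mask element.
#include <cstdio>
#include <vector>

int main() {
  const int NumElts = 8, HalfNumElts = NumElts / 2;
  std::vector<int> Mask = {5, 5, 5, 5, -1, -1, -1, -1}; // v8f32 5555uuuu

  // Which source halves does the mask read?
  // 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2, -1 = unused.
  int HalfIdx1 = -1, HalfIdx2 = -1;
  for (int M : Mask) {
    if (M < 0)
      continue;                      // undef element, ignore
    int HalfIdx = M / HalfNumElts;   // which of the four input halves
    int HalfElt = M % HalfNumElts;   // element index within that half
    (void)HalfElt;                   // (used to build the 'half' mask in LLVM)
    if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx)
      HalfIdx1 = HalfIdx;
    else
      HalfIdx2 = HalfIdx;            // (real lowering gives up on a 3rd half)
  }

  // The commit's heuristic: count lower vs. upper source halves to decide
  // whether extracting an upper 128-bit subvector is worthwhile.
  int NumLowerHalves =
      (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
  int NumUpperHalves =
      (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);

  printf("HalfIdx1=%d HalfIdx2=%d lower=%d upper=%d\n",
         HalfIdx1, HalfIdx2, NumLowerHalves, NumUpperHalves);
  // Prints: HalfIdx1=1 HalfIdx2=-1 lower=0 upper=1
  return 0;
}

Here only a single upper half is referenced and the lower half of the result is undef, so the lowering extracts that upper 128-bit subvector and performs the shuffle at 128-bit width rather than loading a vpermps/vpermd index vector.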
 define void @endless_loop() {
 ; CHECK-LABEL: endless_loop:
 ; CHECK-NEXT:  # BB#0:
-; CHECK-NEXT:    vbroadcastss (%eax), %ymm0
+; CHECK-NEXT:    vmovaps (%eax), %ymm0
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
 ; CHECK-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
 ; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-LABEL: test1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; CHECK-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
 ; CHECK-NEXT:    retl
   %b = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32>
   ret <4 x i64> %b
 ; AVX1-LABEL: shuffle_v4i64_22uu:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v4i64_22uu:
 }
 
 define <8 x float> @shuffle_v8f32_5555uuuu(<8 x float> %a, <8 x float> %b) {
-; AVX1-LABEL: shuffle_v8f32_5555uuuu:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v8f32_5555uuuu:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vbroadcastss {{.*}}(%rip), %ymm1
-; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    retq
+; ALL-LABEL: shuffle_v8f32_5555uuuu:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; ALL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32>
   ret <8 x float> %shuffle
 }
 ; AVX1-LABEL: shuffle_v8i32_5555uuuu:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v8i32_5555uuuu:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
-; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32>
   ret <8 x i32> %shuffle