llvm.org GIT mirror llvm / cd2a3f9
[X86][AVX2] Bugzilla bug 21281: performance regression in vector interleave in AVX2

This is a patch for the ongoing Bugzilla bug 21281 on the X86 code generated for a matrix transpose8x8 subroutine, which requires vector interleaving. The code currently generated for AVX2 is non-optimal: it takes 60 instructions, as opposed to only 40 instructions generated for AVX1.

The patch handles the AVX2 case where vector unpack instructions use fewer operations than the vector blend operations available in AVX2; in that case, using vector unpack instructions is more efficient.

Reviewers: zvi, delena, igorb, craig.topper, guyblank, eladcohen, m_zuckerman, aymanmus, RKSimon

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@298840 91177308-0d34-0410-b5e6-96231b3b80d8

Gadi Haber (3 years ago)
3 changed files with 67 additions and 56 deletions.
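As background for the diff below, the interleave step that regressed is the operation the patch steers AVX2 toward expressing with unpack instructions. A minimal intrinsics sketch of that building block (illustration only, not part of the patch; the helper name interleave8x16 is made up):

#include <immintrin.h>

// Illustration only: interleave two vectors of 8 x i16, the basic step of
// the transpose8x8 routine behind bug 21281. punpcklwd/punpckhwd produce
// {a0,b0,a1,b1,a2,b2,a3,b3} and {a4,b4,a5,b5,a6,b6,a7,b7}.
static inline void interleave8x16(__m128i a, __m128i b,
                                  __m128i *lo, __m128i *hi) {
  *lo = _mm_unpacklo_epi16(a, b); // a0,b0,a1,b1,a2,b2,a3,b3
  *hi = _mm_unpackhi_epi16(a, b); // a4,b4,a5,b5,a6,b6,a7,b7
}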
@@ -8206,6 +8206,23 @@
   return true;
 }
 
+// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
+// instructions.
+static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
+  if (VT != MVT::v8i32 && VT != MVT::v8f32)
+    return false;
+
+  SmallVector<int, 8> Unpcklwd;
+  createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
+                          /* Unary = */ false);
+  SmallVector<int, 8> Unpckhwd;
+  createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
+                          /* Unary = */ false);
+  bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
+                         isTargetShuffleEquivalent(Mask, Unpckhwd));
+  return IsUnpackwdMask;
+}
+
 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
 ///
 /// This helper function produces an 8-bit shuffle immediate corresponding to
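To make the new predicate concrete: for MVT::v8i16, createUnpackShuffleMask produces the two masks below, so a v8i32/v8f32 shuffle mask qualifies when it matches one of them element-wise. A standalone sketch, assuming plain std::array in place of SmallVector/ArrayRef and reducing isTargetShuffleEquivalent's undef handling to treating -1 as a wildcard:

#include <array>

// Reference masks built by createUnpackShuffleMask(MVT::v8i16, ...):
// binary low unpack and binary high unpack of two 8-element vectors.
constexpr std::array<int, 8> Unpcklwd = {0, 8, 1, 9, 2, 10, 3, 11};
constexpr std::array<int, 8> Unpckhwd = {4, 12, 5, 13, 6, 14, 7, 15};

// Simplified stand-in for the isTargetShuffleEquivalent calls above:
// -1 marks an undef lane and is allowed to match anything.
bool looksLikeUnpackWdMask(const std::array<int, 8> &Mask) {
  auto Matches = [&](const std::array<int, 8> &Ref) {
    for (int I = 0; I != 8; ++I)
      if (Mask[I] >= 0 && Mask[I] != Ref[I])
        return false;
    return true;
  };
  return Matches(Unpcklwd) || Matches(Unpckhwd);
}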
@@ -12739,6 +12756,14 @@
                                               V1, V2, DAG, Subtarget))
     return V;
 
+  // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
+  // since after split we get a more efficient code using vpunpcklwd and
+  // vpunpckhwd instrs than vblend.
+  if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
+    if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
+                                                     Mask, DAG))
+      return V;
+
   // If we have AVX2 then we always want to lower with a blend because at v8 we
   // can fully permute the elements.
   if (Subtarget.hasAVX2())
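The effect of this early split for a qualifying v8f32 mask such as <0,8,1,9,2,10,3,11> (see shuffle_v8f32_08192a3b further down) can be sketched with intrinsics; this is an illustration of the resulting instruction pattern, not code from the patch, and interleave_lo_ps is a made-up name:

#include <immintrin.h>

// Sketch of the split-then-unpack pattern for the v8f32 mask
// <0,8,1,9,2,10,3,11>: unpack the low 128-bit halves of a and b and
// recombine, instead of two vpermps feeding a vblendps.
static inline __m256 interleave_lo_ps(__m256 a, __m256 b) {
  __m128 ALo = _mm256_castps256_ps128(a); // a0,a1,a2,a3
  __m128 BLo = _mm256_castps256_ps128(b); // b0,b1,b2,b3
  __m128 Lo = _mm_unpacklo_ps(ALo, BLo);  // a0,b0,a1,b1
  __m128 Hi = _mm_unpackhi_ps(ALo, BLo);  // a2,b2,a3,b3
  return _mm256_insertf128_ps(_mm256_castps128_ps256(Lo), Hi, 1);
}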
@@ -12769,6 +12794,14 @@
   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
           DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
     return ZExt;
+
+  // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
+  // since after split we get a more efficient code than vblend by using
+  // vpunpcklwd and vpunpckhwd instrs.
+  if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !Subtarget.hasAVX512())
+    if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2,
+                                                     Mask, DAG))
+      return V;
 
   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
                                                 Zeroable, Subtarget, DAG))
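The v8i32 path benefits in the same way; a matching sketch (again illustrative only, with a made-up helper name) of the vpunpckldq/vpunpckhdq/vinserti128 sequence that the updated shuffle_v8i32_08192a3b check lines expect:

#include <immintrin.h>

// Integer analogue of the float sketch above, for mask <0,8,1,9,2,10,3,11>.
static inline __m256i interleave_lo_epi32(__m256i a, __m256i b) {
  __m128i ALo = _mm256_castsi256_si128(a);
  __m128i BLo = _mm256_castsi256_si128(b);
  __m128i Lo = _mm_unpacklo_epi32(ALo, BLo); // a0,b0,a1,b1
  __m128i Hi = _mm_unpackhi_epi32(ALo, BLo); // a2,b2,a3,b3
  return _mm256_inserti128_si256(_mm256_castsi128_si256(Lo), Hi, 1);
}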
@@ -92,44 +92,32 @@
 ; AVX2: # BB#0:
 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 =
-; AVX2-NEXT: vpermd %ymm1, %ymm9, %ymm3
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 =
-; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm8 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm1[1],ymm8[2],ymm1[3],ymm8[4],ymm1[5],ymm8[6],ymm1[7]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm8[2],xmm1[2],xmm8[3],xmm1[3]
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
-; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5
-; AVX2-NEXT: vpermd %ymm5, %ymm9, %ymm6
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2],ymm6[3],ymm4[4],ymm6[5],ymm4[6],ymm6[7]
-; AVX2-NEXT: vpermd %ymm5, %ymm0, %ymm0
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2
-; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm4
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm3, %ymm3
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm4[2],xmm6[2],xmm4[3],xmm6[3]
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
+; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
 ; AVX2-NEXT: retq
 %ab = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32>
 %cd = shufflevector <8 x i16> %c, <8 x i16> %d, <16 x i32>
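The check lines above exercise interleaves of 8 x i16 vectors, which are exactly the unpack steps an 8x8 16-bit transpose is built from. For reference, a plain SSE2 sketch of such a transpose8x8 routine (an illustration assuming eight 128-bit rows, not code taken from the test file or the bug report):

#include <immintrin.h>

// Transpose an 8x8 matrix of 16-bit elements held in eight 128-bit rows,
// using three rounds of interleaves (24 unpack instructions in total).
static inline void transpose8x8_epi16(__m128i r[8]) {
  // Stage 1: interleave 16-bit elements of adjacent rows.
  __m128i a0 = _mm_unpacklo_epi16(r[0], r[1]); // 00 10 01 11 02 12 03 13
  __m128i a1 = _mm_unpackhi_epi16(r[0], r[1]); // 04 14 05 15 06 16 07 17
  __m128i a2 = _mm_unpacklo_epi16(r[2], r[3]);
  __m128i a3 = _mm_unpackhi_epi16(r[2], r[3]);
  __m128i a4 = _mm_unpacklo_epi16(r[4], r[5]);
  __m128i a5 = _mm_unpackhi_epi16(r[4], r[5]);
  __m128i a6 = _mm_unpacklo_epi16(r[6], r[7]);
  __m128i a7 = _mm_unpackhi_epi16(r[6], r[7]);
  // Stage 2: interleave 32-bit pairs.
  __m128i b0 = _mm_unpacklo_epi32(a0, a2);     // 00 10 20 30 01 11 21 31
  __m128i b1 = _mm_unpackhi_epi32(a0, a2);     // 02 12 22 32 03 13 23 33
  __m128i b2 = _mm_unpacklo_epi32(a1, a3);
  __m128i b3 = _mm_unpackhi_epi32(a1, a3);
  __m128i b4 = _mm_unpacklo_epi32(a4, a6);
  __m128i b5 = _mm_unpackhi_epi32(a4, a6);
  __m128i b6 = _mm_unpacklo_epi32(a5, a7);
  __m128i b7 = _mm_unpackhi_epi32(a5, a7);
  // Stage 3: interleave 64-bit quads to finish the transpose.
  r[0] = _mm_unpacklo_epi64(b0, b4);           // 00 10 20 30 40 50 60 70
  r[1] = _mm_unpackhi_epi64(b0, b4);
  r[2] = _mm_unpacklo_epi64(b1, b5);
  r[3] = _mm_unpackhi_epi64(b1, b5);
  r[4] = _mm_unpacklo_epi64(b2, b6);
  r[5] = _mm_unpackhi_epi64(b2, b6);
  r[6] = _mm_unpacklo_epi64(b3, b7);
  r[7] = _mm_unpackhi_epi64(b3, b7);
}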
@@ -265,21 +265,12 @@
 }
 
 define <8 x float> @shuffle_v8f32_08192a3b(<8 x float> %a, <8 x float> %b) {
-; AVX1-LABEL: shuffle_v8f32_08192a3b:
-; AVX1: # BB#0:
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v8f32_08192a3b:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm2 =
-; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,1,u,2,u,3,u>
-; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX2-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v8f32_08192a3b:
+; AVX1OR2: # BB#0:
+; AVX1OR2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1OR2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1OR2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1OR2-NEXT: retq
 ;
 ; AVX512VL-LABEL: shuffle_v8f32_08192a3b:
 ; AVX512VL: # BB#0:
@@ -1220,10 +1211,9 @@
 ;
 ; AVX2-LABEL: shuffle_v8i32_08192a3b:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 =
-; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512VL-LABEL: shuffle_v8i32_08192a3b: