llvm.org GIT mirror llvm / 47abf0e
[X86][SSE] Improved (v)insertps shuffle matching. In the current code we only attempt to match against insertps if we have exactly one element from the second input vector, irrespective of how much of the shuffle result is zeroable. This patch checks to see if there is a single non-zeroable element from either input that requires insertion. It also supports matching of cases where only one of the inputs needs to be referenced. We also split insertps shuffle matching off into a new lowerVectorShuffleAsInsertPS function. Differential Revision: http://reviews.llvm.org/D6879 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@225589 91177308-0d34-0410-b5e6-96231b3b80d8 Simon Pilgrim 5 years ago
4 changed file(s) with 111 addition(s) and 61 deletion(s). Raw diff Collapse all Expand all
81628162 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
81638163 }
81648164
8165 // Try to lower a v4f32 shuffle of V1 and V2 (described by Mask) to a single INSERTPS node.
8166 // We only use INSERTPS when the V1 elements that are kept are already in the correct locations
8167 // because otherwise we can just always use two SHUFPS instructions which
8168 // are much smaller to encode than a SHUFPS and an INSERTPS. Besides inserting one V2 element, we
8169 // can also use INSERTPS when a single V1 element is out of place and all V2
8170 // elements are zeroable. Returns the INSERTPS node, or an empty SDValue() if the mask does not fit.
8171 static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
8172 ArrayRef Mask, // NOTE(review): element type not shown in this view; entries are used as ints below - confirm it is an ArrayRef of int.
8173 SelectionDAG &DAG) {
8174 assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
8175 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8176 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8177 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8178
8179 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8180
8181 unsigned ZMask = 0; // Bit i is set when result lane i is zeroable.
8182 int V1DstIndex = -1; // Result lane that needs an out-of-place V1 element, or -1 if none.
8183 int V2DstIndex = -1; // Result lane that needs a V2 element, or -1 if none.
8184 bool V1UsedInPlace = false; // True once any lane keeps V1's element at the same index.
8185
8186 for (int i = 0; i < 4; i++) {
8187 // Build the zero mask from the zeroable lanes (these include undefs); such lanes need no input.
8188 if (Zeroable[i]) {
8189 ZMask |= 1 << i;
8190 continue;
8191 }
8192
8193 // Lane i takes element i of V1 unchanged, so V1 must stay as the base operand.
8194 if (i == Mask[i]) {
8195 V1UsedInPlace = true;
8196 continue;
8197 }
8198
8199 // Only one non-zeroable element can be inserted; a second candidate means we give up.
8200 if (V1DstIndex != -1 || V2DstIndex != -1)
8201 return SDValue();
8202
8203 if (Mask[i] < 4) {
8204 // Mask index below 4: a V1 element that is out of place and must be inserted into lane i.
8205 V1DstIndex = i;
8206 } else {
8207 // Mask index of 4 or more: a V2 element to be inserted into lane i.
8208 V2DstIndex = i;
8209 }
8210 }
8211
8212 // Nothing to insert (every lane is zeroable or already in place), so do not form an INSERTPS here.
8213 if (V1DstIndex == -1 && V2DstIndex == -1)
8214 return SDValue();
8215
8216 // Determine element insertion src/dst indices. The src index is relative to the
8217 // start of the vector being inserted from, not the start of the concatenated V1:V2 vector.
8218 unsigned V2SrcIndex = 0;
8219 if (V1DstIndex != -1) {
8220 // The element to insert comes from V1 itself: pass V1 as the insertion source operand
8221 // and drop the original V2 entirely (reaching here with V1DstIndex set means no V2 lane was recorded).
8222 V2SrcIndex = Mask[V1DstIndex];
8223 V2DstIndex = V1DstIndex;
8224 V2 = V1;
8225 } else {
8226 V2SrcIndex = Mask[V2DstIndex] - 4; // Rebase the concatenated index to an index within V2.
8227 }
8228
8229 // If no V1 inputs are used in place, then the result is created only from
8230 // the zero mask and the inserted element - so replace V1 with undef to remove the dependency.
8231 if (!V1UsedInPlace)
8232 V1 = DAG.getUNDEF(MVT::v4f32);
8233
8234 unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask; // Packed as (src << 6) | (dst << 4) | zero mask.
8235 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
8236
8237 // Emit the INSERTPS node with the packed mask as an i8 constant operand.
8238 SDLoc DL(Op);
8239 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
8240 DAG.getConstant(InsertPSMask, MVT::i8));
8241 }
8242
81658243 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
81668244 ///
81678245 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
84678545 Mask, Subtarget, DAG))
84688546 return V;
84698547
8470 if (Subtarget->hasSSE41())
8548 if (Subtarget->hasSSE41()) {
84718549 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
84728550 Subtarget, DAG))
84738551 return Blend;
84748552
8475 // Check for whether we can use INSERTPS to perform the blend. We only use
8476 // INSERTPS when the V1 elements are already in the correct locations
8477 // because otherwise we can just always use two SHUFPS instructions which
8478 // are much smaller to encode than a SHUFPS and an INSERTPS.
8479 if (NumV2Elements == 1 && Subtarget->hasSSE41()) {
8480 int V2Index =
8481 std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
8482 Mask.begin();
8483
8484 // When using INSERTPS we can zero any lane of the destination. Collect
8485 // the zero inputs into a mask and drop them from the lanes of V1 which
8486 // actually need to be present as inputs to the INSERTPS.
8487 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8488
8489 // Synthesize a shuffle mask for the non-zero and non-v2 inputs.
8490 bool InsertNeedsShuffle = false;
8491 unsigned ZMask = 0;
8492 for (int i = 0; i < 4; ++i)
8493 if (i != V2Index) {
8494 if (Zeroable[i]) {
8495 ZMask |= 1 << i;
8496 } else if (Mask[i] != i) {
8497 InsertNeedsShuffle = true;
8498 break;
8499 }
8500 }
8501
8502 // We don't want to use INSERTPS or other insertion techniques if it will
8503 // require shuffling anyways.
8504 if (!InsertNeedsShuffle) {
8505 // If all of V1 is zeroable, replace it with undef.
8506 if ((ZMask | 1 << V2Index) == 0xF)
8507 V1 = DAG.getUNDEF(MVT::v4f32);
8508
8509 unsigned InsertPSMask = (Mask[V2Index] - 4) << 6 | V2Index << 4 | ZMask;
8510 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
8511
8512 // Insert the V2 element into the desired position.
8513 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
8514 DAG.getConstant(InsertPSMask, MVT::i8));
8515 }
8553 // Use INSERTPS if we can complete the shuffle efficiently.
8554 if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG))
8555 return V;
85168556 }
85178557
85188558 // Otherwise fall back to a SHUFPS lowering strategy.
239239 ; CHECK-LABEL: test19:
240240 ; CHECK: # BB#0:
241241 ; CHECK-NEXT: xorps %xmm2, %xmm2
242 ; CHECK-NEXT: xorps %xmm3, %xmm3
243 ; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[0,3]
244 ; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,1,3]
245 ; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[0,0]
246 ; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,2]
247 ; CHECK-NEXT: orps %xmm3, %xmm2
242 ; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,3]
243 ; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
244 ; CHECK-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],zero,xmm1[2,2]
245 ; CHECK-NEXT: orps %xmm1, %xmm2
248246 ; CHECK-NEXT: movaps %xmm2, %xmm0
249247 ; CHECK-NEXT: retq
250248 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32>
7070 ; AVX2-LABEL: test5
7171 ; AVX2: vmaskmovpd
7272 ; AVX2: vblendvpd
73 ; AVX2: vmaskmovpd
73 ; AVX2: vmaskmovpd
7474 ; AVX2: vblendvpd
7575 define <8 x double> @test5(<8 x i32> %trigger, <8 x double>* %addr, <8 x double> %dst) {
7676 %mask = icmp eq <8 x i32> %trigger, zeroinitializer
149149 }
150150
151151 ; AVX2-LABEL: test14
152 ; AVX2: vshufps $-24
152 ; AVX2: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
153153 ; AVX2: vmaskmovps
154154 define void @test14(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
155155 %mask = icmp eq <2 x i32> %trigger, zeroinitializer
193193 }
194194
195195
196 declare <16 x i32> @llvm.masked.load.v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
196 declare <16 x i32> @llvm.masked.load.v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
197197 declare <4 x i32> @llvm.masked.load.v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
198198 declare <2 x i32> @llvm.masked.load.v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)
199199 declare void @llvm.masked.store.v16i32(<16 x i32>, <16 x i32>*, i32, <16 x i1>)
201201 declare void @llvm.masked.store.v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
202202 declare void @llvm.masked.store.v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>)
203203 declare void @llvm.masked.store.v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>)
204 declare void @llvm.masked.store.v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)
205 declare void @llvm.masked.store.v16f32p(<16 x float>*, <16 x float>**, i32, <16 x i1>)
204 declare void @llvm.masked.store.v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)
205 declare void @llvm.masked.store.v16f32p(<16 x float>*, <16 x float>**, i32, <16 x i1>)
206206 declare <16 x float> @llvm.masked.load.v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>)
207207 declare <8 x float> @llvm.masked.load.v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)
208208 declare <4 x float> @llvm.masked.load.v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
552552 }
553553
554554 define <4 x i32> @combine_bitwise_ops_test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
555 ; SSE-LABEL: combine_bitwise_ops_test3c:
556 ; SSE: # BB#0:
557 ; SSE-NEXT: xorps %xmm1, %xmm0
558 ; SSE-NEXT: xorps %xmm1, %xmm1
559 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
560 ; SSE-NEXT: retq
555 ; SSE2-LABEL: combine_bitwise_ops_test3c:
556 ; SSE2: # BB#0:
557 ; SSE2-NEXT: xorps %xmm1, %xmm0
558 ; SSE2-NEXT: xorps %xmm1, %xmm1
559 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
560 ; SSE2-NEXT: retq
561 ;
562 ; SSSE3-LABEL: combine_bitwise_ops_test3c:
563 ; SSSE3: # BB#0:
564 ; SSSE3-NEXT: xorps %xmm1, %xmm0
565 ; SSSE3-NEXT: xorps %xmm1, %xmm1
566 ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
567 ; SSSE3-NEXT: retq
568 ;
569 ; SSE41-LABEL: combine_bitwise_ops_test3c:
570 ; SSE41: # BB#0:
571 ; SSE41-NEXT: xorps %xmm1, %xmm0
572 ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
573 ; SSE41-NEXT: retq
561574 ;
562575 ; AVX-LABEL: combine_bitwise_ops_test3c:
563576 ; AVX: # BB#0:
564577 ; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0
565 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
566 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
578 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
567579 ; AVX-NEXT: retq
568580 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32>
569581 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32>