llvm.org GIT mirror llvm / 4ac9081
This commit contains a few changes that had to go in together.

1. Simplify xor/and/or (bitcast(A), bitcast(B)) -> bitcast(op (A,B))
   (and also scalar_to_vector).

2. Xor/and/or are indifferent to the swizzle operation (shuffle of one src).
   Simplify xor/and/or (shuff(A), shuff(B)) -> shuff(op (A, B)).

3. Optimize swizzles of shuffles: shuff(shuff(x, y), undef) -> shuff(x, y).

4. Fix an X86ISelLowering optimization which was very bitcast-sensitive.

Code which was previously compiled to this:

  movd    (%rsi), %xmm0
  movdqa  .LCPI0_0(%rip), %xmm2
  pshufb  %xmm2, %xmm0
  movd    (%rdi), %xmm1
  pshufb  %xmm2, %xmm1
  pxor    %xmm0, %xmm1
  pshufb  .LCPI0_1(%rip), %xmm1
  movd    %xmm1, (%rdi)
  ret

Now compiles to this:

  movl    (%rsi), %eax
  xorl    %eax, (%rdi)
  ret

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@153848 91177308-0d34-0410-b5e6-96231b3b80d8

Nadav Rotem
8 changed file(s) with 127 addition(s) and 22 deletion(s).
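For reference, the before/after assembly in the commit message corresponds to IR of roughly the following shape (a minimal sketch that mirrors the new pull_bitcast regression test added further down; the function name is illustrative). Two <4 x i8> values are loaded, xor'ed, and stored back; with the new combines the whole sequence lowers to the single scalar xorl shown above.

; Editor's sketch, not part of the commit; mirrors the pull_bitcast test below.
define void @xor_example(<4 x i8>* %pA, <4 x i8>* %pB) {
  ; <4 x i8> is an illegal type, so type legalization introduces the bitcasts
  ; and shuffles that the new combines can now look through
  %A = load <4 x i8>* %pA
  %B = load <4 x i8>* %pB
  %C = xor <4 x i8> %A, %B
  store <4 x i8> %C, <4 x i8>* %pA
  ret void
}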
23352335 ORNode, N0.getOperand(1));
23362336 }
23372337
2338 // Simplify xor/and/or (bitcast(A), bitcast(B)) -> bitcast(op (A,B))
2339 // Only perform this optimization after type legalization and before
2340 // LegalizeVectorOps. LegalizeVectorOps promotes vector operations by
2341 // adding bitcasts. For example (xor v4i32) is promoted to (v2i64), and
2342 // we don't want to undo this promotion.
2343 // We also handle SCALAR_TO_VECTOR because xor/or/and operations are cheaper
2344 // on scalars.
2345 if ((N0.getOpcode() == ISD::BITCAST || N0.getOpcode() == ISD::SCALAR_TO_VECTOR)
2346 && Level == AfterLegalizeVectorOps) {
2347 SDValue In0 = N0.getOperand(0);
2348 SDValue In1 = N1.getOperand(0);
2349 EVT In0Ty = In0.getValueType();
2350 EVT In1Ty = In1.getValueType();
2351 // If both incoming values are integers, and the original types are the same.
2352 if (In0Ty.isInteger() && In1Ty.isInteger() && In0Ty == In1Ty) {
2353 SDValue Op = DAG.getNode(N->getOpcode(), N->getDebugLoc(), In0Ty, In0, In1);
2354 SDValue BC = DAG.getNode(N0.getOpcode(), N->getDebugLoc(), VT, Op);
2355 AddToWorkList(Op.getNode());
2356 return BC;
2357 }
2358 }
2359
2360 // Xor/and/or are indifferent to the swizzle operation (shuffle of one value).
2361 // Simplify xor/and/or (shuff(A), shuff(B)) -> shuff(op (A,B))
2362 // If both shuffles use the same mask, and both shuffle within a single
2363 // vector, then it is worthwhile to move the swizzle after the operation.
2364 // The type-legalizer generates this pattern when loading illegal
2365 // vector types from memory. In many cases this allows additional shuffle
2366 // optimizations.
2367 if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG) {
2368 ShuffleVectorSDNode *SVN0 = cast<ShuffleVectorSDNode>(N0);
2369 ShuffleVectorSDNode *SVN1 = cast<ShuffleVectorSDNode>(N1);
2370 SDValue In0 = SVN0->getOperand(0);
2371 SDValue In1 = SVN1->getOperand(0);
2372 EVT In0Ty = In0.getValueType();
2373 EVT In1Ty = In1.getValueType();
2374
2375 unsigned NumElts = VT.getVectorNumElements();
2376 // Check that both shuffles are swizzles.
2377 bool SingleVecShuff = (N0.getOperand(1).getOpcode() == ISD::UNDEF &&
2378 N1.getOperand(1).getOpcode() == ISD::UNDEF);
2379
2380 // Check that both shuffles use the same mask. The masks are known to be of
2381 // the same length because the result vector type is the same.
2382 bool SameMask = true;
2383 for (unsigned i = 0; i != NumElts; ++i) {
2384 int Idx0 = SVN0->getMaskElt(i);
2385 int Idx1 = SVN1->getMaskElt(i);
2386 if (Idx0 != Idx1) {
2387 SameMask = false;
2388 break;
2389 }
2390 }
2391
2392 if (SameMask && SingleVecShuff && In0Ty == In1Ty) {
2393 SDValue Op = DAG.getNode(N->getOpcode(), N->getDebugLoc(), VT, In0, In1);
2394 SDValue Shuff = DAG.getVectorShuffle(VT, N->getDebugLoc(), Op,
2395 DAG.getUNDEF(VT), &SVN0->getMask()[0]);
2396 AddToWorkList(Op.getNode());
2397 return Shuff;
2398 }
2399 }
23382400 return SDValue();
23392401 }
23402402
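To illustrate the second combine above (a hypothetical example, not part of this commit): when both operands of the logic operation are single-source shuffles with the same mask, the operation is performed first and swizzled only once.

; Hypothetical IR whose DAG matches the new pattern: both xor operands are
; single-source shuffles (second operand undef) with identical masks.
define <4 x i32> @xor_of_swizzles(<4 x i32> %a, <4 x i32> %b) {
  %sa = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %sb = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %x = xor <4 x i32> %sa, %sb
  ret <4 x i32> %x
}
; After the combine, xor(%a, %b) is computed first and the <3,2,1,0> swizzle
; is applied once to the result.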
77207782 return N0;
77217783 }
77227784 }
7785
7786 // If this shuffle node is simply a swizzle of another shuffle node,
7787 // optimize shuffle(shuffle(x, y), undef) -> shuffle(x, y).
7788 if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG &&
7789 N1.getOpcode() == ISD::UNDEF) {
7790
7791 SmallVector<int, 8> NewMask;
7792 ShuffleVectorSDNode *OtherSV = cast<ShuffleVectorSDNode>(N0);
7793
7794 EVT InVT = N0.getValueType();
7795 int InNumElts = InVT.getVectorNumElements();
7796
7797 for (unsigned i = 0; i != NumElts; ++i) {
7798 int Idx = SVN->getMaskElt(i);
7799 // If we access the second (undef) operand then this index can be
7800 // canonicalized to undef as well.
7801 if (Idx >= InNumElts)
7802 Idx = -1;
7803 // Next, this index comes from the first value, which is the incoming
7804 // shuffle. Adopt the incoming index.
7805 if (Idx >= 0)
7806 Idx = OtherSV->getMaskElt(Idx);
7807
7808 NewMask.push_back(Idx);
7809 }
7810
7811 return DAG.getVectorShuffle(VT, N->getDebugLoc(), OtherSV->getOperand(0),
7812 OtherSV->getOperand(1), &NewMask[0]);
7813 }
7814
77237815 return SDValue();
77247816 }
77257817
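A small example of the swizzle-of-shuffle fold above (hypothetical, not part of this commit): the outer single-source shuffle is removed by remapping its indices through the mask of the inner shuffle.

; Hypothetical IR: %s2 is a swizzle (single-source shuffle) of another shuffle.
define <4 x float> @swizzle_of_shuffle(<4 x float> %x, <4 x float> %y) {
  %s = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %s2 = shufflevector <4 x float> %s, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  ret <4 x float> %s2
}
; The fold yields a single shuffle of %x and %y with the composed mask
; <i32 1, i32 5, i32 0, i32 4>.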
1399913999 return SDValue();
1400014000
1400114001 // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them.
14002 if (Mask.getOpcode() != ISD::BITCAST ||
14003 X.getOpcode() != ISD::BITCAST ||
14004 Y.getOpcode() != ISD::BITCAST)
14005 return SDValue();
14006
1400714002 // Look through mask bitcast.
14008 Mask = Mask.getOperand(0);
14003 if (Mask.getOpcode() == ISD::BITCAST)
14004 Mask = Mask.getOperand(0);
14005 if (X.getOpcode() == ISD::BITCAST)
14006 X = X.getOperand(0);
14007 if (Y.getOpcode() == ISD::BITCAST)
14008 Y = Y.getOperand(0);
14009
1400914010 EVT MaskVT = Mask.getValueType();
1401014011
1401114012 // Validate that the Mask operand is a vector sra node.
1402614027 // Now we know we at least have a pblendvb with the mask val. See if
1402714028 // we can form a psignb/w/d.
1402814029 // psign = x.type == y.type == mask.type && y = sub(0, x);
14029 X = X.getOperand(0);
14030 Y = Y.getOperand(0);
1403114030 if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
1403214031 ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
1403314032 X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
272272 entry:
273273 ; CHECK: t10:
274274 ; CHECK: vmov.i32 q[[Q0:[0-9]+]], #0x3f000000
275 ; CHECK: vmul.f32 q8, q8, d0[0]
275 ; CHECK: vmul.f32 q8, q8, d[[DREG:[0-1]+]]
276276 ; CHECK: vadd.f32 q8, q8, q8
277277 %0 = shufflevector <4 x float> zeroinitializer, <4 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1]
278278 %1 = insertelement <4 x float> %0, float undef, i32 1 ; <<4 x float>> [#uses=1]
0 ; RUN: llc < %s -march=cellspu -o %t1.s
1 ; RUN: grep rot %t1.s | count 86
1 ; RUN: grep rot %t1.s | count 85
22 ; RUN: grep roth %t1.s | count 8
33 ; RUN: grep roti.*5 %t1.s | count 1
44 ; RUN: grep roti.*27 %t1.s | count 1
22 target triple = "x86_64-unknown-linux-gnu"
33
44 ;CHECK: ltstore
5 ;CHECK: pshufd
6 ;CHECK: pshufd
7 ;CHECK: ret
8 define void @ltstore() {
5 ;CHECK: movq
6 ;CHECK-NEXT: movq
7 ;CHECK-NEXT: ret
8 define void @ltstore(<4 x i32>* %pIn, <2 x i32>* %pOut) {
99 entry:
10 %in = load <4 x i32>* undef
10 %in = load <4 x i32>* %pIn
1111 %j = shufflevector <4 x i32> %in, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
12 store <2 x i32> %j, <2 x i32>* undef
12 store <2 x i32> %j, <2 x i32>* %pOut
1313 ret void
1414 }
1515
0 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
1
2 ; Check that we perform a scalar XOR on i32.
3
4 ; CHECK: pull_bitcast
5 ; CHECK: xorl
6 ; CHECK: ret
7 define void @pull_bitcast (<4 x i8>* %pA, <4 x i8>* %pB) {
8 %A = load <4 x i8>* %pA
9 %B = load <4 x i8>* %pB
10 %C = xor <4 x i8> %A, %B
11 store <4 x i8> %C, <4 x i8>* %pA
12 ret void
13 }
2626 define void @t02(<8 x i32>* %source, <2 x i32>* %dest) nounwind noinline {
2727 entry:
2828 ; CHECK: t02
29 ; CHECK: movaps
30 ; CHECK: shufps
31 ; CHECK: pshufd
32 ; CHECK: movq
33 ; CHECK: ret
29 ; CHECK: mov
30 ; CHECK-NEXT: mov
31 ; CHECK-NEXT: mov
32 ; CHECK-NEXT: mov
33 ; CHECK-NEXT: ret
3434 %0 = bitcast <8 x i32>* %source to <4 x i32>*
3535 %arrayidx = getelementptr inbounds <4 x i32>* %0, i64 3
3636 %tmp2 = load <4 x i32>* %arrayidx, align 16
3232 define void @shuf3(<4 x float> %tmp10, <4 x float> %vecinit15, <4 x float>* %dst) nounwind {
3333 entry:
3434 ; CHECK: shuf3:
35 ; CHECK: shufps
35 ; CHECK: shufd
3636 %shuffle.i.i.i12 = shufflevector <4 x float> %tmp10, <4 x float> %vecinit15, <4 x i32>
3737 %tmp25.i.i = shufflevector <4 x float> %shuffle.i.i.i12, <4 x float> undef, <3 x i32>
3838 %tmp1.i.i = shufflevector <3 x float> %tmp25.i.i, <3 x float> zeroinitializer, <4 x i32>