llvm.org GIT mirror llvm / 807360a
[x86] Combine x86mmx/i64 to v2i64 conversion to use scalar_to_vector

Handle the poor codegen for i64/x86mmx->v2i64 (%mm -> %xmm) moves. Instead of
using a stack store/load pair to do the job, use scalar_to_vector directly,
which in the MMX case can use movq2dq. This was the behavior prior to the
improvements for vector legalization of extloads in r213897. This commit fixes
the regression and, as a side effect, also removes some unnecessary shuffles.

In the newly attached testcase, we go from:

  pshufw $-18, (%rdi), %mm0
  movq %mm0, -8(%rsp)
  movq -8(%rsp), %xmm0
  pshufd $-44, %xmm0, %xmm0
  movd %xmm0, %eax
  ...

to:

  pshufw $-18, (%rdi), %mm0
  movq2dq %mm0, %xmm0
  movd %xmm0, %eax
  ...

Differential Revision: http://reviews.llvm.org/D7126
rdar://problem/19413324

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@226953 91177308-0d34-0410-b5e6-96231b3b80d8

Bruno Cardoso Lopes, 5 years ago
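For a user-level picture of where this pattern comes from, the C++ sketch below mirrors the new testcase with MMX/SSE intrinsics: shuffle the high halves of an __m64 with pshufw, then read the low 32 bits back into a general-purpose register. The function name is invented, and whether a given clang version lowers these intrinsics to exactly the x86_mmx-to-i64-to-vector sequence this combine matches is not guaranteed; it is only an illustrative analogue.

  #include <mmintrin.h>   // __m64, _mm_cvtsi64_si32, _mm_empty
  #include <xmmintrin.h>  // _mm_shuffle_pi16 (pshufw, requires SSE)

  // Hypothetical analogue of the new testcase: pshufw $0xEE (== $-18) on an
  // MMX value, then a move of its low 32 bits into an integer register, which
  // is where the %mm -> %xmm (movq2dq) transfer can show up.
  int shuffle_high_then_read_low32(const __m64 *p) {
    __m64 v = _mm_shuffle_pi16(*p, 0xEE); // pshufw $-18, (%rdi), %mm0
    int lo = _mm_cvtsi64_si32(v);         // low 32 bits of the shuffled value
    _mm_empty();                          // clear MMX state before returning
    return lo + 32;                       // same trailing add as the testcase
  }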
5 changed files with 68 additions and 9 deletions.
   LoadSDNode *Ld = cast<LoadSDNode>(N);
   EVT RegVT = Ld->getValueType(0);
   EVT MemVT = Ld->getMemoryVT();
+  SDValue Ptr = Ld->getBasePtr();
+  SDValue Chain = Ld->getChain();
   SDLoc dl(Ld);
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();

     NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl);
     NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl);
     return DCI.CombineTo(N, NewVec, TF, true);
+  }
+
+  // Conversion from x86mmx/i64 to v2i64 types is often done via stack
+  // store/load. Under certain conditions we can bypass the memory access and
+  // combine this load to use a scalar_to_vector instead. This leads to
+  // a reduction in the stack use, redundant emission of shuffles and create
+  // isel matching candidates for movq2dq instructions.
+  if (RegVT == MVT::v2i64 && Subtarget->hasSSE2() && Ext == ISD::EXTLOAD &&
+      !Ld->isVolatile() && ISD::isNON_TRUNCStore(Chain.getNode())) {
+
+    // If this load is directly stored, get the original source value.
+    StoreSDNode *PrevST = cast<StoreSDNode>(Chain);
+    EVT SrcTy = PrevST->getValue().getValueType();
+    if (PrevST->getBasePtr() != Ptr ||
+        !(SrcTy == MVT::i64 || SrcTy == MVT::x86mmx))
+      return SDValue();
+    SDValue SrcVal = Chain.getOperand(1);
+
+    // On 32bit systems, we can't save 64bit integers, use f64 instead.
+    bool Usef64 = TLI.isTypeLegal(MVT::f64) && !Subtarget->is64Bit();
+    if (Usef64)
+      SrcVal = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SrcVal);
+    SrcVal = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, Usef64 ? MVT::v2f64 : RegVT,
+                         SrcVal);
+
+    return DCI.CombineTo(N, Usef64 ?
+        DAG.getNode(ISD::BITCAST, dl, RegVT, SrcVal) : SrcVal, Chain);
   }

   return SDValue();
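One detail worth noting from the hunk above: on 32-bit targets i64 is not a legal scalar type, so the combine routes the 64-bit payload through f64 (scalar_to_vector to v2f64, then a bitcast back to v2i64). This is sound only because ISD::BITCAST is bit-preserving and no FP arithmetic touches the value. The standalone C++ sketch below (names invented, not LLVM API code) models that round trip with memcpy, the portable way to express such a bitcast:

  #include <cstdint>
  #include <cstring>

  // Illustrative model of the Usef64 path: the f64 is used purely as a
  // 64-bit container of bits, so the original bit pattern survives the
  // i64 -> f64 -> (v2f64 -> v2i64) -> i64 round trip unchanged.
  uint64_t roundtrip_through_f64(uint64_t bits) {
    double d;
    std::memcpy(&d, &bits, sizeof d);   // models ISD::BITCAST i64 -> f64
    uint64_t out;
    std::memcpy(&out, &d, sizeof out);  // models the bitcast back to integer
    return out;                         // identical bit pattern
  }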
 ; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mtriple=x86_64-pc-win32 | FileCheck %s

-;CHECK-LABEL: vcast:
+; CHECK-LABEL: vcast:
 define <2 x i32> @vcast(<2 x float> %a, <2 x float> %b) {
-;CHECK: pmovzxdq
-;CHECK: pmovzxdq
+; CHECK-NOT: pmovzxdq
+; CHECK-NOT: pmovzxdq
+; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]]
   %af = bitcast <2 x float> %a to <2 x i32>
   %bf = bitcast <2 x float> %b to <2 x i32>
+; CHECK-NEXT: psubq (%{{.*}}), %[[R0]]
   %x = sub <2 x i32> %af, %bf
-;CHECK: psubq
+; CHECK: ret
   ret <2 x i32> %x
-;CHECK: ret
 }

   %2 = bitcast <2 x i32> %add to i64
   ret i64 %2
 }
-; FIXME: At the moment we still produce the sequence pshufd+paddd+pshufd.
+; FIXME: At the moment we still produce the sequence paddd+pshufd.
 ; Ideally, we should fold that sequence into a single paddd. This is fixed with
 ; the widening legalization.
 ;
 ; CHECK-LABEL: test4
-; CHECK: pshufd
+; CHECK: movd
+; CHECK-NOT: pshufd
 ; CHECK-NEXT: paddd
 ; CHECK-NEXT: pshufd
 ; CHECK: ret
+; RUN: llc < %s -march=x86 -mattr=+mmx,+sse2 | FileCheck %s -check-prefix=X86-32
+; RUN: llc < %s -march=x86-64 -mattr=+mmx,+sse2 | FileCheck %s -check-prefix=X86-64
+
+; X86-32-LABEL: test0
+; X86-64-LABEL: test0
+define i32 @test0(<1 x i64>* %v4) {
+  %v5 = load <1 x i64>* %v4, align 8
+  %v12 = bitcast <1 x i64> %v5 to <4 x i16>
+  %v13 = bitcast <4 x i16> %v12 to x86_mmx
+; X86-32: pshufw $238
+; X86-32-NOT: movq
+; X86-32-NOT: movsd
+; X86-32: movq2dq
+; X86-64: pshufw $238
+; X86-64-NOT: movq
+; X86-64-NOT: pshufd
+; X86-64: movq2dq
+; X86-64-NEXT: movd
+  %v14 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %v13, i8 -18)
+  %v15 = bitcast x86_mmx %v14 to <4 x i16>
+  %v16 = bitcast <4 x i16> %v15 to <1 x i64>
+  %v17 = extractelement <1 x i64> %v16, i32 0
+  %v18 = bitcast i64 %v17 to <2 x i32>
+  %v19 = extractelement <2 x i32> %v18, i32 0
+  %v20 = add i32 %v19, 32
+  ret i32 %v20
+}
+
+declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8)
 ; CHECK-NEXT: paddd %[[R0]], %[[R1]]
 ; CHECK-NEXT: pextrw $4, %[[R1]], 4(%{{.*}})
 ; CHECK-NEXT: pshufb {{.*}}, %[[R1]]
-; CHECK-NEXT: pmovzxdq %[[R1]], %[[R0]]
-; CHECK-NEXT: movd %[[R0]], (%{{.*}})
+; CHECK-NEXT: movd %[[R1]], (%{{.*}})
   %a = load %i16vec3* %ap, align 16
   %b = load %i16vec3* %bp, align 16
   %x = add %i16vec3 %a, %b