llvm.org GIT mirror: llvm @ ba95865 (trunk r172894)

On Sandybridge split unaligned 256bit stores into two xmm-sized stores.

Author: Nadav Rotem, 7 years ago
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@172894 91177308-0d34-0410-b5e6-96231b3b80d8

9 changed files with 56 additions and 36 deletions.
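In source terms, the effect of the patch on an AVX-only target such as Sandy Bridge can be sketched with intrinsics. This is an illustrative sketch, not code from the patch; the function names are hypothetical:

#include <immintrin.h>

// Before the patch: an unaligned 256-bit store is emitted as a single
// vmovups %ymm instruction.
void store_unaligned_ymm(float *p, __m256 v) {
  _mm256_storeu_ps(p, v);
}

// After the patch, on AVX-without-AVX2 targets: the same store is split
// into two 128-bit halves (vextractf128 + two xmm-sized stores), which
// matches how Sandy Bridge executes 256-bit memory operations via two
// 128-bit ports (see the comment in the store hunk below).
void store_unaligned_xmm_pair(float *p, __m256 v) {
  _mm_storeu_ps(p,     _mm256_castps256_ps128(v));   // low 128 bits
  _mm_storeu_ps(p + 4, _mm256_extractf128_ps(v, 1)); // high 128 bits
}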
   ISD::LoadExtType Ext = Ld->getExtensionType();
   unsigned Alignment = Ld->getAlignment();
+  bool IsAligned = Alignment == 0 || Alignment == MemVT.getSizeInBits()/8;

   // On Sandybridge unaligned 256bit loads are inefficient.
   if (RegVT.is256BitVector() && !Subtarget->hasInt256() &&
-      !DCI.isBeforeLegalizeOps() && Alignment < 32 &&
-      Ext == ISD::NON_EXTLOAD) {
+      !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
     unsigned NumElems = RegVT.getVectorNumElements();
+    if (NumElems < 2)
+      return SDValue();
+
     SDValue Ptr = Ld->getBasePtr();
     SDValue Increment = DAG.getConstant(16, TLI.getPointerTy());
...
     SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
                                 Ld->getPointerInfo(), Ld->isVolatile(),
                                 Ld->isNonTemporal(), Ld->isInvariant(),
-                                Alignment);
+                                std::max(Alignment/2U, 1U));
     SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                              Load1.getValue(1),
                              Load2.getValue(1));
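A note on the alignment bookkeeping introduced here, since it is easy to misread: Alignment == 0 means "use the ABI alignment" in SelectionDAG, so the new IsAligned predicate treats it as aligned and the split path only runs with a nonzero alignment below 32 bytes. The second half-load sits 16 bytes past the base pointer, and since 16 is a multiple of every smaller power-of-two alignment, halving the reported alignment is a conservative (safe) bound; the clamp keeps the value at least 1 so it is not misread as ABI alignment. A minimal sketch, with a hypothetical helper name:

#include <algorithm>

// Hypothetical restatement of the alignment passed to the second
// 16-byte half. Alignment is a power of two in [1, 16] on this path.
unsigned secondHalfAlignment(unsigned Alignment) {
  // Alignment/2 under-reports the real alignment of Ptr+16, which is
  // safe; std::max keeps it >= 1 (0 would mean "ABI alignment").
  return std::max(Alignment / 2U, 1U);
}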
   DebugLoc dl = St->getDebugLoc();
   SDValue StoredVal = St->getOperand(1);
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  unsigned Alignment = St->getAlignment();
+  bool IsAligned = Alignment == 0 || Alignment == VT.getSizeInBits()/8;

   // If we are saving a concatenation of two XMM registers, perform two stores.
   // On Sandy Bridge, 256-bit memory operations are executed by two
   // 128-bit ports. However, on Haswell it is better to issue a single 256-bit
   // memory operation.
   if (VT.is256BitVector() && !Subtarget->hasInt256() &&
-      StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS &&
-      StoredVal.getNumOperands() == 2) {
-    SDValue Value0 = StoredVal.getOperand(0);
-    SDValue Value1 = StoredVal.getOperand(1);
+      StVT == VT && !IsAligned) {
+    unsigned NumElems = VT.getVectorNumElements();
+    if (NumElems < 2)
+      return SDValue();
+
+    SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl);
+    SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl);

     SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
     SDValue Ptr0 = St->getBasePtr();
...
     SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
                                St->getPointerInfo(), St->isVolatile(),
-                               St->isNonTemporal(), St->getAlignment());
+                               St->isNonTemporal(), Alignment);
     SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
                                St->getPointerInfo(), St->isVolatile(),
-                               St->isNonTemporal(), St->getAlignment());
+                               St->isNonTemporal(),
+                               std::max(Alignment/2U, 1U));
     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
   }
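The guard also changes shape: the old code only split stores whose value was literally a two-operand CONCAT_VECTORS, while the new code splits any unaligned, non-truncating 256-bit store (StVT == VT rules out truncating stores) and extracts the two 128-bit halves itself with Extract128BitVector. A self-contained sketch of the new condition, with hypothetical parameter names standing in for the EVT and subtarget queries:

// Hypothetical restatement of the new split condition; illustrative only.
static bool shouldSplit256BitStore(unsigned StoreSizeInBits, // VT size
                                   unsigned NumElems,        // vector elements
                                   unsigned Alignment,       // bytes; 0 = ABI
                                   bool IsTruncating,        // StVT != VT
                                   bool HasInt256) {         // AVX2 available
  bool IsAligned = Alignment == 0 || Alignment == StoreSizeInBits / 8;
  return StoreSizeInBits == 256 && !HasInt256 && !IsTruncating &&
         !IsAligned && NumElems >= 2;
}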
 ;CHECK: add18i16
 define void @add18i16(<18 x i16>* nocapture sret %ret, <18 x i16>* %bp) nounwind {
-;CHECK: vmovups
+;CHECK: vmovaps
   %b = load <18 x i16>* %bp, align 16
   %x = add <18 x i16> zeroinitializer, %b
   store <18 x i16> %x, <18 x i16>* %ret, align 16
 ; Move the constants using a single vector store.
 ; CHECK: merge_const_store_vec
-; CHECK: vmovups %ymm0, (%rsi)
+; CHECK: vmovups
 ; CHECK: ret
 define void @merge_const_store_vec(i32 %count, %struct.B* nocapture %p) nounwind uwtable noinline ssp {
   %1 = icmp sgt i32 %count, 0
...
   unreachable
 }

-; CHECK: vmovups %ymm
+; CHECK: storev16i16_01
+; CHECK: vextractf128
+; CHECK: vmovaps %xmm
 define void @storev16i16_01(<16 x i16> %a) nounwind {
   store <16 x i16> %a, <16 x i16>* undef, align 4
   unreachable
 }

+; CHECK: storev32i8
 ; CHECK: vmovaps %ymm
 define void @storev32i8(<32 x i8> %a) nounwind {
   store <32 x i8> %a, <32 x i8>* undef, align 32
   unreachable
 }

-; CHECK: vmovups %ymm
+; CHECK: storev32i8_01
+; CHECK: vextractf128
+; CHECK: vmovups %xmm
 define void @storev32i8_01(<32 x i8> %a) nounwind {
   store <32 x i8> %a, <32 x i8>* undef, align 4
   unreachable
...
 ; CHECK: _double_save
 ; CHECK-NOT: vinsertf128 $1
 ; CHECK-NOT: vinsertf128 $0
-; CHECK: vmovaps %xmm
+; CHECK: vmovups %xmm
 ; CHECK: vmovaps %xmm
 define void @double_save(<4 x i32> %A, <4 x i32> %B, <8 x i32>* %P) nounwind ssp {
 entry:
   ret void
 }

-; AVX: sext_5
-; AVX: vpmovsxbw
-; AVX: vpmovsxwd
-; AVX: vpmovsxwd
-; AVX: vpmovsxdq
-; AVX: ret
-define void @sext_5(<8 x i8>* %inbuf, <8 x i64>* %outbuf) {
-  %v0 = load <8 x i8>* %inbuf
-  %r = sext <8 x i8> %v0 to <8 x i64>
-  store <8 x i64> %r, <8 x i64>* %outbuf
-  ret void
-}
 ; AVX: sext_6
 ; AVX: vpmovsxbw
 ; AVX: vpmovsxwd
 ; CHECK: movlhps
 ; CHECK: ret
 ; AVX: test4
-; AVX: vcvtpd2psy {{[0-9]*}}(%{{.*}})
-; AVX: vcvtpd2psy {{[0-9]*}}(%{{.*}})
+; AVX: vcvtpd2psy
+; AVX: vcvtpd2psy
 ; AVX: vinsertf128
 ; AVX: ret
   %x = load <8 x double>* %p
 ;CHECK: wideloads
 ;CHECK: vmovaps
 ;CHECK: vinsertf128
-;CHECK: vmovups
+;CHECK: vmovaps
 ;CHECK-NOT: vinsertf128
 ;CHECK: ret

...
   %v0 = load <8 x float>* %a, align 16 ; <---- unaligned!
   %v1 = load <8 x float>* %b, align 32 ; <---- aligned!
   %m0 = fcmp olt <8 x float> %v1, %v0
-  %v2 = load <8 x float>* %c, align 16
+  %v2 = load <8 x float>* %c, align 32 ; <---- aligned!
   %m1 = fcmp olt <8 x float> %v2, %v0
   %mand = and <8 x i1> %m1, %m0
   %r = zext <8 x i1> %mand to <8 x i32>
-  store <8 x i32> %r, <8 x i32>* undef, align 16
+  store <8 x i32> %r, <8 x i32>* undef, align 32
   ret void
 }

+; CHECK: widestores
+; loads:
+; CHECK: vmovaps
+; CHECK: vmovaps
+; stores:
+; CHECK: vmovaps
+; CHECK: vextractf128
+; CHECK: vmovaps
+;CHECK: ret
+
+define void @widestores(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp {
+  %v0 = load <8 x float>* %a, align 32
+  %v1 = load <8 x float>* %b, align 32
+  store <8 x float> %v0, <8 x float>* %b, align 32 ; <--- aligned
+  store <8 x float> %v1, <8 x float>* %a, align 16 ; <--- unaligned
+  ret void
+}
+
 ;CHECK: vcmpltp
 ;CHECK: vandps
 ;CHECK: vandps
-;CHECK: vmovups
+;CHECK: vmovaps
 ;CHECK: ret

 define void @and_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp {
...
   %m1 = fcmp olt <8 x float> %v2, %v0
   %mand = and <8 x i1> %m1, %m0
   %r = zext <8 x i1> %mand to <8 x i32>
-  store <8 x i32> %r, <8 x i32>* undef, align 16
+  store <8 x i32> %r, <8 x i32>* undef, align 32
   ret void
 }

...
 ;CHECK: vcmpltps
 ;CHECK: vxorps
 ;CHECK: vandps
-;CHECK: vmovups
+;CHECK: vmovaps
 ;CHECK: ret
 define void @neg_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp {
   %v0 = load <8 x float>* %a, align 16
...
   %m0 = fcmp olt <8 x float> %v1, %v0
   %mand = xor <8 x i1> %m0, <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>
   %r = zext <8 x i1> %mand to <8 x i32>
-  store <8 x i32> %r, <8 x i32>* undef, align 16
+  store <8 x i32> %r, <8 x i32>* undef, align 32
   ret void
 }

 ; CHECK: cvtps2pd 8(%{{.+}}), %xmm{{[0-9]+}}
 ; CHECK: cvtps2pd 16(%{{.+}}), %xmm{{[0-9]+}}
 ; CHECK: cvtps2pd 24(%{{.+}}), %xmm{{[0-9]+}}
+; AVX: vcvtps2pd 16(%{{.+}}), %ymm{{[0-9]+}}
 ; AVX: vcvtps2pd (%{{.+}}), %ymm{{[0-9]+}}
-; AVX: vcvtps2pd 16(%{{.+}}), %ymm{{[0-9]+}}
   %0 = load <8 x float>* %in
   %1 = fpext <8 x float> %0 to <8 x double>
   store <8 x double> %1, <8 x double>* %out, align 1