llvm.org GIT mirror llvm / 7f35455
When performing a truncating store, it's possible to rearrange the data in-register, such that we can use a single vector store rather than a series of scalar stores. For func_4_8 the generated code vldr d16, LCPI0_0 vmov d17, r0, r1 vadd.i16 d16, d17, d16 vmov.u16 r0, d16[3] strb r0, [r2, #3] vmov.u16 r0, d16[2] strb r0, [r2, #2] vmov.u16 r0, d16[1] strb r0, [r2, #1] vmov.u16 r0, d16[0] strb r0, [r2] bx lr becomes vldr d16, LCPI0_0 vmov d17, r0, r1 vadd.i16 d16, d17, d16 vuzp.8 d16, d17 vst1.32 {d16[0]}, [r2, :32] bx lr I'm not fond of how this combine pessimizes 2012-03-13-DAGCombineBug.ll, but I couldn't think of a way to judiciously apply this combine. This ldrh r0, [r0, #4] strh r0, [r1] becomes vldr d16, [r0] vmov.u16 r0, d16[2] vmov.32 d16[0], r0 vuzp.16 d16, d17 vst1.32 {d16[0]}, [r1, :32] PR11158 rdar://10703339 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@154340 91177308-0d34-0410-b5e6-96231b3b80d8 Chad Rosier 8 years ago
4 changed file(s) with 107 addition(s) and 7 deletion(s). Raw diff Collapse all Expand all
73387338 static SDValue PerformSTORECombine(SDNode *N,
73397339 TargetLowering::DAGCombinerInfo &DCI) {
73407340 StoreSDNode *St = cast(N);
7341 if (St->isVolatile())
7342 return SDValue();
7343
7344 // Optimize trunc store (of multiple scalars) to shuffle and store. First,
7345 // pack all of the elements in one place. Next, store to memory in fewer
7346 // chunks.
73417347 SDValue StVal = St->getValue();
7342 if (!ISD::isNormalStore(St) || St->isVolatile())
7348 EVT VT = StVal.getValueType();
7349 if (St->isTruncatingStore() && VT.isVector()) {
7350 SelectionDAG &DAG = DCI.DAG;
7351 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7352 EVT StVT = St->getMemoryVT();
7353 unsigned NumElems = VT.getVectorNumElements();
7354 assert(StVT != VT && "Cannot truncate to the same type");
7355 unsigned FromEltSz = VT.getVectorElementType().getSizeInBits();
7356 unsigned ToEltSz = StVT.getVectorElementType().getSizeInBits();
7357
7358 // From, To sizes and ElemCount must be pow of two
7359 if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue();
7360
7361 // We are going to use the original vector elt for storing.
7362 // Accumulated smaller vector elements must be a multiple of the store size.
7363 if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue();
7364
7365 unsigned SizeRatio = FromEltSz / ToEltSz;
7366 assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
7367
7368 // Create a type on which we perform the shuffle.
7369 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
7370 NumElems*SizeRatio);
7371 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
7372
7373 DebugLoc DL = St->getDebugLoc();
7374 SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
7375 SmallVector ShuffleVec(NumElems * SizeRatio, -1);
7376 for (unsigned i = 0; i < NumElems; ++i) ShuffleVec[i] = i * SizeRatio;
7377
7378 // Can't shuffle using an illegal type.
7379 if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
7380
7381 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec,
7382 DAG.getUNDEF(WideVec.getValueType()),
7383 ShuffleVec.data());
7384 // At this point all of the data is stored at the bottom of the
7385 // register. We now need to save it to mem.
7386
7387 // Find the largest store unit
7388 MVT StoreType = MVT::i8;
7389 for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
7390 tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
7391 MVT Tp = (MVT::SimpleValueType)tp;
7392 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
7393 StoreType = Tp;
7394 }
7395 // Didn't find a legal store type.
7396 if (!TLI.isTypeLegal(StoreType))
7397 return SDValue();
7398
7399 // Bitcast the original vector into a vector of store-size units
7400 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
7401 StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
7402 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
7403 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
7404 SmallVector Chains;
7405 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
7406 TLI.getPointerTy());
7407 SDValue BasePtr = St->getBasePtr();
7408
7409 // Perform one or more big stores into memory.
7410 unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits();
7411 for (unsigned I = 0; I < E; I++) {
7412 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
7413 StoreType, ShuffWide,
7414 DAG.getIntPtrConstant(I));
7415 SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr,
7416 St->getPointerInfo(), St->isVolatile(),
7417 St->isNonTemporal(), St->getAlignment());
7418 BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
7419 Increment);
7420 Chains.push_back(Ch);
7421 }
7422 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &Chains[0],
7423 Chains.size());
7424 }
7425
7426 if (!ISD::isNormalStore(St))
73437427 return SDValue();
73447428
73457429 // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
55 ; (i32 extload $addr+c*sizeof(i16)
66 define void @test_hi_short3(<3 x i16> * nocapture %srcA, <2 x i16> * nocapture %dst) nounwind {
77 entry:
8 ; CHECK: ldrh [[REG:r[0-9]+]]
9 ; CHECK: strh [[REG]]
8 ; CHECK: vst1.32
109 %0 = load <3 x i16> * %srcA, align 8
1110 %1 = shufflevector <3 x i16> %0, <3 x i16> undef, <2 x i32>
1211 store <2 x i16> %1, <2 x i16> * %dst, align 4
0 ; RUN: llc -mcpu=cortex-a9 -mtriple=arm-linux-unknown -promote-elements -mattr=+neon < %s | FileCheck %s
1
2 ; CHECK: func_4_8
3 ; CHECK: vst1.32
4 ; CHECK-NEXT: bx lr
; func_4_8: add a constant to a <4 x i8> value and store it. The <4 x i8>
; store is a truncating store (the commit's generated code shows the add done
; as vadd.i16, i.e. the value lives widened in registers), so the new STORE
; combine should pack the four bytes with vuzp and emit the single vst1.32
; checked for above, instead of four scalar strb stores.
; NOTE(review): the constant vector operand of the add (text after "%param,")
; appears to have been lost when this page was extracted — confirm against
; the original test file (test/CodeGen/ARM) in the LLVM tree.
5 define void @func_4_8(<4 x i8> %param, <4 x i8>* %p) {
6 %r = add <4 x i8> %param,
7 store <4 x i8> %r, <4 x i8>* %p
8 ret void
9 }
10
11 ; CHECK: func_2_16
12 ; CHECK: vst1.32
13 ; CHECK-NEXT: bx lr
; func_2_16: same pattern as func_4_8 but for <2 x i16> — a truncating store
; of two halfwords. The combine should emit the single vst1.32 checked for
; above rather than a pair of scalar strh stores.
; NOTE(review): the constant vector operand of the add (text after "%param,")
; appears to have been lost when this page was extracted — confirm against
; the original test file in the LLVM tree.
14 define void @func_2_16(<2 x i16> %param, <2 x i16>* %p) {
15 %r = add <2 x i16> %param,
16 store <2 x i16> %r, <2 x i16>* %p
17 ret void
18 }
148148 }
149149
150150 ; The type <2 x i16> is legalized to <2 x i32> and need to be trunc-stored
151 ; to <2 x i16> when stored to memory. Currently ARM scalarizes these stores.
152 ; See PR 11158
151 ; to <2 x i16> when stored to memory.
153152 define void @test_vrev64(<4 x i16>* nocapture %source, <2 x i16>* nocapture %dst) nounwind ssp {
154153 ; CHECK: test_vrev64:
155 ; CHECK: vst1.16
156 ; CHECK: vst1.16
154 ; CHECK: vst1.32
157155 entry:
158156 %0 = bitcast <4 x i16>* %source to <8 x i16>*
159157 %tmp2 = load <8 x i16>* %0, align 4