llvm.org GIT mirror llvm / 376642e
Some enhancements for memcpy / memset inline expansion.
1. Teach it to use overlapping unaligned load / store to copy / set the trailing bytes. e.g. on x86, use two pairs of movups / movaps for 17-31 byte copies.
2. Use f64 for memcpy / memset on targets where i64 is not legal but f64 is, e.g. x86 and ARM.
3. When expanding a memcpy from a constant string, do *not* replace the load with a constant if it's not possible to materialize the integer immediate with a single instruction (this required a new target hook: TLI.isIntImmLegal()).
4. Use unaligned loads / stores more aggressively if the target hooks indicate they are "fast".
5. Update the ARM target hooks to use unaligned loads / stores, e.g. vld1.8 / vst1.8. Also increase the thresholds to something reasonable (8 for memset, 4 pairs for memcpy).
This significantly improves Dhrystone, up to 50% on ARM iOS devices.
rdar://12760078
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169791 91177308-0d34-0410-b5e6-96231b3b80d8
Evan Cheng 7 years ago
15 changed file(s) with 301 addition(s) and 87 deletion(s).
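As a quick illustration of item 1 above, here is a minimal standalone sketch (plain C++, not LLVM code; the 23-byte length and 16-byte operation width are assumptions chosen for the example). Rather than finishing the copy with an 8/4/2/1-byte ladder, the last wide load / store is pulled back so that it ends at the final byte, and the overlapping bytes are simply written twice:

// Hypothetical illustration of the overlapping-tail idea: a 23-byte copy is
// covered by two 16-byte operations (e.g. two movups / movaps pairs on x86),
// the second one shifted back so that it ends exactly at byte 23.
#include <cstdio>

int main() {
  unsigned Size = 23;                 // assumed copy length, in the 17-31 range
  unsigned VTSize = 16;               // width of one unaligned vector load / store
  unsigned FirstOff = 0;
  unsigned SecondOff = Size - VTSize; // 7: overlaps bytes 7..15 of the first op
  std::printf("op 1 covers bytes [%u, %u)\n", FirstOff, FirstOff + VTSize);
  std::printf("op 2 covers bytes [%u, %u)\n", SecondOff, SecondOff + VTSize);
  return 0;
}

This is safe for memcpy / memset because rewriting the overlap region with identical bytes changes nothing, and the expansion only does it when the target reports unaligned accesses as "fast".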
368368 /// the FP immediate as a load from a constant pool.
369369 virtual bool isFPImmLegal(const APFloat &/*Imm*/, EVT /*VT*/) const {
370370 return false;
371 }
372
373 /// isIntImmLegal - Returns true if the target can instruction select the
374 /// specified integer immediate natively (that is, it's materialized with one
375 /// instruction). The current *assumption* in isel is that all integer
376 /// immediates are "legal" and only the memcpy / memset expansion code is
377 /// making use of this. The rest of isel doesn't have a proper cost model for
378 /// immediate materialization.
379 virtual bool isIntImmLegal(const APInt &/*Imm*/, EVT /*VT*/) const {
380 return true;
371381 }
372382
373383 /// isShuffleMaskLegal - Targets can use this to indicate that they only
677687 }
678688
679689 /// This function returns true if the target allows unaligned memory accesses
680 /// of the specified type. This is used, for example, in situations where an
681 /// array copy/move/set is converted to a sequence of store operations. Its
682 /// use helps to ensure that such replacements don't generate code that causes
683 /// an alignment error (trap) on the target machine.
690 /// of the specified type. If true, it also returns whether the unaligned
691 /// memory access is "fast" in the second argument by reference. This is used,
692 /// for example, in situations where an array copy/move/set is converted to a
693 /// sequence of store operations. Its use helps to ensure that such
694 /// replacements don't generate code that causes an alignment error (trap) on
695 /// the target machine.
684696 /// @brief Determine if the target supports unaligned memory accesses.
685 virtual bool allowsUnalignedMemoryAccesses(EVT) const {
697 virtual bool allowsUnalignedMemoryAccesses(EVT, bool *Fast = 0) const {
686698 return false;
687699 }
688700
33723372 unsigned NumVTBytes = VT.getSizeInBits() / 8;
33733373 unsigned NumBytes = std::min(NumVTBytes, unsigned(Str.size()));
33743374
3375 uint64_t Val = 0;
3375 APInt Val(NumBytes*8, 0);
33763376 if (TLI.isLittleEndian()) {
33773377 for (unsigned i = 0; i != NumBytes; ++i)
33783378 Val |= (uint64_t)(unsigned char)Str[i] << i*8;
33813381 Val |= (uint64_t)(unsigned char)Str[i] << (NumVTBytes-i-1)*8;
33823382 }
33833383
3384 return DAG.getConstant(Val, VT);
3384 if (TLI.isIntImmLegal(Val, VT))
3385 return DAG.getConstant(Val, VT);
3386 return SDValue(0, 0);
33853387 }
33863388
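The loop above folds the leading bytes of the constant source string into a single integer; with the new TLI.isIntImmLegal() check, that constant is only used when the target can materialize it in one instruction, otherwise the expansion keeps loading from the string. A standalone sketch of the little-endian packing (the example string and the 4-byte width are assumptions, chosen to match the Dhrystone-style strings in the tests below):

// Minimal re-creation of the packing loop above, outside of SelectionDAG.
#include <cstdint>
#include <cstdio>

int main() {
  const char *Str = "DHRYST";                          // assumed source string
  unsigned NumBytes = 4;                               // e.g. one i32 store
  uint64_t Val = 0;
  for (unsigned i = 0; i != NumBytes; ++i)             // little-endian packing
    Val |= (uint64_t)(unsigned char)Str[i] << i * 8;
  std::printf("0x%08llx\n", (unsigned long long)Val);  // prints 0x59524844 ("DHRY")
  return 0;
}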
33873389 /// getMemBasePlusOffset - Returns base and offset node for the
34213423 unsigned DstAlign, unsigned SrcAlign,
34223424 bool IsZeroVal,
34233425 bool MemcpyStrSrc,
3426 bool AllowOverlap,
34243427 SelectionDAG &DAG,
34253428 const TargetLowering &TLI) {
34263429 assert((SrcAlign == 0 || SrcAlign >= DstAlign) &&
34603463
34613464 unsigned NumMemOps = 0;
34623465 while (Size != 0) {
3466 if (++NumMemOps > Limit)
3467 return false;
3468
34633469 unsigned VTSize = VT.getSizeInBits() / 8;
34643470 while (VTSize > Size) {
34653471 // For now, only use non-vector loads / stores for the left-over pieces.
3472 EVT NewVT;
3473 unsigned NewVTSize;
34663474 if (VT.isVector() || VT.isFloatingPoint()) {
3467 VT = MVT::i64;
3468 while (!TLI.isTypeLegal(VT))
3469 VT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1);
3470 VTSize = VT.getSizeInBits() / 8;
3475 NewVT = (VT.getSizeInBits() > 64) ? MVT::i64 : MVT::i32;
3476 while (!TLI.isOperationLegalOrCustom(ISD::STORE, NewVT)) {
3477 if (NewVT == MVT::i64 &&
3478 TLI.isOperationLegalOrCustom(ISD::STORE, MVT::f64)) {
3479 // i64 is usually not legal on 32-bit targets, but f64 may be.
3480 NewVT = MVT::f64;
3481 break;
3482 }
3483 NewVT = (MVT::SimpleValueType)(NewVT.getSimpleVT().SimpleTy - 1);
3484 }
3485 NewVTSize = NewVT.getSizeInBits() / 8;
34713486 } else {
34723487 // This can result in a type that is not legal on the target, e.g.
34733488 // 1 or 2 bytes on PPC.
3474 VT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1);
3475 VTSize >>= 1;
3489 NewVT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1);
3490 NewVTSize = VTSize >> 1;
34763491 }
3477 }
3478
3479 if (++NumMemOps > Limit)
3480 return false;
3492
3493 // If the new VT cannot cover all of the remaining bits, then consider
3494 // issuing one (or a pair of) unaligned and overlapping loads / stores.
3495 // FIXME: Only do this for 64-bit or more since we don't have a proper
3496 // cost model for unaligned load / store.
3497 bool Fast;
3498 if (AllowOverlap && VTSize >= 8 && NewVTSize < Size &&
3499 TLI.allowsUnalignedMemoryAccesses(VT, &Fast) && Fast)
3500 VTSize = Size;
3501 else {
3502 VT = NewVT;
3503 VTSize = NewVTSize;
3504 }
3505 }
3506
34813507 MemOps.push_back(VT);
34823508 Size -= VTSize;
34833509 }
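The inner loop above picks the type for the left-over piece, and its f64 special case is the mechanism behind item 2 of the commit message: on 32-bit ARM and x86 an i64 store is not legal, but an f64 store is, so an 8-byte chunk can still be moved in one operation. A hedged standalone sketch of that decision (pickLeftoverType is a made-up helper; the boolean flags stand in for TLI.isOperationLegalOrCustom(ISD::STORE, VT) and the fallback chain is simplified):

#include <cstdio>

// Picks the type used for an 8-byte left-over chunk, given which store
// operations are legal on the target. Purely illustrative.
const char *pickLeftoverType(bool I64StoreLegal, bool F64StoreLegal) {
  if (I64StoreLegal)
    return "i64";
  if (F64StoreLegal)
    return "f64";    // i64 is usually not legal on 32-bit targets, but f64 may be
  return "i32";      // otherwise keep stepping down the integer types
}

int main() {
  std::printf("%s\n", pickLeftoverType(false, true)); // f64 on 32-bit ARM with VFP
  std::printf("%s\n", pickLeftoverType(true, true));  // i64 on a 64-bit target
  return 0;
}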
35223548 if (!FindOptimalMemOpLowering(MemOps, Limit, Size,
35233549 (DstAlignCanChange ? 0 : Align),
35243550 (isZeroStr ? 0 : SrcAlign),
3525 true, CopyFromStr, DAG, TLI))
3551 true, CopyFromStr, true, DAG, TLI))
35263552 return SDValue();
35273553
35283554 if (DstAlignCanChange) {
35443570 unsigned VTSize = VT.getSizeInBits() / 8;
35453571 SDValue Value, Store;
35463572
3573 if (VTSize > Size) {
3574 // Issuing an unaligned load / store pair that overlaps with the previous
3575 // pair. Adjust the offset accordingly.
3576 assert(i == NumMemOps-1 && i != 0);
3577 SrcOff -= VTSize - Size;
3578 DstOff -= VTSize - Size;
3579 }
3580
35473581 if (CopyFromStr &&
35483582 (isZeroStr || (VT.isInteger() && !VT.isVector()))) {
35493583 // It's unlikely a store of a vector immediate can be done in a single
35523586 // FIXME: Handle other cases where store of vector immediate is done in
35533587 // a single instruction.
35543588 Value = getMemsetStringVal(VT, dl, DAG, TLI, Str.substr(SrcOff));
3555 Store = DAG.getStore(Chain, dl, Value,
3556 getMemBasePlusOffset(Dst, DstOff, DAG),
3557 DstPtrInfo.getWithOffset(DstOff), isVol,
3558 false, Align);
3559 } else {
3589 if (Value.getNode())
3590 Store = DAG.getStore(Chain, dl, Value,
3591 getMemBasePlusOffset(Dst, DstOff, DAG),
3592 DstPtrInfo.getWithOffset(DstOff), isVol,
3593 false, Align);
3594 }
3595
3596 if (!Store.getNode()) {
35603597 // The type might not be legal for the target. This should only happen
35613598 // if the type is smaller than a legal type, as on PPC, so the right
35623599 // thing to do is generate a LoadExt/StoreTrunc pair. These simplify
35763613 OutChains.push_back(Store);
35773614 SrcOff += VTSize;
35783615 DstOff += VTSize;
3616 Size -= VTSize;
35793617 }
35803618
35813619 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
36123650
36133651 if (!FindOptimalMemOpLowering(MemOps, Limit, Size,
36143652 (DstAlignCanChange ? 0 : Align),
3615 SrcAlign, true, false, DAG, TLI))
3653 SrcAlign, true, false, false, DAG, TLI))
36163654 return SDValue();
36173655
36183656 if (DstAlignCanChange) {
36883726 isa<ConstantSDNode>(Src) && cast<ConstantSDNode>(Src)->isNullValue();
36893727 if (!FindOptimalMemOpLowering(MemOps, TLI.getMaxStoresPerMemset(OptSize),
36903728 Size, (DstAlignCanChange ? 0 : Align), 0,
3691 IsZeroVal, false, DAG, TLI))
3729 IsZeroVal, false, true, DAG, TLI))
36923730 return SDValue();
36933731
36943732 if (DstAlignCanChange) {
37153753
37163754 for (unsigned i = 0; i < NumMemOps; i++) {
37173755 EVT VT = MemOps[i];
3756 unsigned VTSize = VT.getSizeInBits() / 8;
3757 if (VTSize > Size) {
3758 // Issuing an unaligned load / store pair that overlaps with the previous
3759 // pair. Adjust the offset accordingly.
3760 assert(i == NumMemOps-1 && i != 0);
3761 DstOff -= VTSize - Size;
3762 }
37183763
37193764 // If this store is smaller than the largest store see whether we can get
37203765 // the smaller value for free with a truncate.
37333778 isVol, false, Align);
37343779 OutChains.push_back(Store);
37353780 DstOff += VT.getSizeInBits() / 8;
3781 Size -= VTSize;
37363782 }
37373783
37383784 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
832832 setSchedulingPreference(Sched::Hybrid);
833833
834834 //// temporary - rewrite interface to use type
835 maxStoresPerMemcpy = maxStoresPerMemcpyOptSize = 1;
836 maxStoresPerMemset = 16;
835 maxStoresPerMemset = 8;
837836 maxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
837 maxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
838 maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 4 : 2;
839 maxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
840 maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 4 : 2;
838841
839842 // On ARM arguments smaller than 4 bytes are extended, so all arguments
840843 // are at least 4 bytes aligned.
94059408 return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
94069409 }
94079410
9408 bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const {
9411 bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
94099412 // The AllowsUnaligned flag models the SCTLR.A setting in ARM CPUs
94109413 bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
94119414
94149417 return false;
94159418 case MVT::i8:
94169419 case MVT::i16:
9417 case MVT::i32:
9420 case MVT::i32: {
94189421 // Unaligned access can use (for example) LDRB, LDRH, LDR
9419 return AllowsUnaligned;
9422 if (AllowsUnaligned) {
9423 if (Fast)
9424 *Fast = Subtarget->hasV7Ops();
9425 return true;
9426 }
9427 return false;
9428 }
94209429 case MVT::f64:
9421 case MVT::v2f64:
9430 case MVT::v2f64: {
94229431 // For any little-endian targets with neon, we can support unaligned ld/st
94239432 // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
94249433 // A big-endian target may also explicitly support unaligned accesses
9425 return Subtarget->hasNEON() && (AllowsUnaligned || isLittleEndian());
9434 if (Subtarget->hasNEON() && (AllowsUnaligned || isLittleEndian())) {
9435 if (Fast)
9436 *Fast = true;
9437 return true;
9438 }
9439 return false;
9440 }
94269441 }
94279442 }
94289443
94419456
94429457 // See if we can use NEON instructions for this...
94439458 if (IsZeroVal &&
9444 !F->getFnAttributes().hasAttribute(Attributes::NoImplicitFloat) &&
9445 Subtarget->hasNEON()) {
9446 if (memOpAlign(SrcAlign, DstAlign, 16) && Size >= 16) {
9447 return MVT::v4i32;
9448 } else if (memOpAlign(SrcAlign, DstAlign, 8) && Size >= 8) {
9449 return MVT::v2i32;
9459 Subtarget->hasNEON() &&
9460 !F->getFnAttributes().hasAttribute(Attributes::NoImplicitFloat)) {
9461 bool Fast;
9462 if (Size >= 16 && (memOpAlign(SrcAlign, DstAlign, 16) ||
9463 (allowsUnalignedMemoryAccesses(MVT::v2f64, &Fast) &&
9464 Fast))) {
9465 return MVT::v2f64;
9466 } else if (Size >= 8 && (memOpAlign(SrcAlign, DstAlign, 8) ||
9467 (allowsUnalignedMemoryAccesses(MVT::f64, &Fast) &&
9468 Fast))) {
9469 return MVT::f64;
94509470 }
94519471 }
94529472
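A standalone sketch of the choice made above (pickARMMemOpType is a made-up helper; the single Align parameter stands in for the real memOpAlign(SrcAlign, DstAlign, N) checks, the zero-value and NoImplicitFloat preconditions are elided, and the non-NEON fallback is illustrative rather than the code path from this hunk): with NEON and either sufficient alignment or fast unaligned accesses, a 16-byte chunk becomes one v2f64 (a vld1.8 / vst1.8 pair) and an 8-byte chunk becomes f64.

#include <cstdio>

const char *pickARMMemOpType(unsigned Size, unsigned Align,
                             bool HasNEON, bool FastUnaligned) {
  if (HasNEON) {
    if (Size >= 16 && (Align >= 16 || FastUnaligned)) return "v2f64";
    if (Size >= 8  && (Align >= 8  || FastUnaligned)) return "f64";
  }
  return "i32";  // assumed fallback, for illustration only
}

int main() {
  // 26-byte, 1-aligned memset as in the t2 test below -> NEON q-register stores.
  std::printf("%s\n", pickARMMemOpType(26, 1, true, true));
  // 12-byte, 8-aligned memset as in f_0_12 -> a single d-register (vstr) store.
  std::printf("%s\n", pickARMMemOpType(12, 8, true, false));
  return 0;
}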
1024010260 return false;
1024110261 }
1024210262
10263 bool ARMTargetLowering::isIntImmLegal(const APInt &Imm, EVT VT) const {
10264 if (VT.getSizeInBits() > 32)
10265 return false;
10266
10267 int32_t ImmVal = Imm.getSExtValue();
10268 if (!Subtarget->isThumb()) {
10269 return (ImmVal >= 0 && ImmVal < 65536) ||
10270 (ARM_AM::getSOImmVal(ImmVal) != -1) ||
10271 (ARM_AM::getSOImmVal(~ImmVal) != -1);
10272 } else if (Subtarget->isThumb2()) {
10273 return (ImmVal >= 0 && ImmVal < 65536) ||
10274 (ARM_AM::getT2SOImmVal(ImmVal) != -1) ||
10275 (ARM_AM::getT2SOImmVal(~ImmVal) != -1);
10276 } else /*Thumb1*/ {
10277 return (ImmVal >= 0 && ImmVal < 256);
10278 }
10279 }
10280
1024310281 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
1024410282 /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
1024510283 /// specified in the intrinsic calls.
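The ARM-mode predicate above accepts anything movw can encode (0-65535) plus ARM modified immediates and their complements (via mvn). Below is a simplified standalone re-implementation of that check; this is not the real ARM_AM::getSOImmVal, Thumb2 uses a related but different encoding, and Thumb1 only accepts 0-255.

#include <cstdint>
#include <cstdio>

// An ARM modified immediate is an 8-bit value rotated right by an even amount;
// rotating the candidate left by the same amount and testing <= 0xFF is an
// equivalent check.
static bool isARMRotatedImm(uint32_t V) {
  for (unsigned Rot = 0; Rot < 32; Rot += 2) {
    uint32_t Rotated = Rot ? ((V << Rot) | (V >> (32 - Rot))) : V;
    if (Rotated <= 0xFF)
      return true;
  }
  return false;
}

int main() {
  uint32_t Vals[] = {0x0000FF00, 0x59524844 /* "DHRY", little-endian */};
  for (uint32_t V : Vals)
    std::printf("0x%08x: %s\n", (unsigned)V,
                (V < 65536 || isARMRotatedImm(V) || isARMRotatedImm(~V))
                    ? "single instruction"
                    : "not a cheap immediate");
  return 0;
}

The second value is the 4-byte chunk of "DHRYSTONE..." and is not a cheap immediate, which is why the t5 test below still loads that word from the constant string, while the 2-byte chunk "ST" (0x5453 = 21587) is materialized with movw.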
284284 bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const;
285285
286286 /// allowsUnalignedMemoryAccesses - Returns true if the target allows
287 /// unaligned memory accesses. of the specified type.
288 virtual bool allowsUnalignedMemoryAccesses(EVT VT) const;
287 /// unaligned memory accesses of the specified type. Returns whether it
288 /// is "fast" by reference in the second argument.
289 virtual bool allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const;
289290
290291 virtual EVT getOptimalMemOpType(uint64_t Size,
291292 unsigned DstAlign, unsigned SrcAlign,
384385 /// specified FP immediate natively. If false, the legalizer will
385386 /// materialize the FP immediate as a load from a constant pool.
386387 virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const;
388
389 virtual bool isIntImmLegal(const APInt &Imm, EVT VT) const;
387390
388391 virtual bool getTgtMemIntrinsic(IntrinsicInfo &Info,
389392 const CallInst &I,
23142314 /// changed to modify CPSR.
23152315 multiclass T2I_un_irs<bits<4> opcod, string opc,
23162316 InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
2317 PatFrag opnode, bit Cheap = 0, bit ReMat = 0> {
2317 PatFrag opnode,
2318 bit Cheap = 0, bit ReMat = 0, bit MoveImm = 0> {
23182319 // shifted imm
23192320 def i : T2sOneRegImm<(outs rGPR:$Rd), (ins t2_so_imm:$imm), iii,
23202321 opc, "\t$Rd, $imm",
23212322 [(set rGPR:$Rd, (opnode t2_so_imm:$imm))]> {
23222323 let isAsCheapAsAMove = Cheap;
23232324 let isReMaterializable = ReMat;
2325 let isMoveImm = MoveImm;
23242326 let Inst{31-27} = 0b11110;
23252327 let Inst{25} = 0;
23262328 let Inst{24-21} = opcod;
23542356 let AddedComplexity = 1 in
23552357 defm t2MVN : T2I_un_irs <0b0011, "mvn",
23562358 IIC_iMVNi, IIC_iMVNr, IIC_iMVNsi,
2357 UnOpFrag<(not node:$Src)>, 1, 1>;
2359 UnOpFrag<(not node:$Src)>, 1, 1, 1>;
23582360
23592361 let AddedComplexity = 1 in
23602362 def : T2Pat<(and rGPR:$src, t2_so_imm_not:$imm),
456456 maxStoresPerMemcpy = 16;
457457 }
458458
459 bool MipsTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const {
459 bool
460 MipsTargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
460461 MVT::SimpleValueType SVT = VT.getSimpleVT().SimpleTy;
461462
462463 if (Subtarget->inMips16Mode())
465466 switch (SVT) {
466467 case MVT::i64:
467468 case MVT::i32:
469 if (Fast)
470 *Fast = true;
468471 return true;
469472 default:
470473 return false;
148148
149149 virtual MVT getShiftAmountTy(EVT LHSTy) const { return MVT::i32; }
150150
151 virtual bool allowsUnalignedMemoryAccesses (EVT VT) const;
151 virtual bool allowsUnalignedMemoryAccesses (EVT VT, bool *Fast) const;
152152
153153 virtual void LowerOperationWrapper(SDNode *N,
154154 SmallVectorImpl<SDValue> &Results,
14091409 if (Subtarget->is64Bit() && Size >= 8)
14101410 return MVT::i64;
14111411 return MVT::i32;
1412 }
1413
1414 bool
1415 X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
1416 if (Fast)
1417 *Fast = Subtarget->isUnalignedMemAccessFast();
1418 return true;
14121419 }
14131420
14141421 /// getJumpTableEncoding - Return the entry encoding for a jump table in the
506506 MachineFunction &MF) const;
507507
508508 /// allowsUnalignedMemoryAccesses - Returns true if the target allows
509 /// unaligned memory accesses. of the specified type.
510 virtual bool allowsUnalignedMemoryAccesses(EVT VT) const {
511 return true;
512 }
509 /// unaligned memory accesses of the specified type. Returns whether it
510 /// is "fast" by reference in the second argument.
511 virtual bool allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const;
513512
514513 /// LowerOperation - Provide custom lowering hooks for some operations.
515514 ///
0 ; RUN: llc -march=arm -mcpu=cortex-a8 < %s | FileCheck %s
1
2 ; Should trigger a NEON store.
3 ; CHECK: vstr
4 define void @f_0_12(i8* nocapture %c) nounwind optsize {
5 entry:
6 call void @llvm.memset.p0i8.i64(i8* %c, i8 0, i64 12, i32 8, i1 false)
7 ret void
8 }
91
102 ; Trigger multiple NEON stores.
113 ; CHECK: vst1.64
None ; RUN: llc < %s -mtriple=thumbv7-apple-darwin -disable-post-ra | FileCheck %s
1
2 ; CHECK: ldrd
3 ; CHECK: strd
4 ; CHECK: ldrb
0 ; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -pre-RA-sched=source -disable-post-ra | FileCheck %s
51
62 %struct.x = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 }
73
84 @src = external global %struct.x
95 @dst = external global %struct.x
106
11 define i32 @t() {
7 @.str1 = private unnamed_addr constant [31 x i8] c"DHRYSTONE PROGRAM, SOME STRING\00", align 1
8 @.str2 = private unnamed_addr constant [36 x i8] c"DHRYSTONE PROGRAM, SOME STRING BLAH\00", align 1
9 @.str3 = private unnamed_addr constant [24 x i8] c"DHRYSTONE PROGRAM, SOME\00", align 1
10 @.str4 = private unnamed_addr constant [18 x i8] c"DHRYSTONE PROGR \00", align 1
11 @.str5 = private unnamed_addr constant [7 x i8] c"DHRYST\00", align 1
12 @.str6 = private unnamed_addr constant [14 x i8] c"/tmp/rmXXXXXX\00", align 1
13 @spool.splbuf = internal global [512 x i8] zeroinitializer, align 16
14
15 define i32 @t0() {
1216 entry:
17 ; CHECK: t0:
18 ; CHECK: vldr [[REG1:d[0-9]+]],
19 ; CHECK: vstr [[REG1]],
1320 call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds (%struct.x* @dst, i32 0, i32 0), i8* getelementptr inbounds (%struct.x* @src, i32 0, i32 0), i32 11, i32 8, i1 false)
1421 ret i32 0
1522 }
1623
24 define void @t1(i8* nocapture %C) nounwind {
25 entry:
26 ; CHECK: t1:
27 ; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
28 ; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
29 ; CHECK: adds r0, #15
30 ; CHECK: adds r1, #15
31 ; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
32 ; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
33 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([31 x i8]* @.str1, i64 0, i64 0), i64 31, i32 1, i1 false)
34 ret void
35 }
36
37 define void @t2(i8* nocapture %C) nounwind {
38 entry:
39 ; CHECK: t2:
40 ; CHECK: ldr [[REG2:r[0-9]+]], [r1, #32]
41 ; CHECK: str [[REG2]], [r0, #32]
42 ; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
43 ; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
44 ; CHECK: adds r0, #16
45 ; CHECK: adds r1, #16
46 ; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
47 ; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
48 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([36 x i8]* @.str2, i64 0, i64 0), i64 36, i32 1, i1 false)
49 ret void
50 }
51
52 define void @t3(i8* nocapture %C) nounwind {
53 entry:
54 ; CHECK: t3:
55 ; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
56 ; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
57 ; CHECK: adds r0, #16
58 ; CHECK: adds r1, #16
59 ; CHECK: vld1.8 {d{{[0-9]+}}}, [r1]
60 ; CHECK: vst1.8 {d{{[0-9]+}}}, [r0]
61 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([24 x i8]* @.str3, i64 0, i64 0), i64 24, i32 1, i1 false)
62 ret void
63 }
64
65 define void @t4(i8* nocapture %C) nounwind {
66 entry:
67 ; CHECK: t4:
68 ; CHECK: vld1.8 {[[REG3:d[0-9]+]], [[REG4:d[0-9]+]]}, [r1]
69 ; CHECK: vst1.8 {[[REG3]], [[REG4]]}, [r0]
70 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([18 x i8]* @.str4, i64 0, i64 0), i64 18, i32 1, i1 false)
71 ret void
72 }
73
74 define void @t5(i8* nocapture %C) nounwind {
75 entry:
76 ; CHECK: t5:
77 ; CHECK: movs [[REG5:r[0-9]+]], #0
78 ; CHECK: strb [[REG5]], [r0, #6]
79 ; CHECK: movw [[REG6:r[0-9]+]], #21587
80 ; CHECK: strh [[REG6]], [r0, #4]
81 ; CHECK: ldr [[REG7:r[0-9]+]],
82 ; CHECK: str [[REG7]]
83 tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([7 x i8]* @.str5, i64 0, i64 0), i64 7, i32 1, i1 false)
84 ret void
85 }
86
87 define void @t6() nounwind {
88 entry:
89 ; CHECK: t6:
90 ; CHECK: vld1.8 {[[REG8:d[0-9]+]]}, [r0]
91 ; CHECK: vstr [[REG8]], [r1]
92 ; CHECK: adds r1, #6
93 ; CHECK: adds r0, #6
94 ; CHECK: vld1.8
95 ; CHECK: vst1.16
96 call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds ([512 x i8]* @spool.splbuf, i64 0, i64 0), i8* getelementptr inbounds ([14 x i8]* @.str6, i64 0, i64 0), i64 14, i32 1, i1 false)
97 ret void
98 }
99
100 %struct.Foo = type { i32, i32, i32, i32 }
101
102 define void @t7(%struct.Foo* nocapture %a, %struct.Foo* nocapture %b) nounwind {
103 entry:
104 ; CHECK: t7
105 ; CHECK: vld1.32
106 ; CHECK: vst1.32
107 %0 = bitcast %struct.Foo* %a to i8*
108 %1 = bitcast %struct.Foo* %b to i8*
109 tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* %1, i32 16, i32 4, i1 false)
110 ret void
111 }
112
17113 declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
114 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
0 ; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -pre-RA-sched=source -disable-post-ra | FileCheck %s
1
2 define void @t1(i8* nocapture %c) nounwind optsize {
3 entry:
4 ; CHECK: t1:
5 ; CHECK: movs r1, #0
6 ; CHECK: str r1, [r0]
7 ; CHECK: str r1, [r0, #4]
8 ; CHECK: str r1, [r0, #8]
9 call void @llvm.memset.p0i8.i64(i8* %c, i8 0, i64 12, i32 8, i1 false)
10 ret void
11 }
12
13 define void @t2() nounwind ssp {
14 entry:
15 ; CHECK: t2:
16 ; CHECK: add.w r1, r0, #10
17 ; CHECK: vmov.i32 {{q[0-9]+}}, #0x0
18 ; CHECK: vst1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
19 ; CHECK: vst1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
20 %buf = alloca [26 x i8], align 1
21 %0 = getelementptr inbounds [26 x i8]* %buf, i32 0, i32 0
22 call void @llvm.memset.p0i8.i32(i8* %0, i8 0, i32 26, i32 1, i1 false)
23 call void @something(i8* %0) nounwind
24 ret void
25 }
26
27 declare void @something(i8*) nounwind
28 declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind
29 declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
+0
-16
test/CodeGen/ARM/reg_asc_order.ll
None ; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s
1 ; Check that memcpy gets lowered to ldm/stm, at least in this very simple case.
2
3 %struct.Foo = type { i32, i32, i32, i32 }
4
5 define void @_Z10CopyStructP3FooS0_(%struct.Foo* nocapture %a, %struct.Foo* nocapture %b) nounwind {
6 entry:
7 ;CHECK: ldm
8 ;CHECK: stm
9 %0 = bitcast %struct.Foo* %a to i8*
10 %1 = bitcast %struct.Foo* %b to i8*
11 tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* %1, i32 16, i32 4, i1 false)
12 ret void
13 }
14
15 declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
55 define void @t(i32 %count) ssp nounwind {
66 entry:
77 ; CHECK: t:
8 ; CHECK: movq ___stack_chk_guard@GOTPCREL(%rip)
9 ; CHECK: movups L_str(%rip), %xmm0
8 ; CHECK: movups L_str+12(%rip), %xmm0
9 ; CHECK: movups L_str(%rip), %xmm1
1010 %tmp0 = alloca [60 x i8], align 1
1111 %tmp1 = getelementptr inbounds [60 x i8]* %tmp0, i64 0, i64 0
1212 br label %bb1
1313
1414 bb1:
1515 ; CHECK: LBB0_1:
16 ; CHECK: movaps %xmm0, (%rsp)
16 ; CHECK: movups %xmm0, 12(%rsp)
17 ; CHECK: movaps %xmm1, (%rsp)
1718 %tmp2 = phi i32 [ %tmp3, %bb1 ], [ 0, %entry ]
1819 call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp1, i8* getelementptr inbounds ([28 x i8]* @str, i64 0, i64 0), i64 28, i32 1, i1 false)
1920 %tmp3 = add i32 %tmp2, 1
99 define void @t1(i32 %argc, i8** %argv) nounwind {
1010 entry:
1111 ; SSE2: t1:
12 ; SSE2: movsd _.str+16, %xmm0
13 ; SSE2: movsd %xmm0, 16(%esp)
1214 ; SSE2: movaps _.str, %xmm0
1315 ; SSE2: movaps %xmm0
14 ; SSE2: movb $0
15 ; SSE2: movl $0
16 ; SSE2: movl $0
16 ; SSE2: movb $0, 24(%esp)
1717
1818 ; SSE1: t1:
19 ; SSE1: fldl _.str+16
20 ; SSE1: fstpl 16(%esp)
1921 ; SSE1: movaps _.str, %xmm0
2022 ; SSE1: movaps %xmm0
21 ; SSE1: movb $0
22 ; SSE1: movl $0
23 ; SSE1: movl $0
23 ; SSE1: movb $0, 24(%esp)
2424
2525 ; NOSSE: t1:
2626 ; NOSSE: movb $0