llvm.org GIT mirror llvm / 72f7b08
A DAGCombine optimization for merging consecutive stores. This optimization is not profitable in many cases because modern processors can perform multiple stores in parallel, and preparing the consecutive store requires some work. We only handle these cases: 1. Consecutive stores where the values come from consecutive loads. For example: int a = p->a; int b = p->b; q->a = a; q->b = b; 2. Consecutive stores where the values are constants. For example: q->a = 4; q->b = 5; git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@164885 91177308-0d34-0410-b5e6-96231b3b80d8 Nadav Rotem 7 years ago
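To make the two cases concrete, here is a small compilable C sketch (the type and function names are illustrative, not part of the commit) of the store patterns this combine targets and the wider accesses it aims to form:

struct Pair { int a, b; };

/* Case 1: the stored values come from consecutive loads. The two
   32-bit load/store pairs can be merged into one 64-bit load and
   one 64-bit store. */
void copy_pair(struct Pair *p, struct Pair *q) {
  int a = p->a;
  int b = p->b;
  q->a = a;
  q->b = b;
}

/* Case 2: the stored values are constants. The two 32-bit constant
   stores can be merged into a single 64-bit store of the constant
   vector <4, 5>. */
void store_consts(struct Pair *q) {
  q->a = 4;
  q->b = 5;
}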
5 changed file(s) with 410 addition(s) and 12 deletion(s).
300300 /// looking for a better chain (aliasing node.)
301301 SDValue FindBetterChain(SDNode *N, SDValue Chain);
302302
303 /// Merge consecutive store operations into a wide store.
304 /// \return True if some memory operations were changed.
305 bool MergeConsecutiveStores(StoreSDNode *N);
306
303307 public:
304308 DAGCombiner(SelectionDAG &D, AliasAnalysis &A, CodeGenOpt::Level OL)
305309 : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
74367440 return SDValue();
74377441 }
74387442
7443 /// Returns the base pointer and an integer offset from that object.
7444 static std::pair<SDValue, int64_t> GetPointerBaseAndOffset(SDValue Ptr) {
7445 if (Ptr->getOpcode() == ISD::ADD && isa<ConstantSDNode>(Ptr->getOperand(1))) {
7446 int64_t Offset = cast<ConstantSDNode>(Ptr->getOperand(1))->getSExtValue();
7447 SDValue Base = Ptr->getOperand(0);
7448 return std::make_pair(Base, Offset);
7449 }
7450
7451 return std::make_pair(Ptr, 0);
7452 }
7453
7454 struct ConsecutiveMemoryChainSorter {
7455 typedef std::pair<LSBaseSDNode*, int64_t> MemLink;
7456 bool operator()(MemLink LHS, MemLink RHS) {
7457 return LHS.second < RHS.second;
7458 }
7459 };
7460
7461 bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
7462 EVT MemVT = St->getMemoryVT();
7463 int64_t ElementSizeBytes = MemVT.getSizeInBits()/8;
7464
7465 // Don't handle vectors.
7466 if (MemVT.isVector() || !MemVT.isSimple())
7467 return false;
7468
7469 // Perform an early exit check. Do not bother looking at stored values that
7470 // are not constants or loads.
7471 SDValue StoredVal = St->getValue();
7472 if (!isa<ConstantSDNode>(StoredVal) && !isa<ConstantFPSDNode>(StoredVal) &&
7473 !isa<LoadSDNode>(StoredVal))
7474 return false;
7475
7476 // Is this a load-to-store or a const-store.
7477 bool IsLoadSrc = isa<LoadSDNode>(StoredVal);
7478
7479 // Only look at ends of store chains.
7480 SDValue Chain = SDValue(St, 1);
7481 if (Chain->hasOneUse() && Chain->use_begin()->getOpcode() == ISD::STORE)
7482 return false;
7483
7484 // This holds the base pointer and the offset in bytes from the base pointer.
7485 std::pair<SDValue, int64_t> BasePtr =
7486 GetPointerBaseAndOffset(St->getBasePtr());
7487
7488 // We must have a base and an offset.
7489 if (!BasePtr.first.getNode())
7490 return false;
7491
7492 // Do not handle stores to undef base pointers.
7493 if (BasePtr.first.getOpcode() == ISD::UNDEF)
7494 return false;
7495
7496 SmallVector<std::pair<LSBaseSDNode*, int64_t>, 8> StoreNodes;
7497 // Walk up the chain and look for nodes with offsets from the same
7498 // base pointer. Stop when reaching an instruction with a different kind
7499 // or instruction which has a different base pointer.
7500 StoreSDNode *Index = St;
7501 while (Index) {
7502 // If the chain has more than one use, then we can't reorder the mem ops.
7503 if (Index != St && !SDValue(Index, 1)->hasOneUse())
7504 break;
7505
7506 // Find the base pointer and offset for this memory node.
7507 std::pair<SDValue, int64_t> Ptr =
7508 GetPointerBaseAndOffset(Index->getBasePtr());
7509
7510 // Check that the base pointer is the same as the original one.
7511 if (Ptr.first.getNode() != BasePtr.first.getNode())
7512 break;
7513
7514 // Check that the alignment is the same.
7515 if (Index->getAlignment() != St->getAlignment())
7516 break;
7517
7518 // The memory operands must not be volatile.
7519 if (Index->isVolatile() || Index->isIndexed())
7520 break;
7521
7522 // No truncation.
7523 if (StoreSDNode *St = dyn_cast<StoreSDNode>(Index))
7524 if (St->isTruncatingStore())
7525 break;
7526
7527 // The stored memory type must be the same.
7528 if (Index->getMemoryVT() != MemVT)
7529 break;
7530
7531 // We found a potential memory operand to merge.
7532 StoreNodes.push_back(std::make_pair(Index,Ptr.second));
7533
7534 // Move up the chain to the next memory operation.
7535 Index = dyn_cast<StoreSDNode>(Index->getChain().getNode());
7536 }
7537
7538 // Check if there is anything to merge.
7539 if (StoreNodes.size() < 2)
7540 return false;
7541
7542 // Remember which node is the earliest node in the chain.
7543 LSBaseSDNode *EarliestOp = StoreNodes.back().first;
7544
7545 // Sort the memory operands according to their distance from the base pointer.
7546 std::sort(StoreNodes.begin(), StoreNodes.end(),
7547 ConsecutiveMemoryChainSorter());
7548
7549 // Scan the memory operations on the chain and find the first non-consecutive
7550 // store memory address.
7551 unsigned LastConsecutiveStore = 0;
7552 int64_t StartAddress = StoreNodes[0].second;
7553 for (unsigned i=1; i<StoreNodes.size(); ++i) {
7554 int64_t CurrAddress = StoreNodes[i].second;
7555 if (CurrAddress - StartAddress != (ElementSizeBytes * i))
7556 break;
7557 LastConsecutiveStore = i;
7558 }
7559
7560 // Store the constants into memory as one consecutive store.
7561 if (!IsLoadSrc) {
7562 unsigned LastConst = 0;
7563 for (unsigned i=0; i<StoreNodes.size(); ++i) {
7564 SDValue StoredVal = StoreNodes[i].first->getValue();
7565 bool IsConst = (isa<ConstantSDNode>(StoredVal) || isa<ConstantFPSDNode>(StoredVal));
7566 if (!IsConst)
7567 break;
7568 LastConst = i;
7569 }
7570 unsigned NumElem = std::min(LastConsecutiveStore + 1, LastConst + 1);
7571 if (NumElem < 2)
7572 return false;
7573
7574 EVT JointMemOpVT = EVT::getVectorVT(*DAG.getContext(), MemVT, NumElem);
7575 DebugLoc DL = StoreNodes[0].first->getDebugLoc();
7576 SmallVector<SDValue, 8> Ops;
7577
7578 for (unsigned i = 0; i < NumElem ; ++i) {
7579 StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].first);
7580 Ops.push_back(St->getValue());
7581 }
7582
7583 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL,
7584 JointMemOpVT, &Ops[0], Ops.size());
7585
7586 SDValue NewStore = DAG.getStore(EarliestOp->getChain(), DL, BV,
7587 EarliestOp->getBasePtr(),
7588 EarliestOp->getPointerInfo(), false, false,
7589 EarliestOp->getAlignment());
7590
7591 for (unsigned i = 0; i < NumElem ; ++i) {
7592 StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].first);
7593 CombineTo(St, NewStore);
7594 }
7595 return true;
7596 }
7597
7598 // Look for load nodes which are used by the stored values.
7599 SmallVector<std::pair<LoadSDNode*, int64_t>, 8> LoadNodes;
7600
7601 // Find acceptable loads. Loads need to have the same chain (token factor),
7602 // must not be zext, volatile, indexed, and they must be consecutive.
7603 SDValue LdBasePtr;
7604 for (unsigned i=0; i<StoreNodes.size(); ++i) {
7605 LoadSDNode *Ld = dyn_cast<LoadSDNode>(StoreNodes[i].first->getValue());
7606 if (!Ld) break;
7607
7608 // Loads must only have one use.
7609 if (!Ld->hasNUsesOfValue(1, 0))
7610 break;
7611
7612 // Check that the alignment is the same as the stores.
7613 if (Ld->getAlignment() != St->getAlignment())
7614 break;
7615
7616 // The memory operands must not be volatile.
7617 if (Ld->isVolatile() || Ld->isIndexed())
7618 break;
7619
7620 if (Ld->getExtensionType() != ISD::NON_EXTLOAD)
7621 break;
7622
7623 // The stored memory type must be the same.
7624 if (Ld->getMemoryVT() != MemVT)
7625 break;
7626
7627 std::pair<SDValue, int64_t> LdPtr =
7628 GetPointerBaseAndOffset(Ld->getBasePtr());
7629
7630 // If this is not the first ptr that we check.
7631 if (LdBasePtr.getNode()) {
7632 // The base ptr must be the same.
7633 if (LdPtr.first != LdBasePtr)
7634 break;
7635 } else {
7636 LdBasePtr = LdPtr.first;
7637 }
7638
7639 // We found a potential memory operand to merge.
7640 LoadNodes.push_back(std::make_pair(Ld, LdPtr.second));
7641 }
7642
7643 if (LoadNodes.size() < 2)
7644 return false;
7645
7646 // Scan the memory operations on the chain and find the first non-consecutive
7647 // load memory address.
7648 unsigned LastConsecutiveLoad = 0;
7649 StartAddress = LoadNodes[0].second;
7650 for (unsigned i=1; i<LoadNodes.size(); ++i) {
7651 int64_t CurrAddress = LoadNodes[i].second;
7652 if (CurrAddress - StartAddress != (ElementSizeBytes * i))
7653 break;
7654 LastConsecutiveLoad = i;
7655 }
7656
7657 unsigned NumElem =
7658 std::min(LastConsecutiveStore + 1, LastConsecutiveLoad + 1);
7659
7660 EVT JointMemOpVT = EVT::getVectorVT(*DAG.getContext(), MemVT, NumElem);
7661 DebugLoc LoadDL = LoadNodes[0].first->getDebugLoc();
7662 DebugLoc StoreDL = StoreNodes[0].first->getDebugLoc();
7663
7664 LoadSDNode *FirstLoad = LoadNodes[0].first;
7665 SDValue NewLoad = DAG.getLoad(JointMemOpVT, LoadDL,
7666 FirstLoad->getChain(),
7667 FirstLoad->getBasePtr(),
7668 FirstLoad->getPointerInfo(),
7669 false, false, false,
7670 FirstLoad->getAlignment());
7671
7672 SDValue NewStore = DAG.getStore(EarliestOp->getChain(), StoreDL, NewLoad,
7673 EarliestOp->getBasePtr(),
7674 EarliestOp->getPointerInfo(), false, false,
7675 EarliestOp->getAlignment());
7676
7677 for (unsigned i = 0; i < NumElem ; ++i) {
7678 StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].first);
7679 CombineTo(St, NewStore);
7680 }
7681
7682 return true;
7683 }
7684
74397685 SDValue DAGCombiner::visitSTORE(SDNode *N) {
74407686 StoreSDNode *ST = cast<StoreSDNode>(N);
74417687 SDValue Chain = ST->getChain();
76377883 ST->getAlignment());
76387884 }
76397885
7886
7887 // Only perform this optimization before the types are legal, because we
7888 // don't want to generate illegal types in this optimization.
7889 if (!LegalTypes && MergeConsecutiveStores(ST))
7890 return SDValue(N, 0);
7891
76407892 return ReduceLoadOpStoreWidth(N);
76417893 }
76427894
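The core of the merge logic above is the consecutiveness scan: after sorting the candidate memory operations by offset, they remain mergeable only while each offset is exactly ElementSizeBytes past the previous one. Here is a minimal standalone C sketch of that scan (the helper and its names are assumptions for illustration, not the DAG code itself):

#include <stdint.h>
#include <stdlib.h>

static int cmp_offset(const void *a, const void *b) {
  int64_t x = *(const int64_t *)a, y = *(const int64_t *)b;
  return (x > y) - (x < y);
}

/* Returns how many of the N stores, sorted by byte offset, form a
   consecutive run starting at the lowest address; the combine only
   merges that leading run. Assumes N >= 1. */
static unsigned count_consecutive(int64_t *Offsets, unsigned N,
                                  int64_t ElementSizeBytes) {
  unsigned Last = 0;
  qsort(Offsets, N, sizeof(int64_t), cmp_offset);
  for (unsigned i = 1; i < N; ++i) {
    if (Offsets[i] - Offsets[0] != ElementSizeBytes * (int64_t)i)
      break;
    Last = i;
  }
  return Last + 1;
}

For example, offsets {0, 1, 2, 7} with ElementSizeBytes = 1 yield 3: the first three byte stores can be merged, and the store at offset 7 is left alone.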
0 ; RUN: llc -march=x86-64 -mcpu=corei7 < %s | FileCheck %s
1
2 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
3 target triple = "x86_64-apple-macosx10.8.0"
4
5 %struct.A = type { i8, i8, i8, i8, i8, i8, i8, i8 }
6
7 @a = common global [10000 x %struct.A] zeroinitializer, align 8
8
9 ; Move all of the constants using a single vector store.
10 ; CHECK: merge_const_store
11 ; CHECK: movq %xmm0
12 ; CHECK: ret
13 define void @merge_const_store(i32 %count, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
14 %1 = icmp sgt i32 %count, 0
15 br i1 %1, label %.lr.ph, label %._crit_edge
16 .lr.ph:
17 %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
18 %.01 = phi %struct.A* [ %11, %.lr.ph ], [ %p, %0 ]
19 %2 = getelementptr inbounds %struct.A* %.01, i64 0, i32 0
20 store i8 1, i8* %2, align 1
21 %3 = getelementptr inbounds %struct.A* %.01, i64 0, i32 1
22 store i8 2, i8* %3, align 1
23 %4 = getelementptr inbounds %struct.A* %.01, i64 0, i32 2
24 store i8 3, i8* %4, align 1
25 %5 = getelementptr inbounds %struct.A* %.01, i64 0, i32 3
26 store i8 4, i8* %5, align 1
27 %6 = getelementptr inbounds %struct.A* %.01, i64 0, i32 4
28 store i8 5, i8* %6, align 1
29 %7 = getelementptr inbounds %struct.A* %.01, i64 0, i32 5
30 store i8 6, i8* %7, align 1
31 %8 = getelementptr inbounds %struct.A* %.01, i64 0, i32 6
32 store i8 7, i8* %8, align 1
33 %9 = getelementptr inbounds %struct.A* %.01, i64 0, i32 7
34 store i8 8, i8* %9, align 1
35 %10 = add nsw i32 %i.02, 1
36 %11 = getelementptr inbounds %struct.A* %.01, i64 1
37 %exitcond = icmp eq i32 %10, %count
38 br i1 %exitcond, label %._crit_edge, label %.lr.ph
39 ._crit_edge:
40 ret void
41 }
42
43 ; Move the first 4 constants as a single vector. Move the rest as scalars.
44 ; CHECK: merge_nonconst_store
45 ; CHECK: movd %xmm0
46 ; CHECK: movb
47 ; CHECK: movb
48 ; CHECK: movb
49 ; CHECK: movb
50 ; CHECK: ret
51 define void @merge_nonconst_store(i32 %count, i8 %zz, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
52 %1 = icmp sgt i32 %count, 0
53 br i1 %1, label %.lr.ph, label %._crit_edge
54 .lr.ph:
55 %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
56 %.01 = phi %struct.A* [ %11, %.lr.ph ], [ %p, %0 ]
57 %2 = getelementptr inbounds %struct.A* %.01, i64 0, i32 0
58 store i8 1, i8* %2, align 1
59 %3 = getelementptr inbounds %struct.A* %.01, i64 0, i32 1
60 store i8 2, i8* %3, align 1
61 %4 = getelementptr inbounds %struct.A* %.01, i64 0, i32 2
62 store i8 3, i8* %4, align 1
63 %5 = getelementptr inbounds %struct.A* %.01, i64 0, i32 3
64 store i8 4, i8* %5, align 1
65 %6 = getelementptr inbounds %struct.A* %.01, i64 0, i32 4
66 store i8 %zz, i8* %6, align 1 ; <----------- Not a const;
67 %7 = getelementptr inbounds %struct.A* %.01, i64 0, i32 5
68 store i8 6, i8* %7, align 1
69 %8 = getelementptr inbounds %struct.A* %.01, i64 0, i32 6
70 store i8 7, i8* %8, align 1
71 %9 = getelementptr inbounds %struct.A* %.01, i64 0, i32 7
72 store i8 8, i8* %9, align 1
73 %10 = add nsw i32 %i.02, 1
74 %11 = getelementptr inbounds %struct.A* %.01, i64 1
75 %exitcond = icmp eq i32 %10, %count
76 br i1 %exitcond, label %._crit_edge, label %.lr.ph
77 ._crit_edge:
78 ret void
79 }
80
81
82 ;CHECK: merge_loads
83 ; load:
84 ;CHECK: movw
85 ; store:
86 ;CHECK: movw
87 ;CHECK: ret
88 define void @merge_loads(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
89 %1 = icmp sgt i32 %count, 0
90 br i1 %1, label %.lr.ph, label %._crit_edge
91
92 .lr.ph: ; preds = %0
93 %2 = getelementptr inbounds %struct.A* %q, i64 0, i32 0
94 %3 = getelementptr inbounds %struct.A* %q, i64 0, i32 1
95 br label %4
96
97 ; <label>:4 ; preds = %4, %.lr.ph
98 %i.02 = phi i32 [ 0, %.lr.ph ], [ %9, %4 ]
99 %.01 = phi %struct.A* [ %p, %.lr.ph ], [ %10, %4 ]
100 %5 = load i8* %2, align 1
101 %6 = load i8* %3, align 1
102 %7 = getelementptr inbounds %struct.A* %.01, i64 0, i32 0
103 store i8 %5, i8* %7, align 1
104 %8 = getelementptr inbounds %struct.A* %.01, i64 0, i32 1
105 store i8 %6, i8* %8, align 1
106 %9 = add nsw i32 %i.02, 1
107 %10 = getelementptr inbounds %struct.A* %.01, i64 1
108 %exitcond = icmp eq i32 %9, %count
109 br i1 %exitcond, label %._crit_edge, label %4
110
111 ._crit_edge: ; preds = %4, %0
112 ret void
113 }
114
115 ; The loads and the stores are interleaved. Can't merge them.
116 ;CHECK: no_merge_loads
117 ;CHECK: movb
118 ;CHECK: movb
119 ;CHECK: movb
120 ;CHECK: movb
121 ;CHECK: ret
122 define void @no_merge_loads(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
123 %1 = icmp sgt i32 %count, 0
124 br i1 %1, label %.lr.ph, label %._crit_edge
125
126 .lr.ph: ; preds = %0
127 %2 = getelementptr inbounds %struct.A* %q, i64 0, i32 0
128 %3 = getelementptr inbounds %struct.A* %q, i64 0, i32 1
129 br label %a4
130
131 a4: ; preds = %4, %.lr.ph
132 %i.02 = phi i32 [ 0, %.lr.ph ], [ %a9, %a4 ]
133 %.01 = phi %struct.A* [ %p, %.lr.ph ], [ %a10, %a4 ]
134 %a5 = load i8* %2, align 1
135 %a7 = getelementptr inbounds %struct.A* %.01, i64 0, i32 0
136 store i8 %a5, i8* %a7, align 1
137 %a8 = getelementptr inbounds %struct.A* %.01, i64 0, i32 1
138 %a6 = load i8* %3, align 1
139 store i8 %a6, i8* %a8, align 1
140 %a9 = add nsw i32 %i.02, 1
141 %a10 = getelementptr inbounds %struct.A* %.01, i64 1
142 %exitcond = icmp eq i32 %a9, %count
143 br i1 %exitcond, label %._crit_edge, label %a4
144
145 ._crit_edge: ; preds = %4, %0
146 ret void
147 }
148
149
None ; RUN: llc < %s -march=x86 -relocation-model=pic | FileCheck %s -check-prefix=PIC
1 ; RUN: llc < %s -march=x86 -relocation-model=static | FileCheck %s -check-prefix=STATIC
0 ; RUN: llc < %s -march=x86 -mcpu=corei7 -relocation-model=pic | FileCheck %s -check-prefix=PIC
1 ; RUN: llc < %s -march=x86 -mcpu=corei7 -relocation-model=static | FileCheck %s -check-prefix=STATIC
22 ;
33 ; Make sure the common loop invariant A is hoisted up to preheader,
44 ; since too many registers are needed to subsume it into the addressing modes.
55 ; It's safe to sink A in when it's not pic.
66
77 ; PIC: align
8 ; PIC: movl $4, -4([[REG:%e[a-z]+]])
9 ; PIC: movl $5, ([[REG]])
8 ; PIC: movlpd %xmm0, -4([[REG:%e[a-z]+]])
109 ; PIC: addl $4, [[REG]]
1110 ; PIC: decl {{%e[[a-z]+}}
1211 ; PIC: jne
1312
1413 ; STATIC: align
15 ; STATIC: movl $4, -4(%ecx)
16 ; STATIC: movl $5, (%ecx)
14 ; STATIC: movlpd %xmm0, -4(%ecx)
1715 ; STATIC: addl $4, %ecx
1816 ; STATIC: decl %eax
1917 ; STATIC: jne
None ; RUN: llc < %s -mtriple=i386-apple-darwin -relocation-model=dynamic-no-pic | FileCheck %s
0 ; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=corei7 -relocation-model=dynamic-no-pic | FileCheck %s
11
22 ; CHECK: align
3 ; CHECK: movl $4, -4(%ecx)
4 ; CHECK: movl $5, (%ecx)
3 ; CHECK: movlpd %xmm0, -4(%ecx)
54 ; CHECK: addl $4, %ecx
65 ; CHECK: decl %eax
76 ; CHECK: jne
None ; RUN: llc < %s -march=x86 -relocation-model=static | FileCheck %s
0 ; RUN: llc < %s -march=x86 -mcpu=corei7 -relocation-model=static | FileCheck %s
11
22 ; CHECK: align
3 ; CHECK: movl $4, -4(%ecx)
4 ; CHECK: movl $5, (%ecx)
3 ; CHECK: movlpd %xmm0, -4(%ecx)
54 ; CHECK: addl $4, %ecx
65 ; CHECK: decl %eax
76 ; CHECK: jne