llvm.org Git mirror, llvm commit 4546272
Speculatively revert commit 164885 (nadav) in the hope of resurrecting a pile of buildbots.

Original commit message:

    A DAGCombine optimization for merging consecutive stores. This optimization
    is not profitable in many cases because modern processors can store multiple
    values in parallel, and preparing the consecutive store requires some work.
    We only handle these cases:

    1. Consecutive stores where the stored values are consecutive loads. For example:
       int a = p->a;
       int b = p->b;
       q->a = a;
       q->b = b;

    2. Consecutive stores where the stored values are constants. For example:
       q->a = 4;
       q->b = 5;

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@164890 91177308-0d34-0410-b5e6-96231b3b80d8

Duncan Sands, 7 years ago
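To make the two cases concrete, here is a minimal C-level sketch of the patterns the reverted combine looked for; the struct and function names are invented for illustration and are not part of the commit:

struct S { int a, b; };

void copy_fields(S *p, S *q) {
  int a = p->a;   // loads from consecutive addresses...
  int b = p->b;
  q->a = a;       // ...stored to consecutive addresses: a candidate for one
  q->b = b;       // wide load feeding one wide store.
}

void const_fields(S *q) {
  q->a = 4;       // constant stores to consecutive addresses: a candidate
  q->b = 5;       // for a single store of the combined constant.
}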
5 changed files with 12 additions and 410 deletions.
lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -300,10 +300,6 @@
   /// looking for a better chain (aliasing node.)
   SDValue FindBetterChain(SDNode *N, SDValue Chain);
 
-  /// Merge consecutive store operations into a wide store.
-  /// \return True if some memory operations were changed.
-  bool MergeConsecutiveStores(StoreSDNode *N);
-
 public:
   DAGCombiner(SelectionDAG &D, AliasAnalysis &A, CodeGenOpt::Level OL)
     : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
@@ -7426,248 +7422,6 @@
   return SDValue();
 }
 
-/// Returns the base pointer and an integer offset from that object.
-static std::pair<SDValue, int64_t> GetPointerBaseAndOffset(SDValue Ptr) {
-  if (Ptr->getOpcode() == ISD::ADD && isa<ConstantSDNode>(Ptr->getOperand(1))) {
-    int64_t Offset = cast<ConstantSDNode>(Ptr->getOperand(1))->getSExtValue();
-    SDValue Base = Ptr->getOperand(0);
-    return std::make_pair(Base, Offset);
-  }
-
-  return std::make_pair(Ptr, 0);
-}
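// (Illustration, not part of the diff: an address computed as (add %base, 8)
// decomposes into {%base, 8}; any other pointer %p yields {%p, 0}, i.e.
// addressing forms the helper does not recognize are treated as offset zero.)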
-
-struct ConsecutiveMemoryChainSorter {
-  typedef std::pair<LSBaseSDNode*, int64_t> MemLink;
-  bool operator()(MemLink LHS, MemLink RHS) {
-    return LHS.second < RHS.second;
-  }
-};
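As an aside, the comparator above just orders (memory node, byte offset) records so that chain-walk order becomes address order. A minimal standalone sketch of the same idea, assuming a plain integer id stands in for the real LSBaseSDNode pointer:

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

using NodeId = int;                           // hypothetical stand-in for LSBaseSDNode*
using MemLink = std::pair<NodeId, int64_t>;   // (memory node, byte offset from base)

// Sort memory operations by their distance from the common base pointer,
// mirroring what ConsecutiveMemoryChainSorter does in the diff above.
void sortByOffset(std::vector<MemLink> &Links) {
  std::sort(Links.begin(), Links.end(),
            [](const MemLink &L, const MemLink &R) { return L.second < R.second; });
}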
-
-bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
-  EVT MemVT = St->getMemoryVT();
-  int64_t ElementSizeBytes = MemVT.getSizeInBits()/8;
-
-  // Don't handle vectors.
-  if (MemVT.isVector() || !MemVT.isSimple())
-    return false;
-
-  // Perform an early exit check. Do not bother looking at stored values that
-  // are not constants or loads.
-  SDValue StoredVal = St->getValue();
-  if (!isa<ConstantSDNode>(StoredVal) && !isa<ConstantFPSDNode>(StoredVal) &&
-      !isa<LoadSDNode>(StoredVal))
-    return false;
-
-  // Is this a load-to-store or a const-store?
-  bool IsLoadSrc = isa<LoadSDNode>(StoredVal);
-
-  // Only look at ends of store chains.
-  SDValue Chain = SDValue(St, 1);
-  if (Chain->hasOneUse() && Chain->use_begin()->getOpcode() == ISD::STORE)
-    return false;
-
-  // This holds the base pointer and the offset in bytes from the base pointer.
-  std::pair<SDValue, int64_t> BasePtr =
-      GetPointerBaseAndOffset(St->getBasePtr());
-
-  // We must have a base and an offset.
-  if (!BasePtr.first.getNode())
-    return false;
-
-  // Do not handle stores to undef base pointers.
-  if (BasePtr.first.getOpcode() == ISD::UNDEF)
-    return false;
-
-  SmallVector<std::pair<StoreSDNode*, int64_t>, 8> StoreNodes;
-  // Walk up the chain and look for nodes with offsets from the same
-  // base pointer. Stop on reaching a different kind of instruction or an
-  // instruction which has a different base pointer.
-  StoreSDNode *Index = St;
-  while (Index) {
-    // If the chain has more than one use, then we can't reorder the mem ops.
-    if (Index != St && !SDValue(Index, 1)->hasOneUse())
-      break;
-
-    // Find the base pointer and offset for this memory node.
-    std::pair<SDValue, int64_t> Ptr =
-      GetPointerBaseAndOffset(Index->getBasePtr());
-
-    // Check that the base pointer is the same as the original one.
-    if (Ptr.first.getNode() != BasePtr.first.getNode())
-      break;
-
-    // Check that the alignment is the same.
-    if (Index->getAlignment() != St->getAlignment())
-      break;
-
-    // The memory operands must not be volatile.
-    if (Index->isVolatile() || Index->isIndexed())
-      break;
-
-    // No truncation.
-    if (StoreSDNode *St = dyn_cast<StoreSDNode>(Index))
-      if (St->isTruncatingStore())
-        break;
-
-    // The stored memory type must be the same.
-    if (Index->getMemoryVT() != MemVT)
-      break;
-
-    // We found a potential memory operand to merge.
-    StoreNodes.push_back(std::make_pair(Index, Ptr.second));
-
-    // Move up the chain to the next memory operation.
-    Index = dyn_cast<StoreSDNode>(Index->getChain().getNode());
-  }
-
-  // Check if there is anything to merge.
-  if (StoreNodes.size() < 2)
-    return false;
-
-  // Remember which node is the earliest node in the chain.
-  LSBaseSDNode *EarliestOp = StoreNodes.back().first;
-
-  // Sort the memory operands according to their distance from the base pointer.
-  std::sort(StoreNodes.begin(), StoreNodes.end(),
-            ConsecutiveMemoryChainSorter());
-
-  // Scan the memory operations on the chain and find the first non-consecutive
-  // store memory address.
-  unsigned LastConsecutiveStore = 0;
-  int64_t StartAddress = StoreNodes[0].second;
-  for (unsigned i = 1; i < StoreNodes.size(); ++i) {
-    int64_t CurrAddress = StoreNodes[i].second;
-    if (CurrAddress - StartAddress != (ElementSizeBytes * i))
-      break;
-    LastConsecutiveStore = i;
-  }
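// (Worked example, not in the diff: with 1-byte elements at sorted offsets
// {0, 1, 2, 4}, offsets 0..2 satisfy CurrAddress - StartAddress == i, the
// scan stops at the gap before 4, and LastConsecutiveStore ends up as 2,
// i.e. three adjacent stores.)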
-
-  // Store the constants into memory as one consecutive store.
-  if (!IsLoadSrc) {
-    unsigned LastConst = 0;
-    for (unsigned i = 0; i < StoreNodes.size(); ++i) {
-      SDValue StoredVal = StoreNodes[i].first->getValue();
-      bool IsConst = (isa<ConstantSDNode>(StoredVal) ||
-                      isa<ConstantFPSDNode>(StoredVal));
-      if (!IsConst)
-        break;
-      LastConst = i;
-    }
-    unsigned NumElem = std::min(LastConsecutiveStore + 1, LastConst + 1);
-    if (NumElem < 2)
-      return false;
-
-    EVT JointMemOpVT = EVT::getVectorVT(*DAG.getContext(), MemVT, NumElem);
-    DebugLoc DL = StoreNodes[0].first->getDebugLoc();
-    SmallVector<SDValue, 8> Ops;
-
-    for (unsigned i = 0; i < NumElem; ++i) {
-      StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].first);
-      Ops.push_back(St->getValue());
-    }
-
-    SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL,
-                             JointMemOpVT, &Ops[0], Ops.size());
-
-    SDValue NewStore = DAG.getStore(EarliestOp->getChain(), DL, BV,
-                                    EarliestOp->getBasePtr(),
-                                    EarliestOp->getPointerInfo(), false, false,
-                                    EarliestOp->getAlignment());
-
-    for (unsigned i = 0; i < NumElem; ++i) {
-      StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].first);
-      CombineTo(St, NewStore);
-    }
-    return true;
-  }
-
-  // Look for load nodes which are used by the stored values.
-  SmallVector<std::pair<LoadSDNode*, int64_t>, 8> LoadNodes;
-
-  // Find acceptable loads. Loads need to have the same chain (token factor),
-  // must not be zext, volatile, indexed, and they must be consecutive.
-  SDValue LdBasePtr;
-  for (unsigned i = 0; i < StoreNodes.size(); ++i) {
-    LoadSDNode *Ld = dyn_cast<LoadSDNode>(StoreNodes[i].first->getValue());
-    if (!Ld) break;
-
-    // Loads must only have one use.
-    if (!Ld->hasNUsesOfValue(1, 0))
-      break;
-
-    // Check that the alignment is the same as the stores.
-    if (Ld->getAlignment() != St->getAlignment())
-      break;
-
-    // The memory operands must not be volatile.
-    if (Ld->isVolatile() || Ld->isIndexed())
-      break;
-
-    // No extending loads.
-    if (Ld->getExtensionType() != ISD::NON_EXTLOAD)
-      break;
-
-    // The loaded memory type must be the same.
-    if (Ld->getMemoryVT() != MemVT)
-      break;
-
-    std::pair<SDValue, int64_t> LdPtr =
-      GetPointerBaseAndOffset(Ld->getBasePtr());
-
-    // If this is not the first pointer that we check.
-    if (LdBasePtr.getNode()) {
-      // The base pointer must be the same.
-      if (LdPtr.first != LdBasePtr)
-        break;
-    } else {
-      LdBasePtr = LdPtr.first;
-    }
-
-    // We found a potential memory operand to merge.
-    LoadNodes.push_back(std::make_pair(Ld, LdPtr.second));
-  }
-
-  if (LoadNodes.size() < 2)
-    return false;
-
-  // Scan the memory operations on the chain and find the first non-consecutive
-  // load memory address.
-  unsigned LastConsecutiveLoad = 0;
-  StartAddress = LoadNodes[0].second;
-  for (unsigned i = 1; i < LoadNodes.size(); ++i) {
-    int64_t CurrAddress = LoadNodes[i].second;
-    if (CurrAddress - StartAddress != (ElementSizeBytes * i))
-      break;
-    LastConsecutiveLoad = i;
-  }
-
-  unsigned NumElem =
-    std::min(LastConsecutiveStore + 1, LastConsecutiveLoad + 1);
-
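// (Worked example, not in the diff: if four stores are adjacent
// (LastConsecutiveStore == 3) but only three of the loaded values come from
// adjacent addresses (LastConsecutiveLoad == 2), then
// NumElem = min(3 + 1, 2 + 1) = 3 load/store pairs get merged.)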
-  EVT JointMemOpVT = EVT::getVectorVT(*DAG.getContext(), MemVT, NumElem);
-  DebugLoc LoadDL = LoadNodes[0].first->getDebugLoc();
-  DebugLoc StoreDL = StoreNodes[0].first->getDebugLoc();
-
-  LoadSDNode *FirstLoad = LoadNodes[0].first;
-  SDValue NewLoad = DAG.getLoad(JointMemOpVT, LoadDL,
-                                FirstLoad->getChain(),
-                                FirstLoad->getBasePtr(),
-                                FirstLoad->getPointerInfo(),
-                                false, false, false,
-                                FirstLoad->getAlignment());
-
-  SDValue NewStore = DAG.getStore(EarliestOp->getChain(), StoreDL, NewLoad,
-                                  EarliestOp->getBasePtr(),
-                                  EarliestOp->getPointerInfo(), false, false,
-                                  EarliestOp->getAlignment());
-
-  for (unsigned i = 0; i < NumElem; ++i) {
-    StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].first);
-    CombineTo(St, NewStore);
-  }
-
-  return true;
-}
-
 SDValue DAGCombiner::visitSTORE(SDNode *N) {
   StoreSDNode *ST = cast<StoreSDNode>(N);
   SDValue Chain = ST->getChain();
@@ -7869,12 +7623,6 @@
                        ST->getAlignment());
   }
 
-
-  // Only perform this optimization before the types are legal, because we
-  // don't want to generate illegal types in this optimization.
-  if (!LegalTypes && MergeConsecutiveStores(ST))
-    return SDValue(N, 0);
-
   return ReduceLoadOpStoreWidth(N);
 }
 

test/CodeGen/X86/MergeConsecutiveStores.ll (0 additions, 150 deletions; file deleted)
@@ -1,150 +0,0 @@
-; RUN: llc -march=x86-64 -mcpu=corei7 < %s | FileCheck %s
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-apple-macosx10.8.0"
-
-%struct.A = type { i8, i8, i8, i8, i8, i8, i8, i8 }
-
-@a = common global [10000 x %struct.A] zeroinitializer, align 8
-
-; Move all of the constants using a single vector store.
-; CHECK: merge_const_store
-; CHECK: movq %xmm0
-; CHECK: ret
-define void @merge_const_store(i32 %count, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
-  %1 = icmp sgt i32 %count, 0
-  br i1 %1, label %.lr.ph, label %._crit_edge
-.lr.ph:
-  %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
-  %.01 = phi %struct.A* [ %11, %.lr.ph ], [ %p, %0 ]
-  %2 = getelementptr inbounds %struct.A* %.01, i64 0, i32 0
-  store i8 1, i8* %2, align 1
-  %3 = getelementptr inbounds %struct.A* %.01, i64 0, i32 1
-  store i8 2, i8* %3, align 1
-  %4 = getelementptr inbounds %struct.A* %.01, i64 0, i32 2
-  store i8 3, i8* %4, align 1
-  %5 = getelementptr inbounds %struct.A* %.01, i64 0, i32 3
-  store i8 4, i8* %5, align 1
-  %6 = getelementptr inbounds %struct.A* %.01, i64 0, i32 4
-  store i8 5, i8* %6, align 1
-  %7 = getelementptr inbounds %struct.A* %.01, i64 0, i32 5
-  store i8 6, i8* %7, align 1
-  %8 = getelementptr inbounds %struct.A* %.01, i64 0, i32 6
-  store i8 7, i8* %8, align 1
-  %9 = getelementptr inbounds %struct.A* %.01, i64 0, i32 7
-  store i8 8, i8* %9, align 1
-  %10 = add nsw i32 %i.02, 1
-  %11 = getelementptr inbounds %struct.A* %.01, i64 1
-  %exitcond = icmp eq i32 %10, %count
-  br i1 %exitcond, label %._crit_edge, label %.lr.ph
-._crit_edge:
-  ret void
-}
-
-; Move the first 4 constants as a single vector. Move the rest as scalars.
-; CHECK: merge_nonconst_store
-; CHECK: movd %xmm0
-; CHECK: movb
-; CHECK: movb
-; CHECK: movb
-; CHECK: movb
-; CHECK: ret
-define void @merge_nonconst_store(i32 %count, i8 %zz, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
-  %1 = icmp sgt i32 %count, 0
-  br i1 %1, label %.lr.ph, label %._crit_edge
-.lr.ph:
-  %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
-  %.01 = phi %struct.A* [ %11, %.lr.ph ], [ %p, %0 ]
-  %2 = getelementptr inbounds %struct.A* %.01, i64 0, i32 0
-  store i8 1, i8* %2, align 1
-  %3 = getelementptr inbounds %struct.A* %.01, i64 0, i32 1
-  store i8 2, i8* %3, align 1
-  %4 = getelementptr inbounds %struct.A* %.01, i64 0, i32 2
-  store i8 3, i8* %4, align 1
-  %5 = getelementptr inbounds %struct.A* %.01, i64 0, i32 3
-  store i8 4, i8* %5, align 1
-  %6 = getelementptr inbounds %struct.A* %.01, i64 0, i32 4
-  store i8 %zz, i8* %6, align 1 ; <----------- Not a const;
-  %7 = getelementptr inbounds %struct.A* %.01, i64 0, i32 5
-  store i8 6, i8* %7, align 1
-  %8 = getelementptr inbounds %struct.A* %.01, i64 0, i32 6
-  store i8 7, i8* %8, align 1
-  %9 = getelementptr inbounds %struct.A* %.01, i64 0, i32 7
-  store i8 8, i8* %9, align 1
-  %10 = add nsw i32 %i.02, 1
-  %11 = getelementptr inbounds %struct.A* %.01, i64 1
-  %exitcond = icmp eq i32 %10, %count
-  br i1 %exitcond, label %._crit_edge, label %.lr.ph
-._crit_edge:
-  ret void
-}
-
-; CHECK: merge_loads
-; load:
-; CHECK: movw
-; store:
-; CHECK: movw
-; CHECK: ret
-define void @merge_loads(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
-  %1 = icmp sgt i32 %count, 0
-  br i1 %1, label %.lr.ph, label %._crit_edge
-
-.lr.ph:                                           ; preds = %0
-  %2 = getelementptr inbounds %struct.A* %q, i64 0, i32 0
-  %3 = getelementptr inbounds %struct.A* %q, i64 0, i32 1
-  br label %4
-
-; <label>:4                                       ; preds = %4, %.lr.ph
-  %i.02 = phi i32 [ 0, %.lr.ph ], [ %9, %4 ]
-  %.01 = phi %struct.A* [ %p, %.lr.ph ], [ %10, %4 ]
-  %5 = load i8* %2, align 1
-  %6 = load i8* %3, align 1
-  %7 = getelementptr inbounds %struct.A* %.01, i64 0, i32 0
-  store i8 %5, i8* %7, align 1
-  %8 = getelementptr inbounds %struct.A* %.01, i64 0, i32 1
-  store i8 %6, i8* %8, align 1
-  %9 = add nsw i32 %i.02, 1
-  %10 = getelementptr inbounds %struct.A* %.01, i64 1
-  %exitcond = icmp eq i32 %9, %count
-  br i1 %exitcond, label %._crit_edge, label %4
-
-._crit_edge:                                      ; preds = %4, %0
-  ret void
-}
-
-; The loads and the stores are interleaved. We cannot merge them.
-; CHECK: no_merge_loads
-; CHECK: movb
-; CHECK: movb
-; CHECK: movb
-; CHECK: movb
-; CHECK: ret
-define void @no_merge_loads(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
-  %1 = icmp sgt i32 %count, 0
-  br i1 %1, label %.lr.ph, label %._crit_edge
-
-.lr.ph:                                           ; preds = %0
-  %2 = getelementptr inbounds %struct.A* %q, i64 0, i32 0
-  %3 = getelementptr inbounds %struct.A* %q, i64 0, i32 1
-  br label %a4
-
-a4:                                               ; preds = %a4, %.lr.ph
-  %i.02 = phi i32 [ 0, %.lr.ph ], [ %a9, %a4 ]
-  %.01 = phi %struct.A* [ %p, %.lr.ph ], [ %a10, %a4 ]
-  %a5 = load i8* %2, align 1
-  %a7 = getelementptr inbounds %struct.A* %.01, i64 0, i32 0
-  store i8 %a5, i8* %a7, align 1
-  %a8 = getelementptr inbounds %struct.A* %.01, i64 0, i32 1
-  %a6 = load i8* %3, align 1
-  store i8 %a6, i8* %a8, align 1
-  %a9 = add nsw i32 %i.02, 1
-  %a10 = getelementptr inbounds %struct.A* %.01, i64 1
-  %exitcond = icmp eq i32 %a9, %count
-  br i1 %exitcond, label %._crit_edge, label %a4
-
-._crit_edge:                                      ; preds = %a4, %0
-  ret void
-}
-

-; RUN: llc < %s -march=x86 -mcpu=corei7 -relocation-model=pic | FileCheck %s -check-prefix=PIC
-; RUN: llc < %s -march=x86 -mcpu=corei7 -relocation-model=static | FileCheck %s -check-prefix=STATIC
+; RUN: llc < %s -march=x86 -relocation-model=pic | FileCheck %s -check-prefix=PIC
+; RUN: llc < %s -march=x86 -relocation-model=static | FileCheck %s -check-prefix=STATIC
 ;
 ; Make sure the common loop invariant A is hoisted up to preheader,
 ; since too many registers are needed to subsume it into the addressing modes.
 ; It's safe to sink A in when it's not pic.
 
 ; PIC: align
-; PIC: movlpd %xmm0, -4([[REG:%e[a-z]+]])
+; PIC: movl $4, -4([[REG:%e[a-z]+]])
+; PIC: movl $5, ([[REG]])
 ; PIC: addl $4, [[REG]]
 ; PIC: decl {{%e[[a-z]+}}
 ; PIC: jne
 
 ; STATIC: align
-; STATIC: movlpd %xmm0, -4(%ecx)
+; STATIC: movl $4, -4(%ecx)
+; STATIC: movl $5, (%ecx)
 ; STATIC: addl $4, %ecx
 ; STATIC: decl %eax
 ; STATIC: jne

-; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=corei7 -relocation-model=dynamic-no-pic | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin -relocation-model=dynamic-no-pic | FileCheck %s
 
 ; CHECK: align
-; CHECK: movlpd %xmm0, -4(%ecx)
+; CHECK: movl $4, -4(%ecx)
+; CHECK: movl $5, (%ecx)
 ; CHECK: addl $4, %ecx
 ; CHECK: decl %eax
 ; CHECK: jne

-; RUN: llc < %s -march=x86 -mcpu=corei7 -relocation-model=static | FileCheck %s
+; RUN: llc < %s -march=x86 -relocation-model=static | FileCheck %s
 
 ; CHECK: align
-; CHECK: movlpd %xmm0, -4(%ecx)
+; CHECK: movl $4, -4(%ecx)
+; CHECK: movl $5, (%ecx)
 ; CHECK: addl $4, %ecx
 ; CHECK: decl %eax
 ; CHECK: jne