llvm.org GIT mirror llvm / a642560
[SDAG] Make the DAGCombine worklist not grow endlessly due to duplicate insertions. The old behavior could cause arbitrarily bad memory usage in the DAG combiner if there was heavy traffic of adding nodes already on the worklist to it. This commit switches the DAG combine worklist to work the same way as the instcombine worklist where we null-out removed entries and only add new entries to the worklist. My measurements of codegen time show slight improvement. The memory utilization is unsurprisingly dominated by other factors (the IR and DAG itself I suspect). This change results in subtle, frustrating churn in the particular order in which DAG combines are applied which causes a number of minor regressions where we fail to match a pattern previously matched by accident. AFAICT, all of these should be using AddToWorklist directly or should be written in a less brittle way. None of the changes seem drastically bad, and a few of the changes seem distinctly better. A major change required to make this work is to significantly harden the way in which the DAG combiner handles nodes which become dead (zero-uses). Previously, we relied on the ability to "priority-bump" them on the combine worklist to achieve recursive deletion of these nodes and ensure that the frontier of remaining live nodes all were added to the worklist. Instead, I've introduced a routine to just implement that precise logic with no indirection. It is a significantly simpler operation than that of the combiner worklist proper. I suspect this will also fix some other problems with the combiner. I think the x86 changes are really minor and uninteresting, but the avx512 change at least is hiding a "regression" (despite the test case being just noise, not testing some performance invariant) that might be looked into. Not sure if any of the others impact specific "important" code paths, but they didn't look terribly interesting to me, or the changes were really minor. 
The consensus in review is to fix any regressions that show up after the fact here. Thanks to the other reviewers for checking the output on other architectures. There is a specific regression on ARM that Tim already has a fix prepped to commit. Differential Revision: http://reviews.llvm.org/D4616 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@213727 91177308-0d34-0410-b5e6-96231b3b80d8 Chandler Carruth 5 years ago
16 changed file(s) with 95 addition(s) and 275 deletion(s). Raw diff Collapse all Expand all
1717
1818 #include "llvm/CodeGen/SelectionDAG.h"
1919 #include "llvm/ADT/SmallPtrSet.h"
20 #include "llvm/ADT/SetVector.h"
2021 #include "llvm/ADT/Statistic.h"
2122 #include "llvm/Analysis/AliasAnalysis.h"
2223 #include "llvm/CodeGen/MachineFrameInfo.h"
8687 bool LegalTypes;
8788 bool ForCodeSize;
8889
89 // Worklist of all of the nodes that need to be simplified.
90 //
91 // This has the semantics that when adding to the worklist,
92 // the item added must be next to be processed. It should
93 // also only appear once. The naive approach to this takes
94 // linear time.
95 //
96 // To reduce the insert/remove time to logarithmic, we use
97 // a set and a vector to maintain our worklist.
98 //
99 // The set contains the items on the worklist, but does not
100 // maintain the order they should be visited.
101 //
102 // The vector maintains the order nodes should be visited, but may
103 // contain duplicate or removed nodes. When choosing a node to
104 // visit, we pop off the order stack until we find an item that is
105 // also in the contents set. All operations are O(log N).
106 SmallPtrSet WorklistContents;
107 SmallVector WorklistOrder;
90 /// \brief Worklist of all of the nodes that need to be simplified.
91 ///
92 /// This must behave as a stack -- new nodes to process are pushed onto the
93 /// back and when processing we pop off of the back.
94 ///
95 /// The worklist will not contain duplicates but may contain null entries
96 /// due to nodes being deleted from the underlying DAG.
97 SmallVector Worklist;
98
99 /// \brief Mapping from an SDNode to its position on the worklist.
100 ///
101 /// This is used to find and remove nodes from the worklist (by nulling
102 /// them) when they are deleted from the underlying DAG. It relies on
103 /// stable indices of nodes within the worklist.
104 DenseMap WorklistMap;
108105
109106 // AA - Used for DAG load/store alias analysis.
110107 AliasAnalysis &AA;
131128 if (N->getOpcode() == ISD::HANDLENODE)
132129 return;
133130
134 WorklistContents.insert(N);
135 WorklistOrder.push_back(N);
131 if (WorklistMap.insert(std::make_pair(N, Worklist.size())).second)
132 Worklist.push_back(N);
136133 }
137134
138135 /// removeFromWorklist - remove all instances of N from the worklist.
139136 ///
140137 void removeFromWorklist(SDNode *N) {
141 WorklistContents.erase(N);
142 }
138 auto It = WorklistMap.find(N);
139 if (It == WorklistMap.end())
140 return; // Not in the worklist.
141
142 // Null out the entry rather than erasing it to avoid a linear operation.
143 Worklist[It->second] = nullptr;
144 WorklistMap.erase(It);
145 }
146
147 bool recursivelyDeleteUnusedNodes(SDNode *N);
143148
144149 SDValue CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
145150 bool AddTo = true);
10711076 return false;
10721077 }
10731078
1079 /// \brief Recursively delete a node which has no uses and any operands for
1080 /// which it is the only use.
1081 ///
1082 /// Note that this both deletes the nodes and removes them from the worklist.
1083 /// It also adds any nodes who have had a user deleted to the worklist as they
1084 /// may now have only one use and subject to other combines.
1085 bool DAGCombiner::recursivelyDeleteUnusedNodes(SDNode *N) {
1086 if (!N->use_empty())
1087 return false;
1088
1089 SmallSetVector Nodes;
1090 Nodes.insert(N);
1091 do {
1092 N = Nodes.pop_back_val();
1093 if (!N)
1094 continue;
1095
1096 if (N->use_empty()) {
1097 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
1098 Nodes.insert(N->getOperand(i).getNode());
1099
1100 removeFromWorklist(N);
1101 DAG.DeleteNode(N);
1102 } else {
1103 AddToWorklist(N);
1104 }
1105 } while (!Nodes.empty());
1106 return true;
1107 }
10741108
10751109 //===----------------------------------------------------------------------===//
10761110 // Main DAG Combiner implementation
10981132
10991133 // while the worklist isn't empty, find a node and
11001134 // try and combine it.
1101 while (!WorklistContents.empty()) {
1135 while (!WorklistMap.empty()) {
11021136 SDNode *N;
1103 // The WorklistOrder holds the SDNodes in order, but it may contain
1104 // duplicates.
1105 // In order to avoid a linear scan, we use a set (O(log N)) to hold what the
1106 // worklist *should* contain, and check the node we want to visit is should
1107 // actually be visited.
1137 // The Worklist holds the SDNodes in order, but it may contain null entries.
11081138 do {
1109 N = WorklistOrder.pop_back_val();
1110 } while (!WorklistContents.erase(N));
1139 N = Worklist.pop_back_val();
1140 } while (!N);
1141
1142 bool GoodWorklistEntry = WorklistMap.erase(N);
1143 (void)GoodWorklistEntry;
1144 assert(GoodWorklistEntry &&
1145 "Found a worklist entry without a corresponding map entry!");
11111146
11121147 // If N has no uses, it is dead. Make sure to revisit all N's operands once
11131148 // N is deleted from the DAG, since they too may now be dead or may have a
11141149 // reduced number of uses, allowing other xforms.
1115 if (N->use_empty()) {
1116 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
1117 AddToWorklist(N->getOperand(i).getNode());
1118
1119 DAG.DeleteNode(N);
1150 if (recursivelyDeleteUnusedNodes(N))
11201151 continue;
1121 }
1152
1153 WorklistRemover DeadNodes(*this);
11221154
11231155 SDValue RV = combine(N);
11241156
11461178
11471179 // Transfer debug value.
11481180 DAG.TransferDbgValues(SDValue(N, 0), RV);
1149 WorklistRemover DeadNodes(*this);
11501181 if (N->getNumValues() == RV.getNode()->getNumValues())
11511182 DAG.ReplaceAllUsesWith(N, RV.getNode());
11521183 else {
11601191 AddToWorklist(RV.getNode());
11611192 AddUsersToWorklist(RV.getNode());
11621193
1163 // Add any uses of the old node to the worklist in case this node is the
1164 // last one that uses them. They may become dead after this node is
1165 // deleted.
1166 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
1167 AddToWorklist(N->getOperand(i).getNode());
1168
11691194 // Finally, if the node is now dead, remove it from the graph. The node
11701195 // may not be dead if the replacement process recursively simplified to
1171 // something else needing this node.
1172 if (N->use_empty()) {
1173 // Nodes can be reintroduced into the worklist. Make sure we do not
1174 // process a node that has been replaced.
1175 removeFromWorklist(N);
1176
1177 // Finally, since the node is now dead, remove it from the graph.
1178 DAG.DeleteNode(N);
1179 }
1196 // something else needing this node. This will also take care of adding any
1197 // operands which have lost a user to the worklist.
1198 recursivelyDeleteUnusedNodes(N);
11801199 }
11811200
11821201 // If the root changed (e.g. it was a dead load, update the root).
166166 define void @test_varsize(...) minsize {
167167 ; CHECK-T1-LABEL: test_varsize:
168168 ; CHECK-T1: sub sp, #16
169 ; CHECK-T1: push {r2, r3, r4, r5, r7, lr}
170 ; ...
171 ; CHECK-T1: pop {r2, r3, r4, r5, r7}
169 ; CHECK-T1: push {r5, r6, r7, lr}
170 ; ...
171 ; CHECK-T1: pop {r2, r3, r7}
172172 ; CHECK-T1: pop {r3}
173173 ; CHECK-T1: add sp, #16
174174 ; CHECK-T1: bx r3
88
99 define signext i8 @test1(i32 %A) {
1010 ; CHECK: test1
11 ; CHECK: sxtb r0, r0, ror #8
11 ; CHECK: lsr r0, r0, #8
12 ; CHECK: sxtb r0, r0
1213 %B = lshr i32 %A, 8
1314 %C = shl i32 %A, 24
1415 %D = or i32 %B, %C
2323 }
2424
2525 ; CHECK-LABEL: foo:
26 ; CHECK: lfd 1
27 ; CHECK: lfd 2
2628 ; CHECK: lfd 3
2729 ; CHECK: lfd 4
28 ; CHECK: lfd 1
29 ; CHECK: lfd 2
3030
3131 define { float, float } @oof() nounwind {
3232 entry:
3434 br i1 %lnot.i.i16.i23, label %return, label %lor.rhs.i.i49
3535
3636 ; CHECK: .LBB0_7:
37 ; CHECK: beq 1, .LBB0_10
37 ; CHECK: bne 1, .LBB0_10
3838 ; CHECK: beq 0, .LBB0_10
3939 ; CHECK: .LBB0_9:
4040
22 ;CHECK: EXPORT T{{[0-9]}}.XYZW
33 ;CHECK: EXPORT T{{[0-9]}}.0000
44 ;CHECK: EXPORT T{{[0-9]}}.0000
5 ;CHECK: EXPORT T{{[0-9]}}.0XZW
5 ;CHECK: EXPORT T{{[0-9]}}.0XYZ
66 ;CHECK: EXPORT T{{[0-9]}}.XYZW
7 ;CHECK: EXPORT T{{[0-9]}}.YX00
7 ;CHECK: EXPORT T{{[0-9]}}.YZ00
88 ;CHECK: EXPORT T{{[0-9]}}.0000
99 ;CHECK: EXPORT T{{[0-9]}}.0000
1010
9393
9494 ; EG-CHECK: @main2
9595 ; EG-CHECK: T{{[0-9]+}}.XY__
96 ; EG-CHECK: T{{[0-9]+}}.YXZ0
96 ; EG-CHECK: T{{[0-9]+}}.ZXY0
9797
9898 define void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
9999 main_body:
99
1010 define signext i8 @test1(i32 %A) {
1111 ; CHECK: test1
12 ; CHECK: sxtb.w r0, r0, ror #8
12 ; CHECK: lsrs r0, r0, #8
13 ; CHECK: sxtb r0, r0
1314 %B = lshr i32 %A, 8
1415 %C = shl i32 %A, 24
1516 %D = or i32 %B, %C
2424
2525 define zeroext i32 @test3(i32 %A.u) {
2626 ; A8: test3
27 ; A8: uxth.w r0, r0, ror #8
27 ; A8: ubfx r0, r0, #8, #16
2828 %B.u = lshr i32 %A.u, 8
2929 %C.u = shl i32 %A.u, 24
3030 %D.u = or i32 %B.u, %C.u
+0
-14
test/CodeGen/X86/avx512-zext-load-crash.ll less more
None ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
1
2 define <8 x i16> @test_zext_load() {
3 ; CHECK: vmovq
4 entry:
5 %0 = load <2 x i16> ** undef, align 8
6 %1 = getelementptr inbounds <2 x i16>* %0, i64 1
7 %2 = load <2 x i16>* %0, align 1
8 %3 = shufflevector <2 x i16> %2, <2 x i16> undef, <8 x i32>
9 %4 = load <2 x i16>* %1, align 1
10 %5 = shufflevector <2 x i16> %4, <2 x i16> undef, <8 x i32>
11 %6 = shufflevector <8 x i16> %3, <8 x i16> %5, <8 x i32>
12 ret <8 x i16> %6
13 }
234234
235235 exit:
236236 ret i32 %base
237 }
238
239 define void @test_loop_rotate_reversed_blocks() {
240 ; This test case (greatly reduced from an Olden benchmark) ensures that the loop
241 ; rotate implementation doesn't assume that loops are laid out in a particular
242 ; order. The first loop will get split into two basic blocks, with the loop
243 ; header coming after the loop latch.
244 ;
245 ; CHECK: test_loop_rotate_reversed_blocks
246 ; CHECK: %entry
247 ; Look for a jump into the middle of the loop, and no branches mid-way.
248 ; CHECK: jmp
249 ; CHECK: %loop1
250 ; CHECK-NOT: j{{\w*}} .LBB{{.*}}
251 ; CHECK: %loop1
252 ; CHECK: je
253
254 entry:
255 %cond1 = load volatile i1* undef
256 br i1 %cond1, label %loop2.preheader, label %loop1
257
258 loop1:
259 call i32 @f()
260 %cond2 = load volatile i1* undef
261 br i1 %cond2, label %loop2.preheader, label %loop1
262
263 loop2.preheader:
264 call i32 @f()
265 %cond3 = load volatile i1* undef
266 br i1 %cond3, label %exit, label %loop2
267
268 loop2:
269 call i32 @f()
270 %cond4 = load volatile i1* undef
271 br i1 %cond4, label %exit, label %loop2
272
273 exit:
274 ret void
275237 }
276238
277239 define i32 @test_loop_align(i32 %i, i32* %a) {
3030 ; CHECK-LABEL: test3:
3131 ; CHECK: movzbl 8(%esp), %eax
3232 ; CHECK-NEXT: imull $171, %eax
33 ; CHECK-NEXT: andl $65024, %eax
3334 ; CHECK-NEXT: shrl $9, %eax
3435 ; CHECK-NEXT: ret
3536 }
5556 %div = sdiv i16 %x, 10
5657 ret i16 %div
5758 ; CHECK-LABEL: test6:
58 ; CHECK: imull $26215, %eax, %ecx
59 ; CHECK: sarl $18, %ecx
60 ; CHECK: shrl $15, %eax
59 ; CHECK: imull $26215, %eax
60 ; CHECK: movl %eax, %ecx
61 ; CHECK: shrl $31, %ecx
62 ; CHECK: sarl $18, %eax
6163 }
6264
6365 define i32 @test7(i32 %x) nounwind {
+0
-117
test/CodeGen/X86/fold-pcmpeqd-0.ll less more
None ; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck --check-prefix=X86-64 %s
1 ; DISABLED: llc < %s -mtriple=i386-apple-darwin -mcpu=yonah -regalloc=linearscan | FileCheck --check-prefix=I386 %s
2
3 ; i386 test has been disabled when scheduler 2-addr hack is disabled.
4
5 ; This testcase shouldn't need to spill the -1 value,
6 ; so it should just use pcmpeqd to materialize an all-ones vector.
7 ; For i386, cp load of -1 are folded.
8
9 ; With -regalloc=greedy, the live range is split before spilling, so the first
10 ; pcmpeq doesn't get folded as a constant pool load.
11
12 ; I386-NOT: pcmpeqd
13 ; I386: orps LCPI0_2, %xmm
14 ; I386-NOT: pcmpeqd
15 ; I386: orps LCPI0_2, %xmm
16
17 ; X86-64: pcmpeqd
18 ; X86-64-NOT: pcmpeqd
19
20 %struct.__ImageExecInfo = type <{ <4 x i32>, <4 x float>, <2 x i64>, i8*, i8*, i8*, i32, i32, i32, i32, i32 }>
21 %struct._cl_image_format_t = type <{ i32, i32, i32 }>
22 %struct._image2d_t = type <{ i8*, %struct._cl_image_format_t, i32, i32, i32, i32, i32, i32 }>
23
24 define void @program_1(%struct._image2d_t* %dest, %struct._image2d_t* %t0, <4 x float> %p0, <4 x float> %p1, <4 x float> %p4, <4 x float> %p5, <4 x float> %p6) nounwind {
25 entry:
26 %tmp3.i = load i32* null ; [#uses=1]
27 %cmp = icmp sgt i32 %tmp3.i, 200 ; [#uses=1]
28 br i1 %cmp, label %forcond, label %ifthen
29
30 ifthen: ; preds = %entry
31 ret void
32
33 forcond: ; preds = %entry
34 %tmp3.i536 = load i32* null ; [#uses=1]
35 %cmp12 = icmp slt i32 0, %tmp3.i536 ; [#uses=1]
36 br i1 %cmp12, label %forbody, label %afterfor
37
38 forbody: ; preds = %forcond
39 %bitcast204.i313 = bitcast <4 x i32> zeroinitializer to <4 x float> ; <<4 x float>> [#uses=1]
40 %mul233 = fmul <4 x float> %bitcast204.i313, zeroinitializer ; <<4 x float>> [#uses=1]
41 %mul257 = fmul <4 x float> %mul233, zeroinitializer ; <<4 x float>> [#uses=1]
42 %mul275 = fmul <4 x float> %mul257, zeroinitializer ; <<4 x float>> [#uses=1]
43 %tmp51 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %mul275, <4 x float> zeroinitializer) nounwind ; <<4 x float>> [#uses=1]
44 %bitcast198.i182 = bitcast <4 x float> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=0]
45 %bitcast204.i185 = bitcast <4 x i32> zeroinitializer to <4 x float> ; <<4 x float>> [#uses=1]
46 %tmp69 = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> zeroinitializer) nounwind ; <<4 x i32>> [#uses=1]
47 %tmp70 = call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %tmp69) nounwind ; <<4 x float>> [#uses=1]
48 %sub140.i78 = fsub <4 x float> zeroinitializer, %tmp70 ; <<4 x float>> [#uses=2]
49 %mul166.i86 = fmul <4 x float> zeroinitializer, %sub140.i78 ; <<4 x float>> [#uses=1]
50 %add167.i87 = fadd <4 x float> %mul166.i86, < float 0x3FE62ACB60000000, float 0x3FE62ACB60000000, float 0x3FE62ACB60000000, float 0x3FE62ACB60000000 > ; <<4 x float>> [#uses=1]
51 %mul171.i88 = fmul <4 x float> %add167.i87, %sub140.i78 ; <<4 x float>> [#uses=1]
52 %add172.i89 = fadd <4 x float> %mul171.i88, < float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000 > ; <<4 x float>> [#uses=1]
53 %bitcast176.i90 = bitcast <4 x float> %add172.i89 to <4 x i32> ; <<4 x i32>> [#uses=1]
54 %andnps178.i92 = and <4 x i32> %bitcast176.i90, zeroinitializer ; <<4 x i32>> [#uses=1]
55 %bitcast179.i93 = bitcast <4 x i32> %andnps178.i92 to <4 x float> ; <<4 x float>> [#uses=1]
56 %mul186.i96 = fmul <4 x float> %bitcast179.i93, zeroinitializer ; <<4 x float>> [#uses=1]
57 %bitcast190.i98 = bitcast <4 x float> %mul186.i96 to <4 x i32> ; <<4 x i32>> [#uses=1]
58 %andnps192.i100 = and <4 x i32> %bitcast190.i98, zeroinitializer ; <<4 x i32>> [#uses=1]
59 %xorps.i102 = xor <4 x i32> zeroinitializer, < i32 -1, i32 -1, i32 -1, i32 -1 > ; <<4 x i32>> [#uses=1]
60 %orps203.i103 = or <4 x i32> %andnps192.i100, %xorps.i102 ; <<4 x i32>> [#uses=1]
61 %bitcast204.i104 = bitcast <4 x i32> %orps203.i103 to <4 x float> ; <<4 x float>> [#uses=1]
62 %cmple.i = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> zeroinitializer, <4 x float> %tmp51, i8 2) nounwind ; <<4 x float>> [#uses=1]
63 %tmp80 = call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> zeroinitializer) nounwind ; <<4 x float>> [#uses=1]
64 %sub140.i = fsub <4 x float> zeroinitializer, %tmp80 ; <<4 x float>> [#uses=1]
65 %bitcast148.i = bitcast <4 x float> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=1]
66 %andnps150.i = and <4 x i32> %bitcast148.i, < i32 -2139095041, i32 -2139095041, i32 -2139095041, i32 -2139095041 > ; <<4 x i32>> [#uses=0]
67 %mul171.i = fmul <4 x float> zeroinitializer, %sub140.i ; <<4 x float>> [#uses=1]
68 %add172.i = fadd <4 x float> %mul171.i, < float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000 > ; <<4 x float>> [#uses=1]
69 %bitcast176.i = bitcast <4 x float> %add172.i to <4 x i32> ; <<4 x i32>> [#uses=1]
70 %andnps178.i = and <4 x i32> %bitcast176.i, zeroinitializer ; <<4 x i32>> [#uses=1]
71 %bitcast179.i = bitcast <4 x i32> %andnps178.i to <4 x float> ; <<4 x float>> [#uses=1]
72 %mul186.i = fmul <4 x float> %bitcast179.i, zeroinitializer ; <<4 x float>> [#uses=1]
73 %bitcast189.i = bitcast <4 x float> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=0]
74 %bitcast190.i = bitcast <4 x float> %mul186.i to <4 x i32> ; <<4 x i32>> [#uses=1]
75 %andnps192.i = and <4 x i32> %bitcast190.i, zeroinitializer ; <<4 x i32>> [#uses=1]
76 %bitcast198.i = bitcast <4 x float> %cmple.i to <4 x i32> ; <<4 x i32>> [#uses=1]
77 %xorps.i = xor <4 x i32> %bitcast198.i, < i32 -1, i32 -1, i32 -1, i32 -1 > ; <<4 x i32>> [#uses=1]
78 %orps203.i = or <4 x i32> %andnps192.i, %xorps.i ; <<4 x i32>> [#uses=1]
79 %bitcast204.i = bitcast <4 x i32> %orps203.i to <4 x float> ; <<4 x float>> [#uses=1]
80 %mul307 = fmul <4 x float> %bitcast204.i185, zeroinitializer ; <<4 x float>> [#uses=1]
81 %mul310 = fmul <4 x float> %bitcast204.i104, zeroinitializer ; <<4 x float>> [#uses=2]
82 %mul313 = fmul <4 x float> %bitcast204.i, zeroinitializer ; <<4 x float>> [#uses=1]
83 %tmp82 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %mul307, <4 x float> zeroinitializer) nounwind ; <<4 x float>> [#uses=1]
84 %bitcast11.i15 = bitcast <4 x float> %tmp82 to <4 x i32> ; <<4 x i32>> [#uses=1]
85 %andnps.i17 = and <4 x i32> %bitcast11.i15, zeroinitializer ; <<4 x i32>> [#uses=1]
86 %orps.i18 = or <4 x i32> %andnps.i17, zeroinitializer ; <<4 x i32>> [#uses=1]
87 %bitcast17.i19 = bitcast <4 x i32> %orps.i18 to <4 x float> ; <<4 x float>> [#uses=1]
88 %tmp83 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %mul310, <4 x float> zeroinitializer) nounwind ; <<4 x float>> [#uses=1]
89 %bitcast.i3 = bitcast <4 x float> %mul310 to <4 x i32> ; <<4 x i32>> [#uses=1]
90 %bitcast6.i4 = bitcast <4 x float> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=2]
91 %andps.i5 = and <4 x i32> %bitcast.i3, %bitcast6.i4 ; <<4 x i32>> [#uses=1]
92 %bitcast11.i6 = bitcast <4 x float> %tmp83 to <4 x i32> ; <<4 x i32>> [#uses=1]
93 %not.i7 = xor <4 x i32> %bitcast6.i4, < i32 -1, i32 -1, i32 -1, i32 -1 > ; <<4 x i32>> [#uses=1]
94 %andnps.i8 = and <4 x i32> %bitcast11.i6, %not.i7 ; <<4 x i32>> [#uses=1]
95 %orps.i9 = or <4 x i32> %andnps.i8, %andps.i5 ; <<4 x i32>> [#uses=1]
96 %bitcast17.i10 = bitcast <4 x i32> %orps.i9 to <4 x float> ; <<4 x float>> [#uses=1]
97 %bitcast.i = bitcast <4 x float> %mul313 to <4 x i32> ; <<4 x i32>> [#uses=1]
98 %andps.i = and <4 x i32> %bitcast.i, zeroinitializer ; <<4 x i32>> [#uses=1]
99 %orps.i = or <4 x i32> zeroinitializer, %andps.i ; <<4 x i32>> [#uses=1]
100 %bitcast17.i = bitcast <4 x i32> %orps.i to <4 x float> ; <<4 x float>> [#uses=1]
101 call void null(<4 x float> %bitcast17.i19, <4 x float> %bitcast17.i10, <4 x float> %bitcast17.i, <4 x float> zeroinitializer, %struct.__ImageExecInfo* null, <4 x i32> zeroinitializer) nounwind
102 unreachable
103
104 afterfor: ; preds = %forcond
105 ret void
106 }
107
108 declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone
109
110 declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>) nounwind readnone
111
112 declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone
113
114 declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
115
116 declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
2929 ret void
3030 }
3131
32
33 ; DAGCombiner shouldn't fold the sdiv (ashr) away.
34 ; rdar://8636812
35 ; CHECK-LABEL: test2:
36 ; CHECK: sarl
37
38 define i32 @test2() nounwind {
39 entry:
40 %i = alloca i32, align 4
41 %j = alloca i8, align 1
42 store i32 127, i32* %i, align 4
43 store i8 0, i8* %j, align 1
44 %tmp3 = load i32* %i, align 4
45 %mul = mul nsw i32 %tmp3, 2
46 %conv4 = trunc i32 %mul to i8
47 %conv5 = sext i8 %conv4 to i32
48 %div6 = sdiv i32 %conv5, 2
49 %conv7 = trunc i32 %div6 to i8
50 %conv9 = sext i8 %conv7 to i32
51 %cmp = icmp eq i32 %conv9, -1
52 br i1 %cmp, label %if.then, label %if.end
53
54 if.then: ; preds = %entry
55 ret i32 0
56
57 if.end: ; preds = %entry
58 call void @abort() noreturn
59 unreachable
60 }
61
62 declare void @abort() noreturn
63
64 declare void @exit(i32) noreturn
65
6632 ; DAG Combiner can't fold this into a load of the 1'th byte.
6733 ; PR8757
6834 define i32 @test3(i32 *%P) nounwind ssp {
3333 ; X64: movb %sil, 1(%rdi)
3434
3535 ; X32-LABEL: test2:
36 ; X32: movb 8(%esp), %[[REG:[abcd]l]]
37 ; X32: movb %[[REG]], 1(%{{.*}})
36 ; X32: movzbl 8(%esp), %e[[REG:[abcd]]]x
37 ; X32: movb %[[REG]]l, 1(%{{.*}})
3838 }
3939
4040 define void @test3(i32* nocapture %a0, i16 zeroext %a1) nounwind ssp {
33 ; CHECK-LABEL: @t1
44 ; CHECK: movl 4(%esp), %[[R0:e[abcd]x]]
55 ; CHECK-NEXT: movl 8(%esp), %[[R1:e[abcd]x]]
6 ; CHECK-NEXT: movl 12(%[[R1]]), %[[R2:e[abcd]x]]
7 ; CHECK-NEXT: movl %[[R2]], (%[[R0]])
6 ; CHECK-NEXT: movss 12(%[[R1]]), %[[R2:xmm.*]]
7 ; CHECK-NEXT: movss %[[R2]], (%[[R0]])
88 ; CHECK-NEXT: retl
99
1010 %X = load <4 x float>* %P1