llvm.org GIT mirror llvm / d24d326
[SDAG] Introduce a combined set to the DAG combiner which tracks nodes which have successfully round-tripped through the combine phase, and use this to ensure all operands to DAG nodes are visited by the combiner, even if they are only added during the combine phase.

This is critical to have the combiner reach nodes that are *introduced* during combining. Previously these would sometimes be visited and sometimes not be visited based on whether they happened to end up on the worklist or not. Now we always run them through the combiner.

This fixes quite a few bad codegen test cases lurking in the suite while also being more principled. Among these, the TLS code generation is particularly exciting for programs that have this in the critical path, like TSan-instrumented binaries (although I think they engineer to use a different TLS that is faster anyways).

I've tried to check for compile-time regressions here by running llc over a merged (but not LTO-ed) clang bitcode file and observed at most a 3% slowdown in llc. Given that this is essentially a worst case (none of opt or clang are running at this phase) I think this is tolerable. The actual LTO case should be even less costly, and the cost in normal compilation should be negligible.

With this combining logic, it is possible to re-legalize as we combine, which is necessary to implement PSHUFB formation on x86 as a post-legalize DAG combine (my ultimate goal).

Differential Revision: http://reviews.llvm.org/D4638

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@213898 91177308-0d34-0410-b5e6-96231b3b80d8

Chandler Carruth 5 years ago
19 changed file(s) with 101 addition(s) and 143 deletion(s).
103103 /// stable indices of nodes within the worklist.
104104 DenseMap&lt;SDNode *, unsigned&gt; WorklistMap;
105105
106 /// \brief Set of nodes which have been combined (at least once).
107 ///
108 /// This is used to allow us to reliably add any operands of a DAG node
109 /// which have not yet been combined to the worklist.
110 SmallPtrSet&lt;SDNode *, 64&gt; CombinedNodes;
111
106112 // AA - Used for DAG load/store alias analysis.
107113 AliasAnalysis &AA;
108114
135141 /// removeFromWorklist - remove all instances of N from the worklist.
136142 ///
137143 void removeFromWorklist(SDNode *N) {
144 CombinedNodes.erase(N);
145
138146 auto It = WorklistMap.find(N);
139147 if (It == WorklistMap.end())
140148 return; // Not in the worklist.
11511159 if (recursivelyDeleteUnusedNodes(N))
11521160 continue;
11531161
1162 DEBUG(dbgs() << "\nCombining: ";
1163 N->dump(&DAG));
1164
1165 // Add any operands of the new node which have not yet been combined to the
1166 // worklist as well. Because the worklist uniques things already, this
1167 // won't repeatedly process the same operand.
1168 CombinedNodes.insert(N);
1169 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
1170 if (!CombinedNodes.count(N->getOperand(i).getNode()))
1171 AddToWorklist(N->getOperand(i).getNode());
1172
11541173 WorklistRemover DeadNodes(*this);
11551174
11561175 SDValue RV = combine(N);
11711190 RV.getNode()->getOpcode() != ISD::DELETED_NODE &&
11721191 "Node was deleted but visit returned new node!");
11731192
1174 DEBUG(dbgs() << "\nReplacing.3 ";
1175 N->dump(&DAG);
1176 dbgs() << "\nWith: ";
1177 RV.getNode()->dump(&DAG);
1178 dbgs() << '\n');
1193 DEBUG(dbgs() << " ... into: ";
1194 RV.getNode()->dump(&DAG));
11791195
11801196 // Transfer debug value.
11811197 DAG.TransferDbgValues(SDValue(N, 0), RV);
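
The hunks above amount to one extra set consulted on every worklist pop. What follows is a minimal standalone sketch of that pattern, assuming plain C++ standard containers; the names Node, Combiner, addToWorklist, combine, and run are illustrative stand-ins only, not the LLVM API, which uses SDNode with SmallPtrSet/DenseMap as shown in the diff.

#include <unordered_set>
#include <vector>

struct Node {
  std::vector<Node *> Operands;
};

struct Combiner {
  std::vector<Node *> Worklist;
  std::unordered_set<Node *> InWorklist; // stands in for WorklistMap
  std::unordered_set<Node *> Combined;   // stands in for CombinedNodes

  void addToWorklist(Node *N) {
    if (InWorklist.insert(N).second) // the worklist uniques nodes
      Worklist.push_back(N);
  }

  // Placeholder for the target folds; may return a brand-new replacement node.
  Node *combine(Node *N) { return nullptr; }

  void run() {
    while (!Worklist.empty()) {
      Node *N = Worklist.back();
      Worklist.pop_back();
      InWorklist.erase(N);

      // Record that N has been through combine() at least once, and push any
      // operand that never has -- including operands of nodes that were only
      // created during combining, which previously could be skipped entirely.
      Combined.insert(N);
      for (Node *Op : N->Operands)
        if (!Combined.count(Op))
          addToWorklist(Op);

      if (Node *RV = combine(N))
        addToWorklist(RV); // revisit the replacement as well
    }
  }
};

With the combined set, whether an operand gets visited no longer depends on whether it happened to land on the worklist through some other path, which is the behavioral change the test updates below reflect.
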
+0 -46 test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll
0 ; RUN: llc -O3 < %s | FileCheck %s
1 ; RUN: llc -O3 -addr-sink-using-gep=1 < %s | FileCheck %s
2 ; Test case for a DAG combiner bug where we combined an indexed load
3 ; with an extension (sext, zext, or any) into a regular extended load,
4 ; i.e., dropping the indexed value.
5 ;
6
7 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
8 target triple = "arm64-apple-ios"
9
10 %class.A = type { i64, i64 }
11 %class.C = type { i64 }
12
13 ; CHECK-LABEL: XX:
14 ; CHECK: ldr
15 define i32 @XX(%class.A* %K, i1 %tst, i32* %addr, %class.C** %ppC, %class.C* %pC) {
16 entry:
17 br i1 %tst, label %if.then, label %lor.rhs.i
18
19 lor.rhs.i: ; preds = %entry
20 %tmp = load i32* %addr, align 4
21 %y.i.i.i = getelementptr inbounds %class.A* %K, i64 0, i32 1
22 %tmp1 = load i64* %y.i.i.i, align 8
23 %U.sroa.3.8.extract.trunc.i = trunc i64 %tmp1 to i32
24 %div11.i = sdiv i32 %U.sroa.3.8.extract.trunc.i, 17
25 %add12.i = add nsw i32 0, %div11.i
26 %U.sroa.3.12.extract.shift.i = lshr i64 %tmp1, 32
27 %U.sroa.3.12.extract.trunc.i = trunc i64 %U.sroa.3.12.extract.shift.i to i32
28 %div15.i = sdiv i32 %U.sroa.3.12.extract.trunc.i, 13
29 %add16.i = add nsw i32 %add12.i, %div15.i
30 %rem.i.i = srem i32 %add16.i, %tmp
31 %idxprom = sext i32 %rem.i.i to i64
32 %arrayidx = getelementptr inbounds %class.C** %ppC, i64 %idxprom
33 %tobool533 = icmp eq %class.C* %pC, null
34 br i1 %tobool533, label %while.end, label %while.body
35
36 if.then: ; preds = %entry
37 ret i32 42
38
39 while.body: ; preds = %lor.rhs.i
40 ret i32 5
41
42 while.end: ; preds = %lor.rhs.i
43 %tmp3 = load %class.C** %arrayidx, align 8
44 ret i32 50
45 }
9191 call arm_aapcs_vfpcc void @test_1double_misaligned([4 x double] undef, [4 x double] undef, float undef, double 1.0)
9292
9393 ; CHECK-LABEL: test_1double_misaligned:
94 ; CHECK-DAG: movw [[ONEHI:r[0-9]+]], #0
9495 ; CHECK-DAG: mov [[ONELO:r[0-9]+]], #0
95 ; CHECK-DAG: mov r[[BASE:[0-9]+]], sp
96 ; CHECK-DAG: movw [[ONEHI:r[0-9]+]], #0
9796 ; CHECK-DAG: movt [[ONEHI]], #16368
98 ; CHECK-DAG: str [[ONELO]], [r[[BASE]], #8]!
99 ; CHECK-DAG: str [[ONEHI]], [r[[BASE]], #4]
97 ; CHECK-DAG: strd [[ONELO]], [[ONEHI]], [sp, #8]
10098
10199 ; CHECK-M4F-LABEL: test_1double_misaligned:
102100 ; CHECK-M4F: movs [[ONELO:r[0-9]+]], #0
756756
757757 ; ALL-LABEL: slti6:
758758
759 ; 32-CMOV-DAG: slti [[R1:\$[0-9]+]], $4, 7
760 ; 32-CMOV-DAG: xori [[R1]], [[R1]], 1
761 ; 32-CMOV-DAG: addiu [[R2:\$[0-9]+]], [[R1]], 3
762 ; 32-CMOV-NOT: movn
763
764 ; 32-CMP-DAG: slti [[R1:\$[0-9]+]], $4, 7
765 ; 32-CMP-DAG: xori [[R1]], [[R1]], 1
766 ; 32-CMP-DAG: addiu [[R2:\$[0-9]+]], [[R1]], 3
767 ; 32-CMP-NOT: seleqz
768 ; 32-CMP-NOT: selnez
769
770 ; 64-CMOV-DAG: slti [[R1:\$[0-9]+]], $4, 7
771 ; 64-CMOV-DAG: xori [[R1]], [[R1]], 1
772 ; 64-CMOV-DAG: addiu [[R2:\$[0-9]+]], [[R1]], 3
773 ; 64-CMOV-NOT: movn
774
775 ; 64-CMP-DAG: slti [[R1:\$[0-9]+]], $4, 7
776 ; 64-CMP-DAG: xori [[R1]], [[R1]], 1
777 ; 64-CMP-DAG: addiu [[R2:\$[0-9]+]], [[R1]], 3
778 ; 64-CMP-NOT: seleqz
779 ; 64-CMP-NOT: selnez
759 ; ALL-DAG: addiu [[R1:\$[0-9]+]], $zero, 6
760 ; ALL-DAG: slt [[R1]], [[R1]], $4
761 ; ALL-DAG: addiu [[R2:\$[0-9]+]], [[R1]], 3
762 ; ALL-NOT: movn
763 ; ALL-NOT: seleqz
764 ; ALL-NOT: selnez
6969 }
7070
7171 ; SI-LABEL: @trunc_i64_add_to_i32
72 ; SI: S_LOAD_DWORDX2 s{{\[}}[[SREG0:[0-9]+]]
73 ; SI: S_LOAD_DWORDX2 s{{\[}}[[SREG1:[0-9]+]]
72 ; SI: S_LOAD_DWORD s[[SREG0:[0-9]+]]
73 ; SI: S_LOAD_DWORD s[[SREG1:[0-9]+]]
7474 ; SI: S_ADD_I32 [[SRESULT:s[0-9]+]], s[[SREG1]], s[[SREG0]]
7575 ; SI-NOT: ADDC
7676 ; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
115115 }
116116
117117 ; SI-LABEL: @trunc_i64_or_to_i32
118 ; SI: S_LOAD_DWORDX2 s{{\[}}[[SREG0:[0-9]+]]
119 ; SI: S_LOAD_DWORDX2 s{{\[}}[[SREG1:[0-9]+]]
120 ; SI: S_OR_B32 [[SRESULT:s[0-9]+]], s[[SREG1]], s[[SREG0]]
121 ; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
118 ; SI: S_LOAD_DWORD s[[SREG0:[0-9]+]]
119 ; SI: S_LOAD_DWORD s[[SREG1:[0-9]+]]
120 ; SI: S_OR_B32 s[[SRESULT:[0-9]+]], s[[SREG1]], s[[SREG0]]
121 ; SI: V_MOV_B32_e32 [[VRESULT:v[0-9]+]], s[[SRESULT]]
122122 ; SI: BUFFER_STORE_DWORD [[VRESULT]],
123123 define void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
124124 %add = or i64 %b, %a
66 %tmp1 = bitcast double %a to <8 x i8>
77 %tmp2 = bitcast double %b to <8 x i8>
88 %tmp3 = add <8 x i8> %tmp1, %tmp2
9 ; CHECK: paddw
9 ; CHECK: paddb
1010 store <8 x i8> %tmp3, <8 x i8>* null
1111 ret void
1212 }
1717 %tmp1 = bitcast double %a to <4 x i16>
1818 %tmp2 = bitcast double %b to <4 x i16>
1919 %tmp3 = add <4 x i16> %tmp1, %tmp2
20 ; CHECK: paddd
20 ; CHECK: paddw
2121 store <4 x i16> %tmp3, <4 x i16>* null
2222 ret void
2323 }
2828 %tmp1 = bitcast double %a to <2 x i32>
2929 %tmp2 = bitcast double %b to <2 x i32>
3030 %tmp3 = add <2 x i32> %tmp1, %tmp2
31 ; CHECK: paddq
31 ; CHECK: paddd
3232 store <2 x i32> %tmp3, <2 x i32>* null
3333 ret void
3434 }
22
33 declare {i8, i1} @llvm.umul.with.overflow.i8(i8 %a, i8 %b)
44 define i8 @testumulo(i32 %argc) {
5 ; CHECK: imulw
5 ; CHECK: imull
66 ; CHECK: testb %{{.+}}, %{{.+}}
77 ; CHECK: je [[NOOVERFLOWLABEL:.+]]
88 ; CHECK: {{.*}}[[NOOVERFLOWLABEL]]:
283283 define i32 @func_test1(i32 %p1) nounwind uwtable {
284284 entry:
285285 ; CHECK-LABEL: func_test1:
286 ; CHECK: testb
286 ; CHECK: andb
287287 ; CHECK: j
288288 ; CHECK: ret
289289 %0 = load i32* @b, align 4
6767 %2 = bitcast <2 x i32> %add to i64
6868 ret i64 %2
6969 }
70 ; FIXME: At the moment we still produce the sequence pshufd+paddq+pshufd.
70 ; FIXME: At the moment we still produce the sequence pshufd+paddd+pshufd.
7171 ; Ideally, we should fold that sequence into a single paddd. This is fixed with
7272 ; the widening legalization.
7373 ;
7474 ; CHECK-LABEL: test4
7575 ; CHECK: pshufd
76 ; CHECK-NEXT: paddq
76 ; CHECK-NEXT: paddd
7777 ; CHECK-NEXT: pshufd
7878 ; CHECK: ret
7979 ;
4747
4848 ; CHECK: test3
4949 ; CHECK: movzbl
50 ; CHECK: shrl
51 ; CHECK: andl $1
52 ; CHECK: andl $1
53 ; CHECK: vmovd
54 ; CHECK: pinsrd $1
55 ; CHECK: shrl $2
56 ; CHECK: andl $1
57 ; CHECK: pinsrd $2
58 ; CHECK: shrl $3
59 ; CHECK: andl $1
60 ; CHECK: pinsrd $3
61 ; CHECK: pslld
62 ; CHECK: psrad
63 ; CHECK: pmovsxdq
64 ; CHECK: pmovsxdq
50 ; CHECK: movq
51 ; CHECK: shlq
52 ; CHECK: sarq
53 ; CHECK: vmovq
54 ; CHECK: movq
55 ; CHECK: shlq
56 ; CHECK: sarq
57 ; CHECK: vmovq
58 ; CHECK: vpunpcklqdq
59 ; CHECK: movq
60 ; CHECK: shlq
61 ; CHECK: sarq
62 ; CHECK: vmovq
63 ; CHECK: shlq
64 ; CHECK: sarq
65 ; CHECK: vmovq
66 ; CHECK: vpunpcklqdq
67 ; CHECK: vinsertf128
6568 ; CHECK: ret
3333 ; X64: movb %sil, 1(%rdi)
3434
3535 ; X32-LABEL: test2:
36 ; X32: movzbl 8(%esp), %e[[REG:[abcd]]]x
36 ; X32: movb 8(%esp), %[[REG:[abcd]]]l
3737 ; X32: movb %[[REG]]l, 1(%{{.*}})
3838 }
3939
6666 ; X64: movw %si, 2(%rdi)
6767
6868 ; X32-LABEL: test4:
69 ; X32: movl 8(%esp), %e[[REG:[abcd]x]]
70 ; X32: movw %[[REG]], 2(%{{.*}})
69 ; X32: movw 8(%esp), %[[REG:[abcd]]]x
70 ; X32: movw %[[REG]]x, 2(%{{.*}})
7171 }
7272
7373 define void @test5(i64* nocapture %a0, i16 zeroext %a1) nounwind ssp {
8383 ; X64: movw %si, 2(%rdi)
8484
8585 ; X32-LABEL: test5:
86 ; X32: movzwl 8(%esp), %e[[REG:[abcd]x]]
87 ; X32: movw %[[REG]], 2(%{{.*}})
86 ; X32: movw 8(%esp), %[[REG:[abcd]]]x
87 ; X32: movw %[[REG]]x, 2(%{{.*}})
8888 }
8989
9090 define void @test6(i64* nocapture %a0, i8 zeroext %a1) nounwind ssp {
3131
3232 ;CHECK-LABEL: load_2_i32:
3333 ;CHECK: pmovzxdq
34 ;CHECK: paddq
34 ;CHECK: paddd
3535 ;CHECK: pshufd
3636 ;CHECK: ret
3737 define void @load_2_i32(<2 x i32>* %A) {
5555
5656 ;CHECK-LABEL: load_4_i16:
5757 ;CHECK: pmovzxwd
58 ;CHECK: paddd
58 ;CHECK: paddw
5959 ;CHECK: pshufb
6060 ;CHECK: ret
6161 define void @load_4_i16(<4 x i16>* %A) {
6767
6868 ;CHECK-LABEL: load_8_i8:
6969 ;CHECK: pmovzxbw
70 ;CHECK: paddw
70 ;CHECK: paddb
7171 ;CHECK: pshufb
7272 ;CHECK: ret
7373 define void @load_8_i8(<8 x i8>* %A) {
121121 ; SSE41-LABEL: test8:
122122 ; SSE41: pmuldq
123123 ; SSE41: pshufd $49
124 ; SSE41-NOT: pshufd $49
124 ; SSE41: pshufd $49
125125 ; SSE41: pmuldq
126126 ; SSE41: shufps $-35
127127 ; SSE41: pshufd $-40
133133 ; SSE-LABEL: test8:
134134 ; SSE: pmuludq
135135 ; SSE: pshufd $49
136 ; SSE-NOT: pshufd $49
136 ; SSE: pshufd $49
137137 ; SSE: pmuludq
138138 ; SSE: shufps $-35
139139 ; SSE: pshufd $-40
146146 ; AVX-LABEL: test8:
147147 ; AVX: vpmuldq
148148 ; AVX: vpshufd $49
149 ; AVX-NOT: vpshufd $49
149 ; AVX: vpshufd $49
150150 ; AVX: vpmuldq
151151 ; AVX: vshufps $-35
152152 ; AVX: vpshufd $-40
161161 ret <8 x i32> %div
162162
163163 ; AVX-LABEL: test9:
164 ; AVX: vpalignr $4
165 ; AVX: vpbroadcastd
166 ; AVX: vpmuldq
167 ; AVX: vpmuldq
164 ; AVX: vpbroadcastd
165 ; AVX: vpalignr $4
166 ; AVX: vpalignr $4
167 ; AVX: vpmuldq
168 ; AVX: vpmuldq
169 ; AVX: vpalignr $4
168170 ; AVX: vpblendd $170
169171 ; AVX: vpadd
170172 ; AVX: vpsrld $31
194196 ret <8 x i32> %rem
195197
196198 ; AVX-LABEL: test11:
197 ; AVX: vpalignr $4
198 ; AVX: vpbroadcastd
199 ; AVX: vpmuldq
200 ; AVX: vpmuldq
199 ; AVX: vpbroadcastd
200 ; AVX: vpalignr $4
201 ; AVX: vpalignr $4
202 ; AVX: vpmuldq
203 ; AVX: vpmuldq
204 ; AVX: vpalignr $4
201205 ; AVX: vpblendd $170
202206 ; AVX: vpadd
203207 ; AVX: vpsrld $31
11 ; RUN: llc -march=x86 -mcpu=atom < %s | FileCheck -check-prefix=ATOM %s
22
33 ; CHECK: movl
4 ; CHECK: paddd
4 ; CHECK: paddw
55 ; CHECK: movlpd
66
77 ; The scheduler produces a different instruction order
88 ; ATOM: movl
9 ; ATOM: paddd
9 ; ATOM: paddw
1010 ; ATOM: movlpd
1111
1212 ; bitcast a v4i16 to v2i32
0 ; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
1 ; CHECK: paddq
1 ; CHECK: paddd
22
33 ; truncate v2i64 to v2i32
44
9090 %i16vec4 = type <4 x i16>
9191 define void @add4i16(%i16vec4* nocapture sret %ret, %i16vec4* %ap, %i16vec4* %bp) nounwind {
9292 ; CHECK-LABEL: add4i16:
93 ; CHECK: pmovzxwd (%{{.*}}), %[[R0:xmm[0-9]+]]
94 ; CHECK-NEXT: pmovzxwd (%{{.*}}), %[[R1:xmm[0-9]+]]
95 ; CHECK-NEXT: paddd %[[R0]], %[[R1]]
96 ; CHECK-NEXT: pshufb {{.*}}, %[[R1]]
93 ; CHECK: movq (%{{.*}}), %[[R0:xmm[0-9]+]]
94 ; CHECK-NEXT: movq (%{{.*}}), %[[R1:xmm[0-9]+]]
95 ; CHECK-NEXT: paddw %[[R0]], %[[R1]]
9796 ; CHECK-NEXT: movq %[[R1]], (%{{.*}})
9897 %a = load %i16vec4* %ap, align 16
9998 %b = load %i16vec4* %bp, align 16
0 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
11 @tm_nest_level = internal thread_local global i32 0
22 define i64 @z() nounwind {
3 ; FIXME: The codegen here is primitive at best and could be much better.
4 ; The add and the moves can be folded together.
5 ; CHECK-DAG: movq $tm_nest_level@TPOFF, %rcx
6 ; CHECK-DAG: movq %fs:0, %rax
7 ; CHECK: addl %ecx, %eax
3 ; CHECK: movq $tm_nest_level@TPOFF, %r[[R0:[abcd]]]x
4 ; CHECK-NEXT: addl %fs:0, %e[[R0]]x
5 ; CHECK-NEXT: andq $100, %r[[R0]]x
6
87 ret i64 and (i64 ptrtoint (i32* @tm_nest_level to i64), i64 100)
98 }
11
22 define <4 x float> @foo(<4 x float> %val, <4 x float> %test) nounwind {
33 ; CHECK-LABEL: LCPI0_0:
4 ; CHECK-NEXT: .long 1065353216 ## float 1.000000e+00
5 ; CHECK-NEXT: .long 1065353216 ## float 1.000000e+00
6 ; CHECK-NEXT: .long 1065353216 ## float 1.000000e+00
7 ; CHECK-NEXT: .long 1065353216 ## float 1.000000e+00
4 ; CHECK-NEXT: .long 1065353216 ## 0x3f800000
5 ; CHECK-NEXT: .long 1065353216 ## 0x3f800000
6 ; CHECK-NEXT: .long 1065353216 ## 0x3f800000
7 ; CHECK-NEXT: .long 1065353216 ## 0x3f800000
88 ; CHECK-LABEL: foo:
99 ; CHECK: cmpeqps %xmm1, %xmm0
1010 ; CHECK-NEXT: andps LCPI0_0(%rip), %xmm0
5858 ; scalar value like what the zext creates.
5959 define <4 x float> @foo3(<4 x float> %val, <4 x float> %test) nounwind {
6060 ; CHECK-LABEL: LCPI3_0:
61 ; CHECK-NEXT: .long 1065353216 ## float 1.000000e+00
62 ; CHECK-NEXT: .long 0 ## float 0.000000e+00
63 ; CHECK-NEXT: .long 1065353216 ## float 1.000000e+00
64 ; CHECK-NEXT: .long 0 ## float 0.000000e+00
61 ; CHECK-NEXT: .long 1065353216 ## 0x3f800000
62 ; CHECK-NEXT: .long 0 ## 0x0
63 ; CHECK-NEXT: .long 1065353216 ## 0x3f800000
64 ; CHECK-NEXT: .long 0 ## 0x0
6565 ; CHECK-LABEL: foo3:
6666 ; CHECK: cmpeqps %xmm1, %xmm0
6767 ; CHECK-NEXT: andps LCPI3_0(%rip), %xmm0