llvm.org GIT mirror, llvm / 2e6b938

[X86] Automatically generate stack folding tests. NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@369876 91177308-0d34-0410-b5e6-96231b3b80d8
Author: Amaury Sechet, 22 days ago

22 changed file(s) with 19920 addition(s) and 3754 deletion(s).
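All of the new CHECK blocks below were emitted by the script named in each file's NOTE line, utils/update_llc_test_checks.py. As a rough sketch of how one of these files is regenerated (the --llc-binary flag, the build/bin path, and the test path are assumptions inferred from the diffs, not part of this commit):

    $ utils/update_llc_test_checks.py --llc-binary=build/bin/llc \
          test/CodeGen/X86/stack-folding-3dnow.ll

The script runs each RUN line through llc and rewrites the CHECK lines in place, which is why the hand-written one-instruction patterns are replaced below by full, exact assembly sequences.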
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+3dnow | FileCheck %s

 define x86_mmx @stack_fold_pavgusb(x86_mmx %a, x86_mmx %b) {
-;CHECK-LABEL: stack_fold_pavgusb
-;CHECK: pavgusb {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+; CHECK-LABEL: stack_fold_pavgusb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pavgusb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
+; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.3dnow.pavgusb(x86_mmx %a, x86_mmx %b) nounwind readnone
   ret x86_mmx %2
 declare x86_mmx @llvm.x86.3dnow.pavgusb(x86_mmx, x86_mmx) nounwind readnone

 define x86_mmx @stack_fold_pf2id(x86_mmx %a) {
-;CHECK-LABEL: stack_fold_pf2id
-;CHECK: pf2id {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+; CHECK-LABEL: stack_fold_pf2id:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pf2id {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
+; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.3dnow.pf2id(x86_mmx %a) nounwind readnone
   ret x86_mmx %2
 declare x86_mmx @llvm.x86.3dnow.pf2id(x86_mmx) nounwind readnone

 define x86_mmx @stack_fold_pf2iw(x86_mmx %a) {
-;CHECK-LABEL: stack_fold_pf2iw
-;CHECK: pf2iw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+; CHECK-LABEL: stack_fold_pf2iw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pf2iw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
+; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.3dnowa.pf2iw(x86_mmx %a) nounwind readnone
   ret x86_mmx %2
 declare x86_mmx @llvm.x86.3dnowa.pf2iw(x86_mmx) nounwind readnone

 define x86_mmx @stack_fold_pfacc(x86_mmx %a, x86_mmx %b) {
-;CHECK-LABEL: stack_fold_pfacc
-;CHECK: pfacc {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+; CHECK-LABEL: stack_fold_pfacc:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pfacc {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
+; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.3dnow.pfacc(x86_mmx %a, x86_mmx %b) nounwind readnone
   ret x86_mmx %2
 declare x86_mmx @llvm.x86.3dnow.pfacc(x86_mmx, x86_mmx) nounwind readnone

 define x86_mmx @stack_fold_pfadd(x86_mmx %a, x86_mmx %b) {
-;CHECK-LABEL: stack_fold_pfadd
-;CHECK: pfadd {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+; CHECK-LABEL: stack_fold_pfadd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pfadd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
+; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.3dnow.pfadd(x86_mmx %a, x86_mmx %b) nounwind readnone
   ret x86_mmx %2
 declare x86_mmx @llvm.x86.3dnow.pfadd(x86_mmx, x86_mmx) nounwind readnone

 define x86_mmx @stack_fold_pfcmpeq(x86_mmx %a, x86_mmx %b) {
-;CHECK-LABEL: stack_fold_pfcmpeq
-;CHECK: pfcmpeq {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+; CHECK-LABEL: stack_fold_pfcmpeq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pfcmpeq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
+; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.3dnow.pfcmpeq(x86_mmx %a, x86_mmx %b) nounwind readnone
   ret x86_mmx %2
 declare x86_mmx @llvm.x86.3dnow.pfcmpeq(x86_mmx, x86_mmx) nounwind readnone

 define x86_mmx @stack_fold_pfcmpge(x86_mmx %a, x86_mmx %b) {
-;CHECK-LABEL: stack_fold_pfcmpge
-;CHECK: pfcmpge {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+; CHECK-LABEL: stack_fold_pfcmpge:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pfcmpge {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
+; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.3dnow.pfcmpge(x86_mmx %a, x86_mmx %b) nounwind readnone
   ret x86_mmx %2
 declare x86_mmx @llvm.x86.3dnow.pfcmpge(x86_mmx, x86_mmx) nounwind readnone

 define x86_mmx @stack_fold_pfcmpgt(x86_mmx %a, x86_mmx %b) {
-;CHECK-LABEL: stack_fold_pfcmpgt
-;CHECK: pfcmpgt {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+; CHECK-LABEL: stack_fold_pfcmpgt:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pfcmpgt {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
+; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.3dnow.pfcmpgt(x86_mmx %a, x86_mmx %b) nounwind readnone
   ret x86_mmx %2
 declare x86_mmx @llvm.x86.3dnow.pfcmpgt(x86_mmx, x86_mmx) nounwind readnone

 define x86_mmx @stack_fold_pfmax(x86_mmx %a, x86_mmx %b) {
-;CHECK-LABEL: stack_fold_pfmax
-;CHECK: pfmax {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+; CHECK-LABEL: stack_fold_pfmax:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pfmax {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
+; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.3dnow.pfmax(x86_mmx %a, x86_mmx %b) nounwind readnone
   ret x86_mmx %2
 declare x86_mmx @llvm.x86.3dnow.pfmax(x86_mmx, x86_mmx) nounwind readnone

 define x86_mmx @stack_fold_pfmin(x86_mmx %a, x86_mmx %b) {
-;CHECK-LABEL: stack_fold_pfmin
-;CHECK: pfmin {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+; CHECK-LABEL: stack_fold_pfmin:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pfmin {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
+; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.3dnow.pfmin(x86_mmx %a, x86_mmx %b) nounwind readnone
   ret x86_mmx %2
 declare x86_mmx @llvm.x86.3dnow.pfmin(x86_mmx, x86_mmx) nounwind readnone

 define x86_mmx @stack_fold_pfmul(x86_mmx %a, x86_mmx %b) {
-;CHECK-LABEL: stack_fold_pfmul
-;CHECK: pfmul {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+; CHECK-LABEL: stack_fold_pfmul:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pfmul {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
+; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.3dnow.pfmul(x86_mmx %a, x86_mmx %b) nounwind readnone
   ret x86_mmx %2
 declare x86_mmx @llvm.x86.3dnow.pfmul(x86_mmx, x86_mmx) nounwind readnone

 define x86_mmx @stack_fold_pfnacc(x86_mmx %a, x86_mmx %b) {
-;CHECK-LABEL: stack_fold_pfnacc
-;CHECK: pfnacc {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+; CHECK-LABEL: stack_fold_pfnacc:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pfnacc {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
+; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.3dnowa.pfnacc(x86_mmx %a, x86_mmx %b) nounwind readnone
   ret x86_mmx %2
 declare x86_mmx @llvm.x86.3dnowa.pfnacc(x86_mmx, x86_mmx) nounwind readnone

 define x86_mmx @stack_fold_pfpnacc(x86_mmx %a, x86_mmx %b) {
-;CHECK-LABEL: stack_fold_pfpnacc
-;CHECK: pfpnacc {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+; CHECK-LABEL: stack_fold_pfpnacc:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pfpnacc {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
+; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.3dnowa.pfpnacc(x86_mmx %a, x86_mmx %b) nounwind readnone
   ret x86_mmx %2
 declare x86_mmx @llvm.x86.3dnowa.pfpnacc(x86_mmx, x86_mmx) nounwind readnone

 define x86_mmx @stack_fold_pfrcp(x86_mmx %a) {
-;CHECK-LABEL: stack_fold_pfrcp
-;CHECK: pfrcp {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+; CHECK-LABEL: stack_fold_pfrcp:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pfrcp {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
+; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.3dnow.pfrcp(x86_mmx %a) nounwind readnone
   ret x86_mmx %2
 declare x86_mmx @llvm.x86.3dnow.pfrcp(x86_mmx) nounwind readnone

 define x86_mmx @stack_fold_pfrcpit1(x86_mmx %a, x86_mmx %b) {
-;CHECK-LABEL: stack_fold_pfrcpit1
-;CHECK: pfrcpit1 {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+; CHECK-LABEL: stack_fold_pfrcpit1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pfrcpit1 {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
+; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.3dnow.pfrcpit1(x86_mmx %a, x86_mmx %b) nounwind readnone
   ret x86_mmx %2
 declare x86_mmx @llvm.x86.3dnow.pfrcpit1(x86_mmx, x86_mmx) nounwind readnone

 define x86_mmx @stack_fold_pfrcpit2(x86_mmx %a, x86_mmx %b) {
-;CHECK-LABEL: stack_fold_pfrcpit2
-;CHECK: pfrcpit2 {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+; CHECK-LABEL: stack_fold_pfrcpit2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pfrcpit2 {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
+; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.3dnow.pfrcpit2(x86_mmx %a, x86_mmx %b) nounwind readnone
   ret x86_mmx %2
 declare x86_mmx @llvm.x86.3dnow.pfrcpit2(x86_mmx, x86_mmx) nounwind readnone

 define x86_mmx @stack_fold_pfrsqit1(x86_mmx %a, x86_mmx %b) {
-;CHECK-LABEL: stack_fold_pfrsqit1
-;CHECK: pfrsqit1 {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+; CHECK-LABEL: stack_fold_pfrsqit1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pfrsqit1 {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
+; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.3dnow.pfrsqit1(x86_mmx %a, x86_mmx %b) nounwind readnone
   ret x86_mmx %2
 declare x86_mmx @llvm.x86.3dnow.pfrsqit1(x86_mmx, x86_mmx) nounwind readnone

 define x86_mmx @stack_fold_pfrsqrt(x86_mmx %a) {
-;CHECK-LABEL: stack_fold_pfrsqrt
-;CHECK: pfrsqrt {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+; CHECK-LABEL: stack_fold_pfrsqrt:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pfrsqrt {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
+; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.3dnow.pfrsqrt(x86_mmx %a) nounwind readnone
   ret x86_mmx %2
 declare x86_mmx @llvm.x86.3dnow.pfrsqrt(x86_mmx) nounwind readnone

 define x86_mmx @stack_fold_pfsub(x86_mmx %a, x86_mmx %b) {
-;CHECK-LABEL: stack_fold_pfsub
-;CHECK: pfsub {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+; CHECK-LABEL: stack_fold_pfsub:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pfsub {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
+; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.3dnow.pfsub(x86_mmx %a, x86_mmx %b) nounwind readnone
   ret x86_mmx %2
 declare x86_mmx @llvm.x86.3dnow.pfsub(x86_mmx, x86_mmx) nounwind readnone

 define x86_mmx @stack_fold_pfsubr(x86_mmx %a, x86_mmx %b) {
-;CHECK-LABEL: stack_fold_pfsubr
-;CHECK: pfsubr {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+; CHECK-LABEL: stack_fold_pfsubr:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pfsubr {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
+; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.3dnow.pfsubr(x86_mmx %a, x86_mmx %b) nounwind readnone
   ret x86_mmx %2
 declare x86_mmx @llvm.x86.3dnow.pfsubr(x86_mmx, x86_mmx) nounwind readnone

 define x86_mmx @stack_fold_pi2fd(x86_mmx %a) {
-;CHECK-LABEL: stack_fold_pi2fd
-;CHECK: pi2fd {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+; CHECK-LABEL: stack_fold_pi2fd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pi2fd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
+; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.3dnow.pi2fd(x86_mmx %a) nounwind readnone
   ret x86_mmx %2
 declare x86_mmx @llvm.x86.3dnow.pi2fd(x86_mmx) nounwind readnone

 define x86_mmx @stack_fold_pi2fw(x86_mmx %a) {
-;CHECK-LABEL: stack_fold_pi2fw
-;CHECK: pi2fw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+; CHECK-LABEL: stack_fold_pi2fw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pi2fw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
+; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.3dnowa.pi2fw(x86_mmx %a) nounwind readnone
   ret x86_mmx %2
 declare x86_mmx @llvm.x86.3dnowa.pi2fw(x86_mmx) nounwind readnone

 define x86_mmx @stack_fold_pmulhrw(x86_mmx %a, x86_mmx %b) {
-;CHECK-LABEL: stack_fold_pmulhrw
-;CHECK: pmulhrw {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmulhrw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pmulhrw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
+; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.3dnow.pmulhrw(x86_mmx %a, x86_mmx %b) nounwind readnone
   ret x86_mmx %2
 declare x86_mmx @llvm.x86.3dnow.pmulhrw(x86_mmx, x86_mmx) nounwind readnone

 define x86_mmx @stack_fold_pswapd(x86_mmx %a) {
-;CHECK-LABEL: stack_fold_pswapd
-;CHECK: pswapd {{-?[0-9]*}}(%rsp), {{%mm[0-7]}} {{.*#+}} 8-byte Folded Reload
+; CHECK-LABEL: stack_fold_pswapd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pswapd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
+; CHECK-NEXT: # mm0 = mem[1,0]
+; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: retq
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   %2 = call x86_mmx @llvm.x86.3dnowa.pswapd(x86_mmx %a) nounwind readnone
   ret x86_mmx %2
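All of these tests share one mechanism: a side-effecting inline-asm nop clobbers most registers of the class under test, so the register allocator has to spill an input across the asm block, and the stack-folding logic should then fold the reload directly into the instruction being tested. A commented copy of the first test above illustrates the pattern (the comments are editorial annotations, not part of the commit):

 define x86_mmx @stack_fold_pavgusb(x86_mmx %a, x86_mmx %b) {
   ; The asm clobbers mm2-mm7 and also defines an MMX result (=y), so %a and
   ; %b cannot both stay live in registers across it; per the CHECK lines
   ; above, %b (%mm1) is spilled to an 8-byte stack slot.
   %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
   ; Rather than reloading %b into a register first, the spilled value is
   ; expected to be folded into pavgusb as a memory operand, which llc
   ; prints with the "8-byte Folded Reload" comment the test checks for.
   %2 = call x86_mmx @llvm.x86.3dnow.pavgusb(x86_mmx %a, x86_mmx %b) nounwind readnone
   ret x86_mmx %2
 }
 declare x86_mmx @llvm.x86.3dnow.pavgusb(x86_mmx, x86_mmx) nounwind readnone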
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+adx < %s | FileCheck %s --check-prefix=CHECK --check-prefix=ADX
 ; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=-adx < %s | FileCheck %s --check-prefix=CHECK --check-prefix=NOADX

 ; relevant registers and check that the reload is correctly folded into the instruction.

 define i8 @stack_fold_addcarry_u32(i8 %a0, i32 %a1, i32 %a2, i8* %a3) {
-;CHECK-LABEL: stack_fold_addcarry_u32
-;CHECK: adcl {{-?[0-9]*}}(%rsp), %{{.*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_addcarry_u32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 56
+; CHECK-NEXT: .cfi_offset %rbx, -56
+; CHECK-NEXT: .cfi_offset %r12, -48
+; CHECK-NEXT: .cfi_offset %r13, -40
+; CHECK-NEXT: .cfi_offset %r14, -32
+; CHECK-NEXT: .cfi_offset %r15, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT: addb $-1, %al
+; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-NEXT: adcl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload
+; CHECK-NEXT: setb %al
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; CHECK-NEXT: movl %edx, (%rcx)
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
   %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
   %2 = call { i8, i32 } @llvm.x86.addcarry.32(i8 %a0, i32 %a1, i32 %a2)
   %3 = extractvalue { i8, i32 } %2, 1
 }

 define i8 @stack_fold_addcarry_u64(i8 %a0, i64 %a1, i64 %a2, i8* %a3) {
-;CHECK-LABEL: stack_fold_addcarry_u64
-;CHECK: adcq {{-?[0-9]*}}(%rsp), %{{.*}} {{.*#+}} 8-byte Folded Reload
+; CHECK-LABEL: stack_fold_addcarry_u64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 56
+; CHECK-NEXT: .cfi_offset %rbx, -56
+; CHECK-NEXT: .cfi_offset %r12, -48
+; CHECK-NEXT: .cfi_offset %r13, -40
+; CHECK-NEXT: .cfi_offset %r14, -32
+; CHECK-NEXT: .cfi_offset %r15, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT: addb $-1, %al
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; CHECK-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
+; CHECK-NEXT: setb %al
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; CHECK-NEXT: movq %rdx, (%rcx)
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
   %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
   %2 = call { i8, i64 } @llvm.x86.addcarry.64(i8 %a0, i64 %a1, i64 %a2)
   %3 = extractvalue { i8, i64 } %2, 1
 }

 define i8 @stack_fold_addcarryx_u32(i8 %a0, i32 %a1, i32 %a2, i8* %a3) {
-;CHECK-LABEL: stack_fold_addcarryx_u32
-;CHECK: adcl {{-?[0-9]*}}(%rsp), %{{.*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_addcarryx_u32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 56
+; CHECK-NEXT: .cfi_offset %rbx, -56
+; CHECK-NEXT: .cfi_offset %r12, -48
+; CHECK-NEXT: .cfi_offset %r13, -40
+; CHECK-NEXT: .cfi_offset %r14, -32
+; CHECK-NEXT: .cfi_offset %r15, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT: addb $-1, %al
+; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-NEXT: adcl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload
+; CHECK-NEXT: setb %al
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; CHECK-NEXT: movl %edx, (%rcx)
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
   %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
   %2 = call { i8, i32 } @llvm.x86.addcarry.32(i8 %a0, i32 %a1, i32 %a2)
   %3 = extractvalue { i8, i32 } %2, 1
 }

 define i8 @stack_fold_addcarryx_u64(i8 %a0, i64 %a1, i64 %a2, i8* %a3) {
-;CHECK-LABEL: stack_fold_addcarryx_u64
-;CHECK: adcq {{-?[0-9]*}}(%rsp), %{{.*}} {{.*#+}} 8-byte Folded Reload
+; CHECK-LABEL: stack_fold_addcarryx_u64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 56
+; CHECK-NEXT: .cfi_offset %rbx, -56
+; CHECK-NEXT: .cfi_offset %r12, -48
+; CHECK-NEXT: .cfi_offset %r13, -40
+; CHECK-NEXT: .cfi_offset %r14, -32
+; CHECK-NEXT: .cfi_offset %r15, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT: addb $-1, %al
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; CHECK-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
+; CHECK-NEXT: setb %al
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; CHECK-NEXT: movq %rdx, (%rcx)
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
   %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
   %2 = call { i8, i64 } @llvm.x86.addcarry.64(i8 %a0, i64 %a1, i64 %a2)
   %3 = extractvalue { i8, i64 } %2, 1
 }

 define i8 @stack_fold_subborrow_u32(i8 %a0, i32 %a1, i32 %a2, i8* %a3) {
-;CHECK-LABEL: stack_fold_subborrow_u32
-;CHECK: sbbl {{-?[0-9]*}}(%rsp), %{{.*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_subborrow_u32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 56
+; CHECK-NEXT: .cfi_offset %rbx, -56
+; CHECK-NEXT: .cfi_offset %r12, -48
+; CHECK-NEXT: .cfi_offset %r13, -40
+; CHECK-NEXT: .cfi_offset %r14, -32
+; CHECK-NEXT: .cfi_offset %r15, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT: addb $-1, %al
+; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-NEXT: sbbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload
+; CHECK-NEXT: setb %al
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; CHECK-NEXT: movl %edx, (%rcx)
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
   %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
   %2 = call { i8, i32 } @llvm.x86.subborrow.32(i8 %a0, i32 %a1, i32 %a2)
   %3 = extractvalue { i8, i32 } %2, 1
 }

 define i8 @stack_fold_subborrow_u64(i8 %a0, i64 %a1, i64 %a2, i8* %a3) {
-;CHECK-LABEL: stack_fold_subborrow_u64
-;CHECK: sbbq {{-?[0-9]*}}(%rsp), %{{.*}} {{.*#+}} 8-byte Folded Reload
+; CHECK-LABEL: stack_fold_subborrow_u64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 56
+; CHECK-NEXT: .cfi_offset %rbx, -56
+; CHECK-NEXT: .cfi_offset %r12, -48
+; CHECK-NEXT: .cfi_offset %r13, -40
+; CHECK-NEXT: .cfi_offset %r14, -32
+; CHECK-NEXT: .cfi_offset %r15, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT: addb $-1, %al
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; CHECK-NEXT: sbbq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
+; CHECK-NEXT: setb %al
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; CHECK-NEXT: movq %rdx, (%rcx)
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
   %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
   %2 = call { i8, i64 } @llvm.x86.subborrow.64(i8 %a0, i64 %a1, i64 %a2)
   %3 = extractvalue { i8, i64 } %2, 1
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1 ; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512bf16,+avx512vl < %s | FileCheck %s
12
23 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
89 ; relevant registers and check that the reload is correctly folded into the instruction.
910
1011 define <32 x i16> @stack_fold_cvtne2ps2bf16(<16 x float> %a0, <16 x float> %a1) {
11 ;CHECK-LABEL: stack_fold_cvtne2ps2bf16:
12 ;CHECK: vcvtne2ps2bf16 {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
12 ; CHECK-LABEL: stack_fold_cvtne2ps2bf16:
13 ; CHECK: # %bb.0:
14 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15 ; CHECK-NEXT: #APP
16 ; CHECK-NEXT: nop
17 ; CHECK-NEXT: #NO_APP
18 ; CHECK-NEXT: vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
19 ; CHECK-NEXT: retq
1320 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1421 %2 = call <32 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %a0, <16 x float> %a1)
1522 ret <32 x i16> %2
1825
1926 define <32 x i16> @stack_fold_cvtne2ps2bf16_mask(<16 x float> %a0, <16 x float> %a1, <32 x i16>* %passthru, i32 %U) {
2027 ; CHECK-LABEL: stack_fold_cvtne2ps2bf16_mask:
21 ; CHECK: vcvtne2ps2bf16 {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
28 ; CHECK: # %bb.0:
29 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
30 ; CHECK-NEXT: #APP
31 ; CHECK-NEXT: nop
32 ; CHECK-NEXT: #NO_APP
33 ; CHECK-NEXT: kmovd %esi, %k1
34 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
35 ; CHECK-NEXT: vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
36 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
37 ; CHECK-NEXT: retq
2238 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2339 %2 = call <32 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %a0, <16 x float> %a1)
2440 %3 = bitcast i32 %U to <32 x i1>
3046
3147 define <32 x i16> @stack_fold_cvtne2ps2bf16_maskz(<16 x float> %a0, <16 x float> %a1, i32 %U) {
3248 ; CHECK-LABEL: stack_fold_cvtne2ps2bf16_maskz:
33 ; CHECK: vcvtne2ps2bf16 {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
49 ; CHECK: # %bb.0:
50 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
51 ; CHECK-NEXT: #APP
52 ; CHECK-NEXT: nop
53 ; CHECK-NEXT: #NO_APP
54 ; CHECK-NEXT: kmovd %edi, %k1
55 ; CHECK-NEXT: vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
56 ; CHECK-NEXT: retq
3457 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3558 %2 = call <32 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %a0, <16 x float> %a1)
3659 %3 = bitcast i32 %U to <32 x i1>
4063
4164 define <16 x i16> @stack_fold_cvtneps2bf16(<16 x float> %a0) {
4265 ; CHECK-LABEL: stack_fold_cvtneps2bf16:
43 ; CHECK: vcvtneps2bf16 {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
66 ; CHECK: # %bb.0:
67 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
68 ; CHECK-NEXT: #APP
69 ; CHECK-NEXT: nop
70 ; CHECK-NEXT: #NO_APP
71 ; CHECK-NEXT: vcvtneps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 64-byte Folded Reload
72 ; CHECK-NEXT: retq
4473 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4574 %2 = tail call <16 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %a0)
4675 ret <16 x i16> %2
4978
5079 define <16 x i16> @stack_fold_cvtneps2bf16_mask(<16 x float> %a0, <16 x i16>* %passthru, i16 %U) {
5180 ; CHECK-LABEL: stack_fold_cvtneps2bf16_mask:
52 ; CHECK: vcvtneps2bf16 {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
81 ; CHECK: # %bb.0:
82 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
83 ; CHECK-NEXT: #APP
84 ; CHECK-NEXT: nop
85 ; CHECK-NEXT: #NO_APP
86 ; CHECK-NEXT: kmovd %esi, %k1
87 ; CHECK-NEXT: vmovaps (%rdi), %ymm1
88 ; CHECK-NEXT: vcvtneps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 {%k1} # 64-byte Folded Reload
89 ; CHECK-NEXT: vmovaps %ymm1, %ymm0
90 ; CHECK-NEXT: retq
5391 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5492 %2 = tail call <16 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %a0)
5593 %3 = bitcast i16 %U to <16 x i1>
6199
62100 define <16 x i16> @stack_fold_cvtneps2bf16_maskz(<16 x float> %a0, i16 %U) {
63101 ; CHECK-LABEL: stack_fold_cvtneps2bf16_maskz:
64 ; CHECK: vcvtneps2bf16 {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
102 ; CHECK: # %bb.0:
103 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
104 ; CHECK-NEXT: #APP
105 ; CHECK-NEXT: nop
106 ; CHECK-NEXT: #NO_APP
107 ; CHECK-NEXT: kmovd %edi, %k1
108 ; CHECK-NEXT: vcvtneps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 64-byte Folded Reload
109 ; CHECK-NEXT: retq
65110 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
66111 %2 = tail call <16 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %a0)
67112 %3 = bitcast i16 %U to <16 x i1>
71116
72117 define <16 x float> @stack_fold_vdpbf16ps(<16 x float> %a0, <16 x i32> %a1, <16 x i32> %a2) {
73118 ; CHECK-LABEL: stack_fold_vdpbf16ps:
74 ; CHECK: vdpbf16ps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
119 ; CHECK: # %bb.0:
120 ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
121 ; CHECK-NEXT: #APP
122 ; CHECK-NEXT: nop
123 ; CHECK-NEXT: #NO_APP
124 ; CHECK-NEXT: vdpbf16ps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
125 ; CHECK-NEXT: retq
75126 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
76127 %2 = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> %a0, <16 x i32> %a1, <16 x i32> %a2)
77128 ret <16 x float> %2
80131
81132 define <16 x float> @stack_fold_vdpbf16ps_mask(<16 x float>* %a0, <16 x i32> %a1, <16 x i32> %a2, <16 x float>* %passthru, i16 %U) {
82133 ; CHECK-LABEL: stack_fold_vdpbf16ps_mask:
83 ; CHECK: vdpbf16ps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
134 ; CHECK: # %bb.0:
135 ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
136 ; CHECK-NEXT: #APP
137 ; CHECK-NEXT: nop
138 ; CHECK-NEXT: #NO_APP
139 ; CHECK-NEXT: vmovaps (%rdi), %zmm2
140 ; CHECK-NEXT: kmovd %edx, %k1
141 ; CHECK-NEXT: vdpbf16ps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
142 ; CHECK-NEXT: vmovaps %zmm2, %zmm0
143 ; CHECK-NEXT: retq
84144 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
85145 ; load needed to keep the operation from being scheduled above the asm block
86146 %2 = load <16 x float>, <16 x float>* %a0
92152
93153 define <16 x float> @stack_fold_vdpbf16ps_maskz(<16 x float> %a0, <16 x i32> %a1, <16 x i32> %a2, i16* %U) {
94154 ; CHECK-LABEL: stack_fold_vdpbf16ps_maskz:
95 ; CHECK: vdpbf16ps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
155 ; CHECK: # %bb.0:
156 ; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
157 ; CHECK-NEXT: #APP
158 ; CHECK-NEXT: nop
159 ; CHECK-NEXT: #NO_APP
160 ; CHECK-NEXT: kmovw (%rdi), %k1
161 ; CHECK-NEXT: vdpbf16ps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
162 ; CHECK-NEXT: retq
96163 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
97164 %2 = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> %a0, <16 x i32> %a1, <16 x i32> %a2)
98165 %3 = load i16, i16* %U
104171
105172
106173 define <16 x i16> @stack_fold_cvtne2ps2bf16_ymm(<8 x float> %a0, <8 x float> %a1) {
107 ;CHECK-LABEL: stack_fold_cvtne2ps2bf16_ymm:
108 ;CHECK: vcvtne2ps2bf16 {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
174 ; CHECK-LABEL: stack_fold_cvtne2ps2bf16_ymm:
175 ; CHECK: # %bb.0:
176 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
177 ; CHECK-NEXT: #APP
178 ; CHECK-NEXT: nop
179 ; CHECK-NEXT: #NO_APP
180 ; CHECK-NEXT: vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
181 ; CHECK-NEXT: retq
109182 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
110183 %2 = call <16 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %a0, <8 x float> %a1)
111184 ret <16 x i16> %2
114187
115188 define <16 x i16> @stack_fold_cvtne2ps2bf16_mask_ymm(<8 x float> %a0, <8 x float> %a1, <16 x i16>* %passthru, i16 %U) {
116189 ; CHECK-LABEL: stack_fold_cvtne2ps2bf16_mask_ymm:
117 ; CHECK: vcvtne2ps2bf16 {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
190 ; CHECK: # %bb.0:
191 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
192 ; CHECK-NEXT: #APP
193 ; CHECK-NEXT: nop
194 ; CHECK-NEXT: #NO_APP
195 ; CHECK-NEXT: kmovd %esi, %k1
196 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
197 ; CHECK-NEXT: vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
198 ; CHECK-NEXT: vmovdqa %ymm2, %ymm0
199 ; CHECK-NEXT: retq
118200 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
119201 %2 = call <16 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %a0, <8 x float> %a1)
120202 %3 = bitcast i16 %U to <16 x i1>
126208
127209 define <16 x i16> @stack_fold_cvtne2ps2bf16_maskz_ymm(<8 x float> %a0, <8 x float> %a1, i16 %U) {
128210 ; CHECK-LABEL: stack_fold_cvtne2ps2bf16_maskz_ymm:
129 ; CHECK: vcvtne2ps2bf16 {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
211 ; CHECK: # %bb.0:
212 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
213 ; CHECK-NEXT: #APP
214 ; CHECK-NEXT: nop
215 ; CHECK-NEXT: #NO_APP
216 ; CHECK-NEXT: kmovd %edi, %k1
217 ; CHECK-NEXT: vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload
218 ; CHECK-NEXT: retq
130219 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
131220 %2 = call <16 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %a0, <8 x float> %a1)
132221 %3 = bitcast i16 %U to <16 x i1>
136225
137226 define <8 x i16> @stack_fold_cvtneps2bf16_ymm(<8 x float> %a0) {
138227 ; CHECK-LABEL: stack_fold_cvtneps2bf16_ymm:
139 ; CHECK: vcvtneps2bf16y {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
228 ; CHECK: # %bb.0:
229 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
230 ; CHECK-NEXT: #APP
231 ; CHECK-NEXT: nop
232 ; CHECK-NEXT: #NO_APP
233 ; CHECK-NEXT: vcvtneps2bf16y {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Folded Reload
234 ; CHECK-NEXT: vzeroupper
235 ; CHECK-NEXT: retq
140236 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
141237 %2 = tail call <8 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %a0)
142238 ret <8 x i16> %2
145241
146242 define <8 x i16> @stack_fold_cvtneps2bf16_mask_ymm(<8 x float> %a0, <8 x i16>* %passthru, i8 %U) {
147243 ; CHECK-LABEL: stack_fold_cvtneps2bf16_mask_ymm:
148 ; CHECK: vcvtneps2bf16y {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
244 ; CHECK: # %bb.0:
245 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
246 ; CHECK-NEXT: #APP
247 ; CHECK-NEXT: nop
248 ; CHECK-NEXT: #NO_APP
249 ; CHECK-NEXT: kmovd %esi, %k1
250 ; CHECK-NEXT: vmovaps (%rdi), %xmm1
251 ; CHECK-NEXT: vcvtneps2bf16y {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 {%k1} # 32-byte Folded Reload
252 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
253 ; CHECK-NEXT: vzeroupper
254 ; CHECK-NEXT: retq
149255 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
150256 %2 = tail call <8 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %a0)
151257 %3 = bitcast i8 %U to <8 x i1>
157263
158264 define <8 x i16> @stack_fold_cvtneps2bf16_maskz_ymm(<8 x float> %a0, i8 %U) {
159265 ; CHECK-LABEL: stack_fold_cvtneps2bf16_maskz_ymm:
160 ; CHECK: vcvtneps2bf16y {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
266 ; CHECK: # %bb.0:
267 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
268 ; CHECK-NEXT: #APP
269 ; CHECK-NEXT: nop
270 ; CHECK-NEXT: #NO_APP
271 ; CHECK-NEXT: kmovd %edi, %k1
272 ; CHECK-NEXT: vcvtneps2bf16y {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 32-byte Folded Reload
273 ; CHECK-NEXT: vzeroupper
274 ; CHECK-NEXT: retq
161275 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
162276 %2 = tail call <8 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %a0)
163277 %3 = bitcast i8 %U to <8 x i1>
167281
168282 define <8 x float> @stack_fold_vdpbf16ps_ymm(<8 x float> %a0, <8 x i32> %a1, <8 x i32> %a2) {
169283 ; CHECK-LABEL: stack_fold_vdpbf16ps_ymm:
170 ; CHECK: vdpbf16ps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
284 ; CHECK: # %bb.0:
285 ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
286 ; CHECK-NEXT: #APP
287 ; CHECK-NEXT: nop
288 ; CHECK-NEXT: #NO_APP
289 ; CHECK-NEXT: vdpbf16ps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
290 ; CHECK-NEXT: retq
171291 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
172292 %2 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %a0, <8 x i32> %a1, <8 x i32> %a2)
173293 ret <8 x float> %2
176296
177297 define <8 x float> @stack_fold_vdpbf16ps_mask_ymm(<8 x float>* %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x float>* %passthru, i8 %U) {
178298 ; CHECK-LABEL: stack_fold_vdpbf16ps_mask_ymm:
179 ; CHECK: vdpbf16ps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
299 ; CHECK: # %bb.0:
300 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
301 ; CHECK-NEXT: #APP
302 ; CHECK-NEXT: nop
303 ; CHECK-NEXT: #NO_APP
304 ; CHECK-NEXT: vmovaps (%rdi), %ymm2
305 ; CHECK-NEXT: kmovd %edx, %k1
306 ; CHECK-NEXT: vdpbf16ps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
307 ; CHECK-NEXT: vmovaps %ymm2, %ymm0
308 ; CHECK-NEXT: retq
180309 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
181310 ; load needed to keep the operation from being scheduled above the asm block
182311 %2 = load <8 x float>, <8 x float>* %a0
188317
189318 define <8 x float> @stack_fold_vdpbf16ps_maskz_ymm(<8 x float> %a0, <8 x i32> %a1, <8 x i32> %a2, i8* %U) {
190319 ; CHECK-LABEL: stack_fold_vdpbf16ps_maskz_ymm:
191 ; CHECK: vdpbf16ps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
320 ; CHECK: # %bb.0:
321 ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
322 ; CHECK-NEXT: #APP
323 ; CHECK-NEXT: nop
324 ; CHECK-NEXT: #NO_APP
325 ; CHECK-NEXT: movzbl (%rdi), %eax
326 ; CHECK-NEXT: kmovd %eax, %k1
327 ; CHECK-NEXT: vdpbf16ps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
328 ; CHECK-NEXT: retq
192329 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
193330 %2 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %a0, <8 x i32> %a1, <8 x i32> %a2)
194331 %3 = load i8, i8* %U
201338
202339
203340 define <8 x i16> @stack_fold_cvtne2ps2bf16_xmm(<4 x float> %a0, <4 x float> %a1) {
204 ;CHECK-LABEL: stack_fold_cvtne2ps2bf16_xmm:
205 ;CHECK: vcvtne2ps2bf16 {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
341 ; CHECK-LABEL: stack_fold_cvtne2ps2bf16_xmm:
342 ; CHECK: # %bb.0:
343 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
344 ; CHECK-NEXT: #APP
345 ; CHECK-NEXT: nop
346 ; CHECK-NEXT: #NO_APP
347 ; CHECK-NEXT: vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
348 ; CHECK-NEXT: retq
206349 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
207350 %2 = call <8 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %a0, <4 x float> %a1)
208351 ret <8 x i16> %2
211354
212355 define <8 x i16> @stack_fold_cvtne2ps2bf16_mask_xmm(<4 x float> %a0, <4 x float> %a1, <8 x i16>* %passthru, i8 %U) {
213356 ; CHECK-LABEL: stack_fold_cvtne2ps2bf16_mask_xmm:
214 ; CHECK: vcvtne2ps2bf16 {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
357 ; CHECK: # %bb.0:
358 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
359 ; CHECK-NEXT: #APP
360 ; CHECK-NEXT: nop
361 ; CHECK-NEXT: #NO_APP
362 ; CHECK-NEXT: kmovd %esi, %k1
363 ; CHECK-NEXT: vmovdqa (%rdi), %xmm2
364 ; CHECK-NEXT: vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
365 ; CHECK-NEXT: vmovdqa %xmm2, %xmm0
366 ; CHECK-NEXT: retq
215367 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
216368 %2 = call <8 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %a0, <4 x float> %a1)
217369 %3 = bitcast i8 %U to <8 x i1>
223375
224376 define <8 x i16> @stack_fold_cvtne2ps2bf16_maskz_xmm(<4 x float> %a0, <4 x float> %a1, i8 %U) {
225377 ; CHECK-LABEL: stack_fold_cvtne2ps2bf16_maskz_xmm:
226 ; CHECK: vcvtne2ps2bf16 {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
378 ; CHECK: # %bb.0:
379 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
380 ; CHECK-NEXT: #APP
381 ; CHECK-NEXT: nop
382 ; CHECK-NEXT: #NO_APP
383 ; CHECK-NEXT: kmovd %edi, %k1
384 ; CHECK-NEXT: vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
385 ; CHECK-NEXT: retq
227386 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
228387 %2 = call <8 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %a0, <4 x float> %a1)
229388 %3 = bitcast i8 %U to <8 x i1>
233392
234393 define <8 x i16> @stack_fold_cvtneps2bf16_xmm(<4 x float> %a0) {
235394 ; CHECK-LABEL: stack_fold_cvtneps2bf16_xmm:
236 ; CHECK: vcvtneps2bf16x {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
395 ; CHECK: # %bb.0:
396 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
397 ; CHECK-NEXT: #APP
398 ; CHECK-NEXT: nop
399 ; CHECK-NEXT: #NO_APP
400 ; CHECK-NEXT: vcvtneps2bf16x {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
401 ; CHECK-NEXT: retq
237402 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
238403 %2 = tail call <8 x i16> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %a0, <8 x i16> undef, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
239404 ret <8 x i16> %2
242407
243408 define <8 x i16> @stack_fold_cvtneps2bf16_mask_xmm(<4 x float> %a0, <8 x i16>* %passthru, i8 %U) {
244409 ; CHECK-LABEL: stack_fold_cvtneps2bf16_mask_xmm:
245 ; CHECK: vcvtneps2bf16x {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
410 ; CHECK: # %bb.0:
411 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
412 ; CHECK-NEXT: #APP
413 ; CHECK-NEXT: nop
414 ; CHECK-NEXT: #NO_APP
415 ; CHECK-NEXT: vmovaps (%rdi), %xmm1
416 ; CHECK-NEXT: kmovd %esi, %k1
417 ; CHECK-NEXT: vcvtneps2bf16x {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 {%k1} # 16-byte Folded Reload
418 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
419 ; CHECK-NEXT: retq
246420 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
247421 %2 = load <8 x i16>, <8 x i16>* %passthru
248422 %3 = bitcast i8 %U to <8 x i1>
253427
254428 define <8 x i16> @stack_fold_cvtneps2bf16_maskz_xmm(<4 x float> %a0, i8 %U) {
255429 ; CHECK-LABEL: stack_fold_cvtneps2bf16_maskz_xmm:
256 ; CHECK: vcvtneps2bf16x {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
430 ; CHECK: # %bb.0:
431 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
432 ; CHECK-NEXT: #APP
433 ; CHECK-NEXT: nop
434 ; CHECK-NEXT: #NO_APP
435 ; CHECK-NEXT: kmovd %edi, %k1
436 ; CHECK-NEXT: vcvtneps2bf16x {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 16-byte Folded Reload
437 ; CHECK-NEXT: retq
257438 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
258439 %2 = bitcast i8 %U to <8 x i1>
259440 %3 = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
263444
264445 define <4 x float> @stack_fold_vdpbf16ps_xmm(<4 x float> %a0, <4 x i32> %a1, <4 x i32> %a2) {
265446 ; CHECK-LABEL: stack_fold_vdpbf16ps_xmm:
266 ; CHECK: vdpbf16ps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
447 ; CHECK: # %bb.0:
448 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
449 ; CHECK-NEXT: #APP
450 ; CHECK-NEXT: nop
451 ; CHECK-NEXT: #NO_APP
452 ; CHECK-NEXT: vdpbf16ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
453 ; CHECK-NEXT: retq
267454 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
268455 %2 = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %a0, <4 x i32> %a1, <4 x i32> %a2)
269456 ret <4 x float> %2
272459
273460 define <4 x float> @stack_fold_vdpbf16ps_mask_xmm(<4 x float>* %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x float>* %passthru, i8 %U) {
274461 ; CHECK-LABEL: stack_fold_vdpbf16ps_mask_xmm:
275 ; CHECK: vdpbf16ps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
462 ; CHECK: # %bb.0:
463 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
464 ; CHECK-NEXT: #APP
465 ; CHECK-NEXT: nop
466 ; CHECK-NEXT: #NO_APP
467 ; CHECK-NEXT: vmovaps (%rdi), %xmm2
468 ; CHECK-NEXT: kmovd %edx, %k1
469 ; CHECK-NEXT: vdpbf16ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
470 ; CHECK-NEXT: vmovaps %xmm2, %xmm0
471 ; CHECK-NEXT: retq
276472 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
277473 ; load needed to keep the operation from being scheduled above the asm block
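; (the dpbf16ps intrinsic has no memory dependence of its own, so if %a0 were
; passed by value nothing would order it after the side-effecting asm block;
; loading %a0 through a pointer after the asm supplies that ordering and keeps
; the fold testable)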
278474 %2 = load <4 x float>, <4 x float>* %a0
285481
286482 define <4 x float> @stack_fold_vdpbf16ps_maskz_xmm(<4 x float> %a0, <4 x i32> %a1, <4 x i32> %a2, i8* %U) {
287483 ; CHECK-LABEL: stack_fold_vdpbf16ps_maskz_xmm:
288 ; CHECK: vdpbf16ps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
484 ; CHECK: # %bb.0:
485 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
486 ; CHECK-NEXT: #APP
487 ; CHECK-NEXT: nop
488 ; CHECK-NEXT: #NO_APP
489 ; CHECK-NEXT: movzbl (%rdi), %eax
490 ; CHECK-NEXT: kmovd %eax, %k1
491 ; CHECK-NEXT: vdpbf16ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
492 ; CHECK-NEXT: retq
289493 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
290494 %2 = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %a0, <4 x i32> %a1, <4 x i32> %a2)
291495 %3 = load i8, i8* %U
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1 ; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vp2intersect,+avx512vl < %s | FileCheck %s
12
23 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
45
56 define void @stack_fold_vp2intersectd(<16 x i32>* %a, <16 x i32> %b, <16 x i1>* nocapture %m0, <16 x i1>* nocapture %m1) {
67 ; CHECK-LABEL: stack_fold_vp2intersectd:
7 ; CHECK: vp2intersectd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{.*#+}} 64-byte Folded Reload
8 ; CHECK: # %bb.0:
9 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10 ; CHECK-NEXT: #APP
11 ; CHECK-NEXT: nop
12 ; CHECK-NEXT: #NO_APP
13 ; CHECK-NEXT: vmovaps (%rdi), %zmm0
14 ; CHECK-NEXT: vp2intersectd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
15 ; CHECK-NEXT: kmovw %k0, (%rsi)
16 ; CHECK-NEXT: kmovw %k1, (%rdx)
17 ; CHECK-NEXT: vzeroupper
18 ; CHECK-NEXT: retq
819 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
920 %2 = load <16 x i32>, <16 x i32>* %a
1021 %3 = tail call { <16 x i1>, <16 x i1> } @llvm.x86.avx512.vp2intersect.d.512(<16 x i32> %2, <16 x i32> %b)
1829
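; (vp2intersect instructions define an even/odd pair of mask registers at
; once; the write to %k0 above implicitly defines %k1 as well, which is why
; the checks can read %k1 without a visible def)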
1930 define void @stack_fold_vp2intersectq(<8 x i64>* %a, <8 x i64> %b, <8 x i1>* nocapture %m0, <8 x i1>* nocapture %m1) {
2031 ; CHECK-LABEL: stack_fold_vp2intersectq:
21 ; CHECK: vp2intersectq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{.*#+}} 64-byte Folded Reload
32 ; CHECK: # %bb.0:
33 ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
34 ; CHECK-NEXT: #APP
35 ; CHECK-NEXT: nop
36 ; CHECK-NEXT: #NO_APP
37 ; CHECK-NEXT: vmovaps (%rdi), %zmm0
38 ; CHECK-NEXT: vp2intersectq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
39 ; CHECK-NEXT: kmovw %k1, %eax
40 ; CHECK-NEXT: kmovw %k0, %ecx
41 ; CHECK-NEXT: movb %cl, (%rsi)
42 ; CHECK-NEXT: movb %al, (%rdx)
43 ; CHECK-NEXT: vzeroupper
44 ; CHECK-NEXT: retq
2245 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2346 %2 = load <8 x i64>, <8 x i64>* %a
2447 %3 = tail call { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.q.512(<8 x i64> %2, <8 x i64> %b)
3255
3356 define void @stack_fold_vp2intersectd_256(<8 x i32>* %a, <8 x i32> %b, <8 x i1>* nocapture %m0, <8 x i1>* nocapture %m1) {
3457 ; CHECK-LABEL: stack_fold_vp2intersectd_256:
35 ; CHECK: vp2intersectd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%k[0-7]}} {{.*#+}} 32-byte Folded Reload
58 ; CHECK: # %bb.0:
59 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
60 ; CHECK-NEXT: #APP
61 ; CHECK-NEXT: nop
62 ; CHECK-NEXT: #NO_APP
63 ; CHECK-NEXT: vmovaps (%rdi), %ymm0
64 ; CHECK-NEXT: vp2intersectd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %k0 # 32-byte Folded Reload
65 ; CHECK-NEXT: kmovw %k1, %eax
66 ; CHECK-NEXT: kmovw %k0, %ecx
67 ; CHECK-NEXT: movb %cl, (%rsi)
68 ; CHECK-NEXT: movb %al, (%rdx)
69 ; CHECK-NEXT: vzeroupper
70 ; CHECK-NEXT: retq
3671 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3772 %2 = load <8 x i32>, <8 x i32>* %a
3873 %3 = tail call { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.d.256(<8 x i32> %2, <8 x i32> %b)
4681
4782 define void @stack_fold_vp2intersectq_256(<4 x i64>* %a, <4 x i64> %b, <4 x i1>* nocapture %m0, <4 x i1>* nocapture %m1) {
4883 ; CHECK-LABEL: stack_fold_vp2intersectq_256:
49 ; CHECK: vp2intersectq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%k[0-7]}} {{.*#+}} 32-byte Folded Reload
84 ; CHECK: # %bb.0:
85 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
86 ; CHECK-NEXT: #APP
87 ; CHECK-NEXT: nop
88 ; CHECK-NEXT: #NO_APP
89 ; CHECK-NEXT: vmovaps (%rdi), %ymm0
90 ; CHECK-NEXT: vp2intersectq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %k0 # 32-byte Folded Reload
91 ; CHECK-NEXT: kmovw %k1, %eax
92 ; CHECK-NEXT: kmovw %k0, %ecx
93 ; CHECK-NEXT: movb %cl, (%rsi)
94 ; CHECK-NEXT: movb %al, (%rdx)
95 ; CHECK-NEXT: vzeroupper
96 ; CHECK-NEXT: retq
5097 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5198 %2 = load <4 x i64>, <4 x i64>* %a
5299 %3 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.q.256(<4 x i64> %2, <4 x i64> %b)
60107
61108 define void @stack_fold_vp2intersectd_128(<4 x i32>* %a, <4 x i32> %b, <4 x i1>* nocapture %m0, <4 x i1>* nocapture %m1) {
62109 ; CHECK-LABEL: stack_fold_vp2intersectd_128:
63 ; CHECK: vp2intersectd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%k[0-7]}} {{.*#+}} 16-byte Folded Reload
110 ; CHECK: # %bb.0:
111 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
112 ; CHECK-NEXT: #APP
113 ; CHECK-NEXT: nop
114 ; CHECK-NEXT: #NO_APP
115 ; CHECK-NEXT: vmovaps (%rdi), %xmm0
116 ; CHECK-NEXT: vp2intersectd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload
117 ; CHECK-NEXT: kmovw %k1, %eax
118 ; CHECK-NEXT: kmovw %k0, %ecx
119 ; CHECK-NEXT: movb %cl, (%rsi)
120 ; CHECK-NEXT: movb %al, (%rdx)
121 ; CHECK-NEXT: retq
64122 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
65123 %2 = load <4 x i32>, <4 x i32>* %a
66124 %3 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.d.128(<4 x i32> %2, <4 x i32> %b)
74132
75133 define void @stack_fold_vp2intersectq_128(<2 x i64>* %a, <2 x i64> %b, <2 x i1>* nocapture %m0, <2 x i1>* nocapture %m1) {
76134 ; CHECK-LABEL: stack_fold_vp2intersectq_128:
77 ; CHECK: vp2intersectq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%k[0-7]}} {{.*#+}} 16-byte Folded Reload
135 ; CHECK: # %bb.0:
136 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
137 ; CHECK-NEXT: #APP
138 ; CHECK-NEXT: nop
139 ; CHECK-NEXT: #NO_APP
140 ; CHECK-NEXT: vmovaps (%rdi), %xmm0
141 ; CHECK-NEXT: vp2intersectq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload
142 ; CHECK-NEXT: kmovw %k1, %eax
143 ; CHECK-NEXT: kmovw %k0, %ecx
144 ; CHECK-NEXT: movb %cl, (%rsi)
145 ; CHECK-NEXT: movb %al, (%rdx)
146 ; CHECK-NEXT: retq
78147 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
79148 %2 = load <2 x i64>, <2 x i64>* %a
80149 %3 = tail call { <2 x i1>, <2 x i1> } @llvm.x86.avx512.vp2intersect.q.128(<2 x i64> %2, <2 x i64> %b)
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1 ; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+bmi < %s | FileCheck %s
12
23 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
89 ; relevant registers and check that the reload is correctly folded into the instruction.
910
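; A minimal sketch of the pattern every test in these files follows (the
; function below is illustrative only, not part of the diff): the inline asm
; "nop" clobbers every general-purpose register except %rsp, so the arguments
; can only survive the asm block in a stack slot, and FileCheck then verifies
; that llc folds the reload straight into the consuming instruction instead
; of emitting a separate load. Clobbering the callee-saved %rbx, %rbp and
; %r12-%r15 is also what produces the long push/pop and .cfi prologue and
; epilogue sequences in the regenerated checks.
;
;   define i32 @sketch_stack_fold_blsr(i32 %a0) {
;     %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
;     %2 = sub i32 %a0, 1
;     %3 = and i32 %2, %a0  ; selects to BLSR, which accepts a memory source
;     ret i32 %3
;   }
;
; update_llc_test_checks.py then records the expected asm as CHECK-NEXT
; lines, e.g. "blsrl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload".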
1011 define i32 @stack_fold_andn_u32(i32 %a0, i32 %a1) {
11 ;CHECK-LABEL: stack_fold_andn_u32
12 ;CHECK: andnl {{-?[0-9]*}}(%rsp), %eax, %eax {{.*#+}} 4-byte Folded Reload
12 ; CHECK-LABEL: stack_fold_andn_u32:
13 ; CHECK: # %bb.0:
14 ; CHECK-NEXT: pushq %rbp
15 ; CHECK-NEXT: .cfi_def_cfa_offset 16
16 ; CHECK-NEXT: pushq %r15
17 ; CHECK-NEXT: .cfi_def_cfa_offset 24
18 ; CHECK-NEXT: pushq %r14
19 ; CHECK-NEXT: .cfi_def_cfa_offset 32
20 ; CHECK-NEXT: pushq %r13
21 ; CHECK-NEXT: .cfi_def_cfa_offset 40
22 ; CHECK-NEXT: pushq %r12
23 ; CHECK-NEXT: .cfi_def_cfa_offset 48
24 ; CHECK-NEXT: pushq %rbx
25 ; CHECK-NEXT: .cfi_def_cfa_offset 56
26 ; CHECK-NEXT: .cfi_offset %rbx, -56
27 ; CHECK-NEXT: .cfi_offset %r12, -48
28 ; CHECK-NEXT: .cfi_offset %r13, -40
29 ; CHECK-NEXT: .cfi_offset %r14, -32
30 ; CHECK-NEXT: .cfi_offset %r15, -24
31 ; CHECK-NEXT: .cfi_offset %rbp, -16
32 ; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
33 ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
34 ; CHECK-NEXT: #APP
35 ; CHECK-NEXT: nop
36 ; CHECK-NEXT: #NO_APP
37 ; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
38 ; CHECK-NEXT: andnl {{[-0-9]+}}(%r{{[sb]}}p), %eax, %eax # 4-byte Folded Reload
39 ; CHECK-NEXT: popq %rbx
40 ; CHECK-NEXT: .cfi_def_cfa_offset 48
41 ; CHECK-NEXT: popq %r12
42 ; CHECK-NEXT: .cfi_def_cfa_offset 40
43 ; CHECK-NEXT: popq %r13
44 ; CHECK-NEXT: .cfi_def_cfa_offset 32
45 ; CHECK-NEXT: popq %r14
46 ; CHECK-NEXT: .cfi_def_cfa_offset 24
47 ; CHECK-NEXT: popq %r15
48 ; CHECK-NEXT: .cfi_def_cfa_offset 16
49 ; CHECK-NEXT: popq %rbp
50 ; CHECK-NEXT: .cfi_def_cfa_offset 8
51 ; CHECK-NEXT: retq
1352 %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
1453 %2 = xor i32 %a0, -1
1554 %3 = and i32 %a1, %2
1756 }
1857
1958 define i64 @stack_fold_andn_u64(i64 %a0, i64 %a1) {
20 ;CHECK-LABEL: stack_fold_andn_u64
21 ;CHECK: andnq {{-?[0-9]*}}(%rsp), %rax, %rax {{.*#+}} 8-byte Folded Reload
59 ; CHECK-LABEL: stack_fold_andn_u64:
60 ; CHECK: # %bb.0:
61 ; CHECK-NEXT: pushq %rbp
62 ; CHECK-NEXT: .cfi_def_cfa_offset 16
63 ; CHECK-NEXT: pushq %r15
64 ; CHECK-NEXT: .cfi_def_cfa_offset 24
65 ; CHECK-NEXT: pushq %r14
66 ; CHECK-NEXT: .cfi_def_cfa_offset 32
67 ; CHECK-NEXT: pushq %r13
68 ; CHECK-NEXT: .cfi_def_cfa_offset 40
69 ; CHECK-NEXT: pushq %r12
70 ; CHECK-NEXT: .cfi_def_cfa_offset 48
71 ; CHECK-NEXT: pushq %rbx
72 ; CHECK-NEXT: .cfi_def_cfa_offset 56
73 ; CHECK-NEXT: .cfi_offset %rbx, -56
74 ; CHECK-NEXT: .cfi_offset %r12, -48
75 ; CHECK-NEXT: .cfi_offset %r13, -40
76 ; CHECK-NEXT: .cfi_offset %r14, -32
77 ; CHECK-NEXT: .cfi_offset %r15, -24
78 ; CHECK-NEXT: .cfi_offset %rbp, -16
79 ; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
80 ; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
81 ; CHECK-NEXT: #APP
82 ; CHECK-NEXT: nop
83 ; CHECK-NEXT: #NO_APP
84 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
85 ; CHECK-NEXT: andnq {{[-0-9]+}}(%r{{[sb]}}p), %rax, %rax # 8-byte Folded Reload
86 ; CHECK-NEXT: popq %rbx
87 ; CHECK-NEXT: .cfi_def_cfa_offset 48
88 ; CHECK-NEXT: popq %r12
89 ; CHECK-NEXT: .cfi_def_cfa_offset 40
90 ; CHECK-NEXT: popq %r13
91 ; CHECK-NEXT: .cfi_def_cfa_offset 32
92 ; CHECK-NEXT: popq %r14
93 ; CHECK-NEXT: .cfi_def_cfa_offset 24
94 ; CHECK-NEXT: popq %r15
95 ; CHECK-NEXT: .cfi_def_cfa_offset 16
96 ; CHECK-NEXT: popq %rbp
97 ; CHECK-NEXT: .cfi_def_cfa_offset 8
98 ; CHECK-NEXT: retq
2299 %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
23100 %2 = xor i64 %a0, -1
24101 %3 = and i64 %a1, %2
26103 }
27104
28105 define i32 @stack_fold_bextr_u32(i32 %a0, i32 %a1) {
29 ;CHECK-LABEL: stack_fold_bextr_u32
30 ;CHECK: # %bb.0:
31 ;CHECK: bextrl %eax, {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
106 ; CHECK-LABEL: stack_fold_bextr_u32:
107 ; CHECK: # %bb.0:
108 ; CHECK-NEXT: pushq %rbp
109 ; CHECK-NEXT: .cfi_def_cfa_offset 16
110 ; CHECK-NEXT: pushq %r15
111 ; CHECK-NEXT: .cfi_def_cfa_offset 24
112 ; CHECK-NEXT: pushq %r14
113 ; CHECK-NEXT: .cfi_def_cfa_offset 32
114 ; CHECK-NEXT: pushq %r13
115 ; CHECK-NEXT: .cfi_def_cfa_offset 40
116 ; CHECK-NEXT: pushq %r12
117 ; CHECK-NEXT: .cfi_def_cfa_offset 48
118 ; CHECK-NEXT: pushq %rbx
119 ; CHECK-NEXT: .cfi_def_cfa_offset 56
120 ; CHECK-NEXT: .cfi_offset %rbx, -56
121 ; CHECK-NEXT: .cfi_offset %r12, -48
122 ; CHECK-NEXT: .cfi_offset %r13, -40
123 ; CHECK-NEXT: .cfi_offset %r14, -32
124 ; CHECK-NEXT: .cfi_offset %r15, -24
125 ; CHECK-NEXT: .cfi_offset %rbp, -16
126 ; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
127 ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
128 ; CHECK-NEXT: #APP
129 ; CHECK-NEXT: nop
130 ; CHECK-NEXT: #NO_APP
131 ; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
132 ; CHECK-NEXT: bextrl %eax, {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
133 ; CHECK-NEXT: popq %rbx
134 ; CHECK-NEXT: .cfi_def_cfa_offset 48
135 ; CHECK-NEXT: popq %r12
136 ; CHECK-NEXT: .cfi_def_cfa_offset 40
137 ; CHECK-NEXT: popq %r13
138 ; CHECK-NEXT: .cfi_def_cfa_offset 32
139 ; CHECK-NEXT: popq %r14
140 ; CHECK-NEXT: .cfi_def_cfa_offset 24
141 ; CHECK-NEXT: popq %r15
142 ; CHECK-NEXT: .cfi_def_cfa_offset 16
143 ; CHECK-NEXT: popq %rbp
144 ; CHECK-NEXT: .cfi_def_cfa_offset 8
145 ; CHECK-NEXT: retq
32146 %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
33147 %2 = tail call i32 @llvm.x86.bmi.bextr.32(i32 %a0, i32 %a1)
34148 ret i32 %2
36150 declare i32 @llvm.x86.bmi.bextr.32(i32, i32)
37151
38152 define i64 @stack_fold_bextr_u64(i64 %a0, i64 %a1) {
39 ;CHECK-LABEL: stack_fold_bextr_u64
40 ;CHECK: # %bb.0:
41 ;CHECK: bextrq %rax, {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Folded Reload
153 ; CHECK-LABEL: stack_fold_bextr_u64:
154 ; CHECK: # %bb.0:
155 ; CHECK-NEXT: pushq %rbp
156 ; CHECK-NEXT: .cfi_def_cfa_offset 16
157 ; CHECK-NEXT: pushq %r15
158 ; CHECK-NEXT: .cfi_def_cfa_offset 24
159 ; CHECK-NEXT: pushq %r14
160 ; CHECK-NEXT: .cfi_def_cfa_offset 32
161 ; CHECK-NEXT: pushq %r13
162 ; CHECK-NEXT: .cfi_def_cfa_offset 40
163 ; CHECK-NEXT: pushq %r12
164 ; CHECK-NEXT: .cfi_def_cfa_offset 48
165 ; CHECK-NEXT: pushq %rbx
166 ; CHECK-NEXT: .cfi_def_cfa_offset 56
167 ; CHECK-NEXT: .cfi_offset %rbx, -56
168 ; CHECK-NEXT: .cfi_offset %r12, -48
169 ; CHECK-NEXT: .cfi_offset %r13, -40
170 ; CHECK-NEXT: .cfi_offset %r14, -32
171 ; CHECK-NEXT: .cfi_offset %r15, -24
172 ; CHECK-NEXT: .cfi_offset %rbp, -16
173 ; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
174 ; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
175 ; CHECK-NEXT: #APP
176 ; CHECK-NEXT: nop
177 ; CHECK-NEXT: #NO_APP
178 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
179 ; CHECK-NEXT: bextrq %rax, {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
180 ; CHECK-NEXT: popq %rbx
181 ; CHECK-NEXT: .cfi_def_cfa_offset 48
182 ; CHECK-NEXT: popq %r12
183 ; CHECK-NEXT: .cfi_def_cfa_offset 40
184 ; CHECK-NEXT: popq %r13
185 ; CHECK-NEXT: .cfi_def_cfa_offset 32
186 ; CHECK-NEXT: popq %r14
187 ; CHECK-NEXT: .cfi_def_cfa_offset 24
188 ; CHECK-NEXT: popq %r15
189 ; CHECK-NEXT: .cfi_def_cfa_offset 16
190 ; CHECK-NEXT: popq %rbp
191 ; CHECK-NEXT: .cfi_def_cfa_offset 8
192 ; CHECK-NEXT: retq
42193 %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
43194 %2 = tail call i64 @llvm.x86.bmi.bextr.64(i64 %a0, i64 %a1)
44195 ret i64 %2
46197 declare i64 @llvm.x86.bmi.bextr.64(i64, i64)
47198
48199 define i32 @stack_fold_blsi_u32(i32 %a0) {
49 ;CHECK-LABEL: stack_fold_blsi_u32
50 ;CHECK: blsil {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
200 ; CHECK-LABEL: stack_fold_blsi_u32:
201 ; CHECK: # %bb.0:
202 ; CHECK-NEXT: pushq %rbp
203 ; CHECK-NEXT: .cfi_def_cfa_offset 16
204 ; CHECK-NEXT: pushq %r15
205 ; CHECK-NEXT: .cfi_def_cfa_offset 24
206 ; CHECK-NEXT: pushq %r14
207 ; CHECK-NEXT: .cfi_def_cfa_offset 32
208 ; CHECK-NEXT: pushq %r13
209 ; CHECK-NEXT: .cfi_def_cfa_offset 40
210 ; CHECK-NEXT: pushq %r12
211 ; CHECK-NEXT: .cfi_def_cfa_offset 48
212 ; CHECK-NEXT: pushq %rbx
213 ; CHECK-NEXT: .cfi_def_cfa_offset 56
214 ; CHECK-NEXT: .cfi_offset %rbx, -56
215 ; CHECK-NEXT: .cfi_offset %r12, -48
216 ; CHECK-NEXT: .cfi_offset %r13, -40
217 ; CHECK-NEXT: .cfi_offset %r14, -32
218 ; CHECK-NEXT: .cfi_offset %r15, -24
219 ; CHECK-NEXT: .cfi_offset %rbp, -16
220 ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
221 ; CHECK-NEXT: #APP
222 ; CHECK-NEXT: nop
223 ; CHECK-NEXT: #NO_APP
224 ; CHECK-NEXT: blsil {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
225 ; CHECK-NEXT: popq %rbx
226 ; CHECK-NEXT: .cfi_def_cfa_offset 48
227 ; CHECK-NEXT: popq %r12
228 ; CHECK-NEXT: .cfi_def_cfa_offset 40
229 ; CHECK-NEXT: popq %r13
230 ; CHECK-NEXT: .cfi_def_cfa_offset 32
231 ; CHECK-NEXT: popq %r14
232 ; CHECK-NEXT: .cfi_def_cfa_offset 24
233 ; CHECK-NEXT: popq %r15
234 ; CHECK-NEXT: .cfi_def_cfa_offset 16
235 ; CHECK-NEXT: popq %rbp
236 ; CHECK-NEXT: .cfi_def_cfa_offset 8
237 ; CHECK-NEXT: retq
51238 %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
52239 %2 = sub i32 0, %a0
53240 %3 = and i32 %2, %a0
55242 }
56243
57244 define i64 @stack_fold_blsi_u64(i64 %a0) {
58 ;CHECK-LABEL: stack_fold_blsi_u64
59 ;CHECK: blsiq {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Folded Reload
245 ; CHECK-LABEL: stack_fold_blsi_u64:
246 ; CHECK: # %bb.0:
247 ; CHECK-NEXT: pushq %rbp
248 ; CHECK-NEXT: .cfi_def_cfa_offset 16
249 ; CHECK-NEXT: pushq %r15
250 ; CHECK-NEXT: .cfi_def_cfa_offset 24
251 ; CHECK-NEXT: pushq %r14
252 ; CHECK-NEXT: .cfi_def_cfa_offset 32
253 ; CHECK-NEXT: pushq %r13
254 ; CHECK-NEXT: .cfi_def_cfa_offset 40
255 ; CHECK-NEXT: pushq %r12
256 ; CHECK-NEXT: .cfi_def_cfa_offset 48
257 ; CHECK-NEXT: pushq %rbx
258 ; CHECK-NEXT: .cfi_def_cfa_offset 56
259 ; CHECK-NEXT: .cfi_offset %rbx, -56
260 ; CHECK-NEXT: .cfi_offset %r12, -48
261 ; CHECK-NEXT: .cfi_offset %r13, -40
262 ; CHECK-NEXT: .cfi_offset %r14, -32
263 ; CHECK-NEXT: .cfi_offset %r15, -24
264 ; CHECK-NEXT: .cfi_offset %rbp, -16
265 ; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
266 ; CHECK-NEXT: #APP
267 ; CHECK-NEXT: nop
268 ; CHECK-NEXT: #NO_APP
269 ; CHECK-NEXT: blsiq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
270 ; CHECK-NEXT: popq %rbx
271 ; CHECK-NEXT: .cfi_def_cfa_offset 48
272 ; CHECK-NEXT: popq %r12
273 ; CHECK-NEXT: .cfi_def_cfa_offset 40
274 ; CHECK-NEXT: popq %r13
275 ; CHECK-NEXT: .cfi_def_cfa_offset 32
276 ; CHECK-NEXT: popq %r14
277 ; CHECK-NEXT: .cfi_def_cfa_offset 24
278 ; CHECK-NEXT: popq %r15
279 ; CHECK-NEXT: .cfi_def_cfa_offset 16
280 ; CHECK-NEXT: popq %rbp
281 ; CHECK-NEXT: .cfi_def_cfa_offset 8
282 ; CHECK-NEXT: retq
60283 %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
61284 %2 = sub i64 0, %a0
62285 %3 = and i64 %2, %a0
64287 }
65288
66289 define i32 @stack_fold_blsmsk_u32(i32 %a0) {
67 ;CHECK-LABEL: stack_fold_blsmsk_u32
68 ;CHECK: blsmskl {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
290 ; CHECK-LABEL: stack_fold_blsmsk_u32:
291 ; CHECK: # %bb.0:
292 ; CHECK-NEXT: pushq %rbp
293 ; CHECK-NEXT: .cfi_def_cfa_offset 16
294 ; CHECK-NEXT: pushq %r15
295 ; CHECK-NEXT: .cfi_def_cfa_offset 24
296 ; CHECK-NEXT: pushq %r14
297 ; CHECK-NEXT: .cfi_def_cfa_offset 32
298 ; CHECK-NEXT: pushq %r13
299 ; CHECK-NEXT: .cfi_def_cfa_offset 40
300 ; CHECK-NEXT: pushq %r12
301 ; CHECK-NEXT: .cfi_def_cfa_offset 48
302 ; CHECK-NEXT: pushq %rbx
303 ; CHECK-NEXT: .cfi_def_cfa_offset 56
304 ; CHECK-NEXT: .cfi_offset %rbx, -56
305 ; CHECK-NEXT: .cfi_offset %r12, -48
306 ; CHECK-NEXT: .cfi_offset %r13, -40
307 ; CHECK-NEXT: .cfi_offset %r14, -32
308 ; CHECK-NEXT: .cfi_offset %r15, -24
309 ; CHECK-NEXT: .cfi_offset %rbp, -16
310 ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
311 ; CHECK-NEXT: #APP
312 ; CHECK-NEXT: nop
313 ; CHECK-NEXT: #NO_APP
314 ; CHECK-NEXT: blsmskl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
315 ; CHECK-NEXT: popq %rbx
316 ; CHECK-NEXT: .cfi_def_cfa_offset 48
317 ; CHECK-NEXT: popq %r12
318 ; CHECK-NEXT: .cfi_def_cfa_offset 40
319 ; CHECK-NEXT: popq %r13
320 ; CHECK-NEXT: .cfi_def_cfa_offset 32
321 ; CHECK-NEXT: popq %r14
322 ; CHECK-NEXT: .cfi_def_cfa_offset 24
323 ; CHECK-NEXT: popq %r15
324 ; CHECK-NEXT: .cfi_def_cfa_offset 16
325 ; CHECK-NEXT: popq %rbp
326 ; CHECK-NEXT: .cfi_def_cfa_offset 8
327 ; CHECK-NEXT: retq
69328 %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
70329 %2 = sub i32 %a0, 1
71330 %3 = xor i32 %2, %a0
73332 }
74333
75334 define i64 @stack_fold_blsmsk_u64(i64 %a0) {
76 ;CHECK-LABEL: stack_fold_blsmsk_u64
77 ;CHECK: blsmskq {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Folded Reload
335 ; CHECK-LABEL: stack_fold_blsmsk_u64:
336 ; CHECK: # %bb.0:
337 ; CHECK-NEXT: pushq %rbp
338 ; CHECK-NEXT: .cfi_def_cfa_offset 16
339 ; CHECK-NEXT: pushq %r15
340 ; CHECK-NEXT: .cfi_def_cfa_offset 24
341 ; CHECK-NEXT: pushq %r14
342 ; CHECK-NEXT: .cfi_def_cfa_offset 32
343 ; CHECK-NEXT: pushq %r13
344 ; CHECK-NEXT: .cfi_def_cfa_offset 40
345 ; CHECK-NEXT: pushq %r12
346 ; CHECK-NEXT: .cfi_def_cfa_offset 48
347 ; CHECK-NEXT: pushq %rbx
348 ; CHECK-NEXT: .cfi_def_cfa_offset 56
349 ; CHECK-NEXT: .cfi_offset %rbx, -56
350 ; CHECK-NEXT: .cfi_offset %r12, -48
351 ; CHECK-NEXT: .cfi_offset %r13, -40
352 ; CHECK-NEXT: .cfi_offset %r14, -32
353 ; CHECK-NEXT: .cfi_offset %r15, -24
354 ; CHECK-NEXT: .cfi_offset %rbp, -16
355 ; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
356 ; CHECK-NEXT: #APP
357 ; CHECK-NEXT: nop
358 ; CHECK-NEXT: #NO_APP
359 ; CHECK-NEXT: blsmskq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
360 ; CHECK-NEXT: popq %rbx
361 ; CHECK-NEXT: .cfi_def_cfa_offset 48
362 ; CHECK-NEXT: popq %r12
363 ; CHECK-NEXT: .cfi_def_cfa_offset 40
364 ; CHECK-NEXT: popq %r13
365 ; CHECK-NEXT: .cfi_def_cfa_offset 32
366 ; CHECK-NEXT: popq %r14
367 ; CHECK-NEXT: .cfi_def_cfa_offset 24
368 ; CHECK-NEXT: popq %r15
369 ; CHECK-NEXT: .cfi_def_cfa_offset 16
370 ; CHECK-NEXT: popq %rbp
371 ; CHECK-NEXT: .cfi_def_cfa_offset 8
372 ; CHECK-NEXT: retq
78373 %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
79374 %2 = sub i64 %a0, 1
80375 %3 = xor i64 %2, %a0
82377 }
83378
84379 define i32 @stack_fold_blsr_u32(i32 %a0) {
85 ;CHECK-LABEL: stack_fold_blsr_u32
86 ;CHECK: blsrl {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
380 ; CHECK-LABEL: stack_fold_blsr_u32:
381 ; CHECK: # %bb.0:
382 ; CHECK-NEXT: pushq %rbp
383 ; CHECK-NEXT: .cfi_def_cfa_offset 16
384 ; CHECK-NEXT: pushq %r15
385 ; CHECK-NEXT: .cfi_def_cfa_offset 24
386 ; CHECK-NEXT: pushq %r14
387 ; CHECK-NEXT: .cfi_def_cfa_offset 32
388 ; CHECK-NEXT: pushq %r13
389 ; CHECK-NEXT: .cfi_def_cfa_offset 40
390 ; CHECK-NEXT: pushq %r12
391 ; CHECK-NEXT: .cfi_def_cfa_offset 48
392 ; CHECK-NEXT: pushq %rbx
393 ; CHECK-NEXT: .cfi_def_cfa_offset 56
394 ; CHECK-NEXT: .cfi_offset %rbx, -56
395 ; CHECK-NEXT: .cfi_offset %r12, -48
396 ; CHECK-NEXT: .cfi_offset %r13, -40
397 ; CHECK-NEXT: .cfi_offset %r14, -32
398 ; CHECK-NEXT: .cfi_offset %r15, -24
399 ; CHECK-NEXT: .cfi_offset %rbp, -16
400 ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
401 ; CHECK-NEXT: #APP
402 ; CHECK-NEXT: nop
403 ; CHECK-NEXT: #NO_APP
404 ; CHECK-NEXT: blsrl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
405 ; CHECK-NEXT: popq %rbx
406 ; CHECK-NEXT: .cfi_def_cfa_offset 48
407 ; CHECK-NEXT: popq %r12
408 ; CHECK-NEXT: .cfi_def_cfa_offset 40
409 ; CHECK-NEXT: popq %r13
410 ; CHECK-NEXT: .cfi_def_cfa_offset 32
411 ; CHECK-NEXT: popq %r14
412 ; CHECK-NEXT: .cfi_def_cfa_offset 24
413 ; CHECK-NEXT: popq %r15
414 ; CHECK-NEXT: .cfi_def_cfa_offset 16
415 ; CHECK-NEXT: popq %rbp
416 ; CHECK-NEXT: .cfi_def_cfa_offset 8
417 ; CHECK-NEXT: retq
87418 %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
88419 %2 = sub i32 %a0, 1
89420 %3 = and i32 %2, %a0
91422 }
92423
93424 define i64 @stack_fold_blsr_u64(i64 %a0) {
94 ;CHECK-LABEL: stack_fold_blsr_u64
95 ;CHECK: blsrq {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Folded Reload
425 ; CHECK-LABEL: stack_fold_blsr_u64:
426 ; CHECK: # %bb.0:
427 ; CHECK-NEXT: pushq %rbp
428 ; CHECK-NEXT: .cfi_def_cfa_offset 16
429 ; CHECK-NEXT: pushq %r15
430 ; CHECK-NEXT: .cfi_def_cfa_offset 24
431 ; CHECK-NEXT: pushq %r14
432 ; CHECK-NEXT: .cfi_def_cfa_offset 32
433 ; CHECK-NEXT: pushq %r13
434 ; CHECK-NEXT: .cfi_def_cfa_offset 40
435 ; CHECK-NEXT: pushq %r12
436 ; CHECK-NEXT: .cfi_def_cfa_offset 48
437 ; CHECK-NEXT: pushq %rbx
438 ; CHECK-NEXT: .cfi_def_cfa_offset 56
439 ; CHECK-NEXT: .cfi_offset %rbx, -56
440 ; CHECK-NEXT: .cfi_offset %r12, -48
441 ; CHECK-NEXT: .cfi_offset %r13, -40
442 ; CHECK-NEXT: .cfi_offset %r14, -32
443 ; CHECK-NEXT: .cfi_offset %r15, -24
444 ; CHECK-NEXT: .cfi_offset %rbp, -16
445 ; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
446 ; CHECK-NEXT: #APP
447 ; CHECK-NEXT: nop
448 ; CHECK-NEXT: #NO_APP
449 ; CHECK-NEXT: blsrq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
450 ; CHECK-NEXT: popq %rbx
451 ; CHECK-NEXT: .cfi_def_cfa_offset 48
452 ; CHECK-NEXT: popq %r12
453 ; CHECK-NEXT: .cfi_def_cfa_offset 40
454 ; CHECK-NEXT: popq %r13
455 ; CHECK-NEXT: .cfi_def_cfa_offset 32
456 ; CHECK-NEXT: popq %r14
457 ; CHECK-NEXT: .cfi_def_cfa_offset 24
458 ; CHECK-NEXT: popq %r15
459 ; CHECK-NEXT: .cfi_def_cfa_offset 16
460 ; CHECK-NEXT: popq %rbp
461 ; CHECK-NEXT: .cfi_def_cfa_offset 8
462 ; CHECK-NEXT: retq
96463 %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
97464 %2 = sub i64 %a0, 1
98465 %3 = and i64 %2, %a0
102469 ;TODO stack_fold_tzcnt_u16
103470
104471 define i32 @stack_fold_tzcnt_u32(i32 %a0) {
105 ;CHECK-LABEL: stack_fold_tzcnt_u32
106 ;CHECK: tzcntl {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
472 ; CHECK-LABEL: stack_fold_tzcnt_u32:
473 ; CHECK: # %bb.0:
474 ; CHECK-NEXT: pushq %rbp
475 ; CHECK-NEXT: .cfi_def_cfa_offset 16
476 ; CHECK-NEXT: pushq %r15
477 ; CHECK-NEXT: .cfi_def_cfa_offset 24
478 ; CHECK-NEXT: pushq %r14
479 ; CHECK-NEXT: .cfi_def_cfa_offset 32
480 ; CHECK-NEXT: pushq %r13
481 ; CHECK-NEXT: .cfi_def_cfa_offset 40
482 ; CHECK-NEXT: pushq %r12
483 ; CHECK-NEXT: .cfi_def_cfa_offset 48
484 ; CHECK-NEXT: pushq %rbx
485 ; CHECK-NEXT: .cfi_def_cfa_offset 56
486 ; CHECK-NEXT: .cfi_offset %rbx, -56
487 ; CHECK-NEXT: .cfi_offset %r12, -48
488 ; CHECK-NEXT: .cfi_offset %r13, -40
489 ; CHECK-NEXT: .cfi_offset %r14, -32
490 ; CHECK-NEXT: .cfi_offset %r15, -24
491 ; CHECK-NEXT: .cfi_offset %rbp, -16
492 ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
493 ; CHECK-NEXT: #APP
494 ; CHECK-NEXT: nop
495 ; CHECK-NEXT: #NO_APP
496 ; CHECK-NEXT: tzcntl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
497 ; CHECK-NEXT: popq %rbx
498 ; CHECK-NEXT: .cfi_def_cfa_offset 48
499 ; CHECK-NEXT: popq %r12
500 ; CHECK-NEXT: .cfi_def_cfa_offset 40
501 ; CHECK-NEXT: popq %r13
502 ; CHECK-NEXT: .cfi_def_cfa_offset 32
503 ; CHECK-NEXT: popq %r14
504 ; CHECK-NEXT: .cfi_def_cfa_offset 24
505 ; CHECK-NEXT: popq %r15
506 ; CHECK-NEXT: .cfi_def_cfa_offset 16
507 ; CHECK-NEXT: popq %rbp
508 ; CHECK-NEXT: .cfi_def_cfa_offset 8
509 ; CHECK-NEXT: retq
107510 %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
108511 %2 = tail call i32 @llvm.cttz.i32(i32 %a0, i1 0)
109512 ret i32 %2
111514 declare i32 @llvm.cttz.i32(i32, i1)
112515
113516 define i64 @stack_fold_tzcnt_u64(i64 %a0) {
114 ;CHECK-LABEL: stack_fold_tzcnt_u64
115 ;CHECK: tzcntq {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Folded Reload
517 ; CHECK-LABEL: stack_fold_tzcnt_u64:
518 ; CHECK: # %bb.0:
519 ; CHECK-NEXT: pushq %rbp
520 ; CHECK-NEXT: .cfi_def_cfa_offset 16
521 ; CHECK-NEXT: pushq %r15
522 ; CHECK-NEXT: .cfi_def_cfa_offset 24
523 ; CHECK-NEXT: pushq %r14
524 ; CHECK-NEXT: .cfi_def_cfa_offset 32
525 ; CHECK-NEXT: pushq %r13
526 ; CHECK-NEXT: .cfi_def_cfa_offset 40
527 ; CHECK-NEXT: pushq %r12
528 ; CHECK-NEXT: .cfi_def_cfa_offset 48
529 ; CHECK-NEXT: pushq %rbx
530 ; CHECK-NEXT: .cfi_def_cfa_offset 56
531 ; CHECK-NEXT: .cfi_offset %rbx, -56
532 ; CHECK-NEXT: .cfi_offset %r12, -48
533 ; CHECK-NEXT: .cfi_offset %r13, -40
534 ; CHECK-NEXT: .cfi_offset %r14, -32
535 ; CHECK-NEXT: .cfi_offset %r15, -24
536 ; CHECK-NEXT: .cfi_offset %rbp, -16
537 ; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
538 ; CHECK-NEXT: #APP
539 ; CHECK-NEXT: nop
540 ; CHECK-NEXT: #NO_APP
541 ; CHECK-NEXT: tzcntq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
542 ; CHECK-NEXT: popq %rbx
543 ; CHECK-NEXT: .cfi_def_cfa_offset 48
544 ; CHECK-NEXT: popq %r12
545 ; CHECK-NEXT: .cfi_def_cfa_offset 40
546 ; CHECK-NEXT: popq %r13
547 ; CHECK-NEXT: .cfi_def_cfa_offset 32
548 ; CHECK-NEXT: popq %r14
549 ; CHECK-NEXT: .cfi_def_cfa_offset 24
550 ; CHECK-NEXT: popq %r15
551 ; CHECK-NEXT: .cfi_def_cfa_offset 16
552 ; CHECK-NEXT: popq %rbp
553 ; CHECK-NEXT: .cfi_def_cfa_offset 8
554 ; CHECK-NEXT: retq
116555 %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
117556 %2 = tail call i64 @llvm.cttz.i64(i64 %a0, i1 0)
118557 ret i64 %2
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1 ; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+bmi,+bmi2 < %s | FileCheck %s
12
23 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
89 ; relevant registers and check that the reload is correctly folded into the instruction.
910
1011 define i32 @stack_fold_bzhi_u32(i32 %a0, i32 %a1) {
11 ;CHECK-LABEL: stack_fold_bzhi_u32
12 ;CHECK: bzhil %eax, {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
12 ; CHECK-LABEL: stack_fold_bzhi_u32:
13 ; CHECK: # %bb.0:
14 ; CHECK-NEXT: pushq %rbp
15 ; CHECK-NEXT: .cfi_def_cfa_offset 16
16 ; CHECK-NEXT: pushq %r15
17 ; CHECK-NEXT: .cfi_def_cfa_offset 24
18 ; CHECK-NEXT: pushq %r14
19 ; CHECK-NEXT: .cfi_def_cfa_offset 32
20 ; CHECK-NEXT: pushq %r13
21 ; CHECK-NEXT: .cfi_def_cfa_offset 40
22 ; CHECK-NEXT: pushq %r12
23 ; CHECK-NEXT: .cfi_def_cfa_offset 48
24 ; CHECK-NEXT: pushq %rbx
25 ; CHECK-NEXT: .cfi_def_cfa_offset 56
26 ; CHECK-NEXT: .cfi_offset %rbx, -56
27 ; CHECK-NEXT: .cfi_offset %r12, -48
28 ; CHECK-NEXT: .cfi_offset %r13, -40
29 ; CHECK-NEXT: .cfi_offset %r14, -32
30 ; CHECK-NEXT: .cfi_offset %r15, -24
31 ; CHECK-NEXT: .cfi_offset %rbp, -16
32 ; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
33 ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
34 ; CHECK-NEXT: #APP
35 ; CHECK-NEXT: nop
36 ; CHECK-NEXT: #NO_APP
37 ; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
38 ; CHECK-NEXT: bzhil %eax, {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
39 ; CHECK-NEXT: popq %rbx
40 ; CHECK-NEXT: .cfi_def_cfa_offset 48
41 ; CHECK-NEXT: popq %r12
42 ; CHECK-NEXT: .cfi_def_cfa_offset 40
43 ; CHECK-NEXT: popq %r13
44 ; CHECK-NEXT: .cfi_def_cfa_offset 32
45 ; CHECK-NEXT: popq %r14
46 ; CHECK-NEXT: .cfi_def_cfa_offset 24
47 ; CHECK-NEXT: popq %r15
48 ; CHECK-NEXT: .cfi_def_cfa_offset 16
49 ; CHECK-NEXT: popq %rbp
50 ; CHECK-NEXT: .cfi_def_cfa_offset 8
51 ; CHECK-NEXT: retq
1352 %1 = tail call i32 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
1453 %2 = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %a0, i32 %a1)
1554 ret i32 %2
1756 declare i32 @llvm.x86.bmi.bzhi.32(i32, i32)
1857
1958 define i64 @stack_fold_bzhi_u64(i64 %a0, i64 %a1) {
20 ;CHECK-LABEL: stack_fold_bzhi_u64
21 ;CHECK: bzhiq %rax, {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Folded Reload
59 ; CHECK-LABEL: stack_fold_bzhi_u64:
60 ; CHECK: # %bb.0:
61 ; CHECK-NEXT: pushq %rbp
62 ; CHECK-NEXT: .cfi_def_cfa_offset 16
63 ; CHECK-NEXT: pushq %r15
64 ; CHECK-NEXT: .cfi_def_cfa_offset 24
65 ; CHECK-NEXT: pushq %r14
66 ; CHECK-NEXT: .cfi_def_cfa_offset 32
67 ; CHECK-NEXT: pushq %r13
68 ; CHECK-NEXT: .cfi_def_cfa_offset 40
69 ; CHECK-NEXT: pushq %r12
70 ; CHECK-NEXT: .cfi_def_cfa_offset 48
71 ; CHECK-NEXT: pushq %rbx
72 ; CHECK-NEXT: .cfi_def_cfa_offset 56
73 ; CHECK-NEXT: .cfi_offset %rbx, -56
74 ; CHECK-NEXT: .cfi_offset %r12, -48
75 ; CHECK-NEXT: .cfi_offset %r13, -40
76 ; CHECK-NEXT: .cfi_offset %r14, -32
77 ; CHECK-NEXT: .cfi_offset %r15, -24
78 ; CHECK-NEXT: .cfi_offset %rbp, -16
79 ; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
80 ; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
81 ; CHECK-NEXT: #APP
82 ; CHECK-NEXT: nop
83 ; CHECK-NEXT: #NO_APP
84 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
85 ; CHECK-NEXT: bzhiq %rax, {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
86 ; CHECK-NEXT: popq %rbx
87 ; CHECK-NEXT: .cfi_def_cfa_offset 48
88 ; CHECK-NEXT: popq %r12
89 ; CHECK-NEXT: .cfi_def_cfa_offset 40
90 ; CHECK-NEXT: popq %r13
91 ; CHECK-NEXT: .cfi_def_cfa_offset 32
92 ; CHECK-NEXT: popq %r14
93 ; CHECK-NEXT: .cfi_def_cfa_offset 24
94 ; CHECK-NEXT: popq %r15
95 ; CHECK-NEXT: .cfi_def_cfa_offset 16
96 ; CHECK-NEXT: popq %rbp
97 ; CHECK-NEXT: .cfi_def_cfa_offset 8
98 ; CHECK-NEXT: retq
2299 %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
23100 %2 = tail call i64 @llvm.x86.bmi.bzhi.64(i64 %a0, i64 %a1)
24101 ret i64 %2
26103 declare i64 @llvm.x86.bmi.bzhi.64(i64, i64)
27104
28105 define i32 @stack_fold_pdep_u32(i32 %a0, i32 %a1) {
29 ;CHECK-LABEL: stack_fold_pdep_u32
30 ;CHECK: pdepl {{-?[0-9]*}}(%rsp), %eax, %eax {{.*#+}} 4-byte Folded Reload
106 ; CHECK-LABEL: stack_fold_pdep_u32:
107 ; CHECK: # %bb.0:
108 ; CHECK-NEXT: pushq %rbp
109 ; CHECK-NEXT: .cfi_def_cfa_offset 16
110 ; CHECK-NEXT: pushq %r15
111 ; CHECK-NEXT: .cfi_def_cfa_offset 24
112 ; CHECK-NEXT: pushq %r14
113 ; CHECK-NEXT: .cfi_def_cfa_offset 32
114 ; CHECK-NEXT: pushq %r13
115 ; CHECK-NEXT: .cfi_def_cfa_offset 40
116 ; CHECK-NEXT: pushq %r12
117 ; CHECK-NEXT: .cfi_def_cfa_offset 48
118 ; CHECK-NEXT: pushq %rbx
119 ; CHECK-NEXT: .cfi_def_cfa_offset 56
120 ; CHECK-NEXT: .cfi_offset %rbx, -56
121 ; CHECK-NEXT: .cfi_offset %r12, -48
122 ; CHECK-NEXT: .cfi_offset %r13, -40
123 ; CHECK-NEXT: .cfi_offset %r14, -32
124 ; CHECK-NEXT: .cfi_offset %r15, -24
125 ; CHECK-NEXT: .cfi_offset %rbp, -16
126 ; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
127 ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
128 ; CHECK-NEXT: #APP
129 ; CHECK-NEXT: nop
130 ; CHECK-NEXT: #NO_APP
131 ; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
132 ; CHECK-NEXT: pdepl {{[-0-9]+}}(%r{{[sb]}}p), %eax, %eax # 4-byte Folded Reload
133 ; CHECK-NEXT: popq %rbx
134 ; CHECK-NEXT: .cfi_def_cfa_offset 48
135 ; CHECK-NEXT: popq %r12
136 ; CHECK-NEXT: .cfi_def_cfa_offset 40
137 ; CHECK-NEXT: popq %r13
138 ; CHECK-NEXT: .cfi_def_cfa_offset 32
139 ; CHECK-NEXT: popq %r14
140 ; CHECK-NEXT: .cfi_def_cfa_offset 24
141 ; CHECK-NEXT: popq %r15
142 ; CHECK-NEXT: .cfi_def_cfa_offset 16
143 ; CHECK-NEXT: popq %rbp
144 ; CHECK-NEXT: .cfi_def_cfa_offset 8
145 ; CHECK-NEXT: retq
31146 %1 = tail call i32 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
32147 %2 = tail call i32 @llvm.x86.bmi.pdep.32(i32 %a0, i32 %a1)
33148 ret i32 %2
35150 declare i32 @llvm.x86.bmi.pdep.32(i32, i32)
36151
37152 define i64 @stack_fold_pdep_u64(i64 %a0, i64 %a1) {
38 ;CHECK-LABEL: stack_fold_pdep_u64
39 ;CHECK: pdepq {{-?[0-9]*}}(%rsp), %rax, %rax {{.*#+}} 8-byte Folded Reload
153 ; CHECK-LABEL: stack_fold_pdep_u64:
154 ; CHECK: # %bb.0:
155 ; CHECK-NEXT: pushq %rbp
156 ; CHECK-NEXT: .cfi_def_cfa_offset 16
157 ; CHECK-NEXT: pushq %r15
158 ; CHECK-NEXT: .cfi_def_cfa_offset 24
159 ; CHECK-NEXT: pushq %r14
160 ; CHECK-NEXT: .cfi_def_cfa_offset 32
161 ; CHECK-NEXT: pushq %r13
162 ; CHECK-NEXT: .cfi_def_cfa_offset 40
163 ; CHECK-NEXT: pushq %r12
164 ; CHECK-NEXT: .cfi_def_cfa_offset 48
165 ; CHECK-NEXT: pushq %rbx
166 ; CHECK-NEXT: .cfi_def_cfa_offset 56
167 ; CHECK-NEXT: .cfi_offset %rbx, -56
168 ; CHECK-NEXT: .cfi_offset %r12, -48
169 ; CHECK-NEXT: .cfi_offset %r13, -40
170 ; CHECK-NEXT: .cfi_offset %r14, -32
171 ; CHECK-NEXT: .cfi_offset %r15, -24
172 ; CHECK-NEXT: .cfi_offset %rbp, -16
173 ; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
174 ; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
175 ; CHECK-NEXT: #APP
176 ; CHECK-NEXT: nop
177 ; CHECK-NEXT: #NO_APP
178 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
179 ; CHECK-NEXT: pdepq {{[-0-9]+}}(%r{{[sb]}}p), %rax, %rax # 8-byte Folded Reload
180 ; CHECK-NEXT: popq %rbx
181 ; CHECK-NEXT: .cfi_def_cfa_offset 48
182 ; CHECK-NEXT: popq %r12
183 ; CHECK-NEXT: .cfi_def_cfa_offset 40
184 ; CHECK-NEXT: popq %r13
185 ; CHECK-NEXT: .cfi_def_cfa_offset 32
186 ; CHECK-NEXT: popq %r14
187 ; CHECK-NEXT: .cfi_def_cfa_offset 24
188 ; CHECK-NEXT: popq %r15
189 ; CHECK-NEXT: .cfi_def_cfa_offset 16
190 ; CHECK-NEXT: popq %rbp
191 ; CHECK-NEXT: .cfi_def_cfa_offset 8
192 ; CHECK-NEXT: retq
40193 %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
41194 %2 = tail call i64 @llvm.x86.bmi.pdep.64(i64 %a0, i64 %a1)
42195 ret i64 %2
44197 declare i64 @llvm.x86.bmi.pdep.64(i64, i64)
45198
46199 define i32 @stack_fold_pext_u32(i32 %a0, i32 %a1) {
47 ;CHECK-LABEL: stack_fold_pext_u32
48 ;CHECK: pextl {{-?[0-9]*}}(%rsp), %eax, %eax {{.*#+}} 4-byte Folded Reload
200 ; CHECK-LABEL: stack_fold_pext_u32:
201 ; CHECK: # %bb.0:
202 ; CHECK-NEXT: pushq %rbp
203 ; CHECK-NEXT: .cfi_def_cfa_offset 16
204 ; CHECK-NEXT: pushq %r15
205 ; CHECK-NEXT: .cfi_def_cfa_offset 24
206 ; CHECK-NEXT: pushq %r14
207 ; CHECK-NEXT: .cfi_def_cfa_offset 32
208 ; CHECK-NEXT: pushq %r13
209 ; CHECK-NEXT: .cfi_def_cfa_offset 40
210 ; CHECK-NEXT: pushq %r12
211 ; CHECK-NEXT: .cfi_def_cfa_offset 48
212 ; CHECK-NEXT: pushq %rbx
213 ; CHECK-NEXT: .cfi_def_cfa_offset 56
214 ; CHECK-NEXT: .cfi_offset %rbx, -56
215 ; CHECK-NEXT: .cfi_offset %r12, -48
216 ; CHECK-NEXT: .cfi_offset %r13, -40
217 ; CHECK-NEXT: .cfi_offset %r14, -32
218 ; CHECK-NEXT: .cfi_offset %r15, -24
219 ; CHECK-NEXT: .cfi_offset %rbp, -16
220 ; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
221 ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
222 ; CHECK-NEXT: #APP
223 ; CHECK-NEXT: nop
224 ; CHECK-NEXT: #NO_APP
225 ; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
226 ; CHECK-NEXT: pextl {{[-0-9]+}}(%r{{[sb]}}p), %eax, %eax # 4-byte Folded Reload
227 ; CHECK-NEXT: popq %rbx
228 ; CHECK-NEXT: .cfi_def_cfa_offset 48
229 ; CHECK-NEXT: popq %r12
230 ; CHECK-NEXT: .cfi_def_cfa_offset 40
231 ; CHECK-NEXT: popq %r13
232 ; CHECK-NEXT: .cfi_def_cfa_offset 32
233 ; CHECK-NEXT: popq %r14
234 ; CHECK-NEXT: .cfi_def_cfa_offset 24
235 ; CHECK-NEXT: popq %r15
236 ; CHECK-NEXT: .cfi_def_cfa_offset 16
237 ; CHECK-NEXT: popq %rbp
238 ; CHECK-NEXT: .cfi_def_cfa_offset 8
239 ; CHECK-NEXT: retq
49240 %1 = tail call i32 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
50241 %2 = tail call i32 @llvm.x86.bmi.pext.32(i32 %a0, i32 %a1)
51242 ret i32 %2
53244 declare i32 @llvm.x86.bmi.pext.32(i32, i32)
54245
55246 define i64 @stack_fold_pext_u64(i64 %a0, i64 %a1) {
56 ;CHECK-LABEL: stack_fold_pext_u64
57 ;CHECK: pextq {{-?[0-9]*}}(%rsp), %rax, %rax {{.*#+}} 8-byte Folded Reload
247 ; CHECK-LABEL: stack_fold_pext_u64:
248 ; CHECK: # %bb.0:
249 ; CHECK-NEXT: pushq %rbp
250 ; CHECK-NEXT: .cfi_def_cfa_offset 16
251 ; CHECK-NEXT: pushq %r15
252 ; CHECK-NEXT: .cfi_def_cfa_offset 24
253 ; CHECK-NEXT: pushq %r14
254 ; CHECK-NEXT: .cfi_def_cfa_offset 32
255 ; CHECK-NEXT: pushq %r13
256 ; CHECK-NEXT: .cfi_def_cfa_offset 40
257 ; CHECK-NEXT: pushq %r12
258 ; CHECK-NEXT: .cfi_def_cfa_offset 48
259 ; CHECK-NEXT: pushq %rbx
260 ; CHECK-NEXT: .cfi_def_cfa_offset 56
261 ; CHECK-NEXT: .cfi_offset %rbx, -56
262 ; CHECK-NEXT: .cfi_offset %r12, -48
263 ; CHECK-NEXT: .cfi_offset %r13, -40
264 ; CHECK-NEXT: .cfi_offset %r14, -32
265 ; CHECK-NEXT: .cfi_offset %r15, -24
266 ; CHECK-NEXT: .cfi_offset %rbp, -16
267 ; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
268 ; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
269 ; CHECK-NEXT: #APP
270 ; CHECK-NEXT: nop
271 ; CHECK-NEXT: #NO_APP
272 ; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
273 ; CHECK-NEXT: pextq {{[-0-9]+}}(%r{{[sb]}}p), %rax, %rax # 8-byte Folded Reload
274 ; CHECK-NEXT: popq %rbx
275 ; CHECK-NEXT: .cfi_def_cfa_offset 48
276 ; CHECK-NEXT: popq %r12
277 ; CHECK-NEXT: .cfi_def_cfa_offset 40
278 ; CHECK-NEXT: popq %r13
279 ; CHECK-NEXT: .cfi_def_cfa_offset 32
280 ; CHECK-NEXT: popq %r14
281 ; CHECK-NEXT: .cfi_def_cfa_offset 24
282 ; CHECK-NEXT: popq %r15
283 ; CHECK-NEXT: .cfi_def_cfa_offset 16
284 ; CHECK-NEXT: popq %rbp
285 ; CHECK-NEXT: .cfi_def_cfa_offset 8
286 ; CHECK-NEXT: retq
58287 %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
59288 %2 = tail call i64 @llvm.x86.bmi.pext.64(i64 %a0, i64 %a1)
60289 ret i64 %2
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1 ; RUN: llc -O3 -verify-machineinstrs -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c < %s | FileCheck %s
12
23 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
89 ; relevant registers and check that the reload is correctly folded into the instruction.
910
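; (in the regenerated checks, %r{{[sb]}}p matches either %rsp or %rbp so the
; assertions hold with or without a frame pointer, and {{[-0-9]+}} matches
; any slot offset: the tests pin down only the folded instruction and its
; "Folded Reload" annotation, not the exact frame layout)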
1011 define <2 x double> @stack_fold_addpd(<2 x double> %a0, <2 x double> %a1) {
11 ;CHECK-LABEL: stack_fold_addpd
12 ;CHECK: vaddpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
12 ; CHECK-LABEL: stack_fold_addpd:
13 ; CHECK: # %bb.0:
14 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15 ; CHECK-NEXT: #APP
16 ; CHECK-NEXT: nop
17 ; CHECK-NEXT: #NO_APP
18 ; CHECK-NEXT: vaddpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
19 ; CHECK-NEXT: retq
1320 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
1421 %2 = fadd <2 x double> %a0, %a1
1522 ret <2 x double> %2
1623 }
1724
1825 define <4 x double> @stack_fold_addpd_ymm(<4 x double> %a0, <4 x double> %a1) {
19 ;CHECK-LABEL: stack_fold_addpd_ymm
20 ;CHECK: vaddpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
26 ; CHECK-LABEL: stack_fold_addpd_ymm:
27 ; CHECK: # %bb.0:
28 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
29 ; CHECK-NEXT: #APP
30 ; CHECK-NEXT: nop
31 ; CHECK-NEXT: #NO_APP
32 ; CHECK-NEXT: vaddpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
33 ; CHECK-NEXT: retq
2134 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
2235 %2 = fadd <4 x double> %a0, %a1
2336 ret <4 x double> %2
2437 }
2538
2639 define <4 x float> @stack_fold_addps(<4 x float> %a0, <4 x float> %a1) {
27 ;CHECK-LABEL: stack_fold_addps
28 ;CHECK: vaddps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
40 ; CHECK-LABEL: stack_fold_addps:
41 ; CHECK: # %bb.0:
42 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
43 ; CHECK-NEXT: #APP
44 ; CHECK-NEXT: nop
45 ; CHECK-NEXT: #NO_APP
46 ; CHECK-NEXT: vaddps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
47 ; CHECK-NEXT: retq
2948 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3049 %2 = fadd <4 x float> %a0, %a1
3150 ret <4 x float> %2
3251 }
3352
3453 define <8 x float> @stack_fold_addps_ymm(<8 x float> %a0, <8 x float> %a1) {
35 ;CHECK-LABEL: stack_fold_addps_ymm
36 ;CHECK: vaddps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
54 ; CHECK-LABEL: stack_fold_addps_ymm:
55 ; CHECK: # %bb.0:
56 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
57 ; CHECK-NEXT: #APP
58 ; CHECK-NEXT: nop
59 ; CHECK-NEXT: #NO_APP
60 ; CHECK-NEXT: vaddps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
61 ; CHECK-NEXT: retq
3762 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
3863 %2 = fadd <8 x float> %a0, %a1
3964 ret <8 x float> %2
4065 }
4166
4267 define double @stack_fold_addsd(double %a0, double %a1) {
43 ;CHECK-LABEL: stack_fold_addsd
44 ;CHECK: vaddsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
68 ; CHECK-LABEL: stack_fold_addsd:
69 ; CHECK: # %bb.0:
70 ; CHECK-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
71 ; CHECK-NEXT: #APP
72 ; CHECK-NEXT: nop
73 ; CHECK-NEXT: #NO_APP
74 ; CHECK-NEXT: vaddsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
75 ; CHECK-NEXT: retq
4576 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
4677 %2 = fadd double %a0, %a1
4778 ret double %2
4879 }
4980
5081 define <2 x double> @stack_fold_addsd_int(<2 x double> %a0, <2 x double> %a1) {
51 ;CHECK-LABEL: stack_fold_addsd_int
52 ;CHECK: vaddsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
82 ; CHECK-LABEL: stack_fold_addsd_int:
83 ; CHECK: # %bb.0:
84 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
85 ; CHECK-NEXT: #APP
86 ; CHECK-NEXT: nop
87 ; CHECK-NEXT: #NO_APP
88 ; CHECK-NEXT: vaddsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
89 ; CHECK-NEXT: retq
5390 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
5491 %2 = extractelement <2 x double> %a0, i32 0
5592 %3 = extractelement <2 x double> %a1, i32 0
6097 declare <2 x double> @llvm.x86.sse2.add.sd(<2 x double>, <2 x double>) nounwind readnone
6198
6299 define float @stack_fold_addss(float %a0, float %a1) {
63 ;CHECK-LABEL: stack_fold_addss
64 ;CHECK: vaddss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
100 ; CHECK-LABEL: stack_fold_addss:
101 ; CHECK: # %bb.0:
102 ; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
103 ; CHECK-NEXT: #APP
104 ; CHECK-NEXT: nop
105 ; CHECK-NEXT: #NO_APP
106 ; CHECK-NEXT: vaddss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
107 ; CHECK-NEXT: retq
65108 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
66109 %2 = fadd float %a0, %a1
67110 ret float %2
68111 }
69112
70113 define <4 x float> @stack_fold_addss_int(<4 x float> %a0, <4 x float> %a1) {
71 ;CHECK-LABEL: stack_fold_addss_int
72 ;CHECK: vaddss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
114 ; CHECK-LABEL: stack_fold_addss_int:
115 ; CHECK: # %bb.0:
116 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
117 ; CHECK-NEXT: #APP
118 ; CHECK-NEXT: nop
119 ; CHECK-NEXT: #NO_APP
120 ; CHECK-NEXT: vaddss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
121 ; CHECK-NEXT: retq
73122 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
74123 %2 = extractelement <4 x float> %a0, i32 0
75124 %3 = extractelement <4 x float> %a1, i32 0
80129 declare <4 x float> @llvm.x86.sse.add.ss(<4 x float>, <4 x float>) nounwind readnone
81130
82131 define <2 x double> @stack_fold_addsubpd(<2 x double> %a0, <2 x double> %a1) {
83 ;CHECK-LABEL: stack_fold_addsubpd
84 ;CHECK: vaddsubpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
132 ; CHECK-LABEL: stack_fold_addsubpd:
133 ; CHECK: # %bb.0:
134 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
135 ; CHECK-NEXT: #APP
136 ; CHECK-NEXT: nop
137 ; CHECK-NEXT: #NO_APP
138 ; CHECK-NEXT: vaddsubpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
139 ; CHECK-NEXT: retq
85140 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
86141 %2 = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %a0, <2 x double> %a1)
87142 ret <2 x double> %2
89144 declare <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double>, <2 x double>) nounwind readnone
90145
91146 define <4 x double> @stack_fold_addsubpd_ymm(<4 x double> %a0, <4 x double> %a1) {
92 ;CHECK-LABEL: stack_fold_addsubpd_ymm
93 ;CHECK: vaddsubpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
147 ; CHECK-LABEL: stack_fold_addsubpd_ymm:
148 ; CHECK: # %bb.0:
149 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
150 ; CHECK-NEXT: #APP
151 ; CHECK-NEXT: nop
152 ; CHECK-NEXT: #NO_APP
153 ; CHECK-NEXT: vaddsubpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
154 ; CHECK-NEXT: retq
94155 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
95156 %2 = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %a0, <4 x double> %a1)
96157 ret <4 x double> %2
98159 declare <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double>, <4 x double>) nounwind readnone
99160
100161 define <4 x float> @stack_fold_addsubps(<4 x float> %a0, <4 x float> %a1) {
101 ;CHECK-LABEL: stack_fold_addsubps
102 ;CHECK: vaddsubps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
162 ; CHECK-LABEL: stack_fold_addsubps:
163 ; CHECK: # %bb.0:
164 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
165 ; CHECK-NEXT: #APP
166 ; CHECK-NEXT: nop
167 ; CHECK-NEXT: #NO_APP
168 ; CHECK-NEXT: vaddsubps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
169 ; CHECK-NEXT: retq
103170 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
104171 %2 = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %a0, <4 x float> %a1)
105172 ret <4 x float> %2
107174 declare <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float>, <4 x float>) nounwind readnone
108175
109176 define <8 x float> @stack_fold_addsubps_ymm(<8 x float> %a0, <8 x float> %a1) {
110 ;CHECK-LABEL: stack_fold_addsubps_ymm
111 ;CHECK: vaddsubps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
177 ; CHECK-LABEL: stack_fold_addsubps_ymm:
178 ; CHECK: # %bb.0:
179 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
180 ; CHECK-NEXT: #APP
181 ; CHECK-NEXT: nop
182 ; CHECK-NEXT: #NO_APP
183 ; CHECK-NEXT: vaddsubps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
184 ; CHECK-NEXT: retq
112185 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
113186 %2 = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %a0, <8 x float> %a1)
114187 ret <8 x float> %2
116189 declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>) nounwind readnone
117190
118191 define <2 x double> @stack_fold_andnpd(<2 x double> %a0, <2 x double> %a1) {
119 ;CHECK-LABEL: stack_fold_andnpd
120 ;CHECK: vandnpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
192 ; CHECK-LABEL: stack_fold_andnpd:
193 ; CHECK: # %bb.0:
194 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
195 ; CHECK-NEXT: #APP
196 ; CHECK-NEXT: nop
197 ; CHECK-NEXT: #NO_APP
198 ; CHECK-NEXT: vandnpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
199 ; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
200 ; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
201 ; CHECK-NEXT: retq
121202 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
122203 %2 = bitcast <2 x double> %a0 to <2 x i64>
123204 %3 = bitcast <2 x double> %a1 to <2 x i64>
130211 }
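; Without the trailing fadd of zero, the bitcast integer ops could select the
; integer domain (vpandn); the fadd keeps the result in the FP domain so the
; folded instruction stays vandnpd.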
131212
132213 define <4 x double> @stack_fold_andnpd_ymm(<4 x double> %a0, <4 x double> %a1) {
133 ;CHECK-LABEL: stack_fold_andnpd_ymm
134 ;CHECK: vandnpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
214 ; CHECK-LABEL: stack_fold_andnpd_ymm:
215 ; CHECK: # %bb.0:
216 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
217 ; CHECK-NEXT: #APP
218 ; CHECK-NEXT: nop
219 ; CHECK-NEXT: #NO_APP
220 ; CHECK-NEXT: vandnpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
221 ; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
222 ; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
223 ; CHECK-NEXT: retq
135224 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
136225 %2 = bitcast <4 x double> %a0 to <4 x i64>
137226 %3 = bitcast <4 x double> %a1 to <4 x i64>
144233 }
145234
146235 define <4 x float> @stack_fold_andnps(<4 x float> %a0, <4 x float> %a1) {
147 ;CHECK-LABEL: stack_fold_andnps
148 ;CHECK: vandnps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
236 ; CHECK-LABEL: stack_fold_andnps:
237 ; CHECK: # %bb.0:
238 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
239 ; CHECK-NEXT: #APP
240 ; CHECK-NEXT: nop
241 ; CHECK-NEXT: #NO_APP
242 ; CHECK-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
243 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
244 ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
245 ; CHECK-NEXT: retq
149246 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
150247 %2 = bitcast <4 x float> %a0 to <2 x i64>
151248 %3 = bitcast <4 x float> %a1 to <2 x i64>
158255 }
159256
160257 define <8 x float> @stack_fold_andnps_ymm(<8 x float> %a0, <8 x float> %a1) {
161 ;CHECK-LABEL: stack_fold_andnps_ymm
162 ;CHECK: vandnps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
258 ; CHECK-LABEL: stack_fold_andnps_ymm:
259 ; CHECK: # %bb.0:
260 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
261 ; CHECK-NEXT: #APP
262 ; CHECK-NEXT: nop
263 ; CHECK-NEXT: #NO_APP
264 ; CHECK-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
265 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
266 ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
267 ; CHECK-NEXT: retq
163268 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
164269 %2 = bitcast <8 x float> %a0 to <4 x i64>
165270 %3 = bitcast <8 x float> %a1 to <4 x i64>
172277 }
173278
174279 define <2 x double> @stack_fold_andpd(<2 x double> %a0, <2 x double> %a1) {
175 ;CHECK-LABEL: stack_fold_andpd
176 ;CHECK: vandpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
280 ; CHECK-LABEL: stack_fold_andpd:
281 ; CHECK: # %bb.0:
282 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
283 ; CHECK-NEXT: #APP
284 ; CHECK-NEXT: nop
285 ; CHECK-NEXT: #NO_APP
286 ; CHECK-NEXT: vandpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
287 ; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
288 ; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
289 ; CHECK-NEXT: retq
177290 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
178291 %2 = bitcast <2 x double> %a0 to <2 x i64>
179292 %3 = bitcast <2 x double> %a1 to <2 x i64>
185298 }
186299
187300 define <4 x double> @stack_fold_andpd_ymm(<4 x double> %a0, <4 x double> %a1) {
188 ;CHECK-LABEL: stack_fold_andpd_ymm
189 ;CHECK: vandpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
301 ; CHECK-LABEL: stack_fold_andpd_ymm:
302 ; CHECK: # %bb.0:
303 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
304 ; CHECK-NEXT: #APP
305 ; CHECK-NEXT: nop
306 ; CHECK-NEXT: #NO_APP
307 ; CHECK-NEXT: vandpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
308 ; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
309 ; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
310 ; CHECK-NEXT: retq
190311 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
191312 %2 = bitcast <4 x double> %a0 to <4 x i64>
192313 %3 = bitcast <4 x double> %a1 to <4 x i64>
198319 }
199320
200321 define <4 x float> @stack_fold_andps(<4 x float> %a0, <4 x float> %a1) {
201 ;CHECK-LABEL: stack_fold_andps
202 ;CHECK: vandps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
322 ; CHECK-LABEL: stack_fold_andps:
323 ; CHECK: # %bb.0:
324 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
325 ; CHECK-NEXT: #APP
326 ; CHECK-NEXT: nop
327 ; CHECK-NEXT: #NO_APP
328 ; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
329 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
330 ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
331 ; CHECK-NEXT: retq
203332 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
204333 %2 = bitcast <4 x float> %a0 to <2 x i64>
205334 %3 = bitcast <4 x float> %a1 to <2 x i64>
211340 }
212341
213342 define <8 x float> @stack_fold_andps_ymm(<8 x float> %a0, <8 x float> %a1) {
214 ;CHECK-LABEL: stack_fold_andps_ymm
215 ;CHECK: vandps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
343 ; CHECK-LABEL: stack_fold_andps_ymm:
344 ; CHECK: # %bb.0:
345 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
346 ; CHECK-NEXT: #APP
347 ; CHECK-NEXT: nop
348 ; CHECK-NEXT: #NO_APP
349 ; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
350 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
351 ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
352 ; CHECK-NEXT: retq
216353 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
217354 %2 = bitcast <8 x float> %a0 to <4 x i64>
218355 %3 = bitcast <8 x float> %a1 to <4 x i64>
224361 }
225362
226363 define <2 x double> @stack_fold_blendpd(<2 x double> %a0, <2 x double> %a1) {
227 ;CHECK-LABEL: stack_fold_blendpd
228 ;CHECK: vblendpd $2, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
364 ; CHECK-LABEL: stack_fold_blendpd:
365 ; CHECK: # %bb.0:
366 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
367 ; CHECK-NEXT: #APP
368 ; CHECK-NEXT: nop
369 ; CHECK-NEXT: #NO_APP
370 ; CHECK-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
371 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[1]
372 ; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
373 ; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
374 ; CHECK-NEXT: retq
229375 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
230376 %2 = select <2 x i1> <i1 1, i1 0>, <2 x double> %a0, <2 x double> %a1
231377 ; fadd forces execution domain
234380 }
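; The constant select mask becomes the vblendpd immediate: each set bit picks
; that element from the folded memory operand (%a1), matching the
; "xmm0 = xmm0[0],mem[1]" shuffle comments.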
235381
236382 define <4 x double> @stack_fold_blendpd_ymm(<4 x double> %a0, <4 x double> %a1) {
237 ;CHECK-LABEL: stack_fold_blendpd_ymm
238 ;CHECK: vblendpd $6, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
383 ; CHECK-LABEL: stack_fold_blendpd_ymm:
384 ; CHECK: # %bb.0:
385 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
386 ; CHECK-NEXT: #APP
387 ; CHECK-NEXT: nop
388 ; CHECK-NEXT: #NO_APP
389 ; CHECK-NEXT: vblendpd $6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
390 ; CHECK-NEXT: # ymm0 = ymm0[0],mem[1,2],ymm0[3]
391 ; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
392 ; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
393 ; CHECK-NEXT: retq
239394 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
240395 %2 = select <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x double> %a0, <4 x double> %a1
241396 ; fadd forces execution domain
243398 ret <4 x double> %3}
244399
245400 define <4 x float> @stack_fold_blendps(<4 x float> %a0, <4 x float> %a1) {
246 ;CHECK-LABEL: stack_fold_blendps
247 ;CHECK: vblendps $6, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
248401 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
249402 %2 = select <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x float> %a0, <4 x float> %a1
250403 ; fadd forces execution domain
253406 }
254407
255408 define <8 x float> @stack_fold_blendps_ymm(<8 x float> %a0, <8 x float> %a1) {
256 ;CHECK-LABEL: stack_fold_blendps_ymm
257 ;CHECK: vblendps $102, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
409 ; CHECK-LABEL: stack_fold_blendps_ymm:
410 ; CHECK: # %bb.0:
411 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
412 ; CHECK-NEXT: #APP
413 ; CHECK-NEXT: nop
414 ; CHECK-NEXT: #NO_APP
415 ; CHECK-NEXT: vblendps $102, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
416 ; CHECK-NEXT: # ymm0 = ymm0[0],mem[1,2],ymm0[3,4],mem[5,6],ymm0[7]
417 ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
418 ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
419 ; CHECK-NEXT: retq
258420 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
259421 %2 = select <8 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x float> %a0, <8 x float> %a1
260422 ; fadd forces execution domain
263425 }
264426
265427 define <2 x double> @stack_fold_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %c) {
266 ;CHECK-LABEL: stack_fold_blendvpd
267 ;CHECK: vblendvpd {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
428 ; CHECK-LABEL: stack_fold_blendvpd:
429 ; CHECK: # %bb.0:
430 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
431 ; CHECK-NEXT: #APP
432 ; CHECK-NEXT: nop
433 ; CHECK-NEXT: #NO_APP
434 ; CHECK-NEXT: vblendvpd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
435 ; CHECK-NEXT: retq
268436 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
269437 %2 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a1, <2 x double> %c, <2 x double> %a0)
270438 ret <2 x double> %2
272440 declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
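; The three-operand blendv tests clobber only xmm3-xmm15, leaving xmm0-xmm2
; for %a0, %a1, %c and the asm result; %c is the value spilled and folded as
; vblendvpd's memory source.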
273441
274442 define <4 x double> @stack_fold_blendvpd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> %c) {
275 ;CHECK-LABEL: stack_fold_blendvpd_ymm
276 ;CHECK: vblendvpd {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
443 ; CHECK-LABEL: stack_fold_blendvpd_ymm:
444 ; CHECK: # %bb.0:
445 ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
446 ; CHECK-NEXT: #APP
447 ; CHECK-NEXT: nop
448 ; CHECK-NEXT: #NO_APP
449 ; CHECK-NEXT: vblendvpd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
450 ; CHECK-NEXT: retq
277451 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
278452 %2 = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %a1, <4 x double> %c, <4 x double> %a0)
279453 ret <4 x double> %2
281455 declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
282456
283457 define <4 x float> @stack_fold_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %c) {
284 ;CHECK-LABEL: stack_fold_blendvps
285 ;CHECK: vblendvps {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
458 ; CHECK-LABEL: stack_fold_blendvps:
459 ; CHECK: # %bb.0:
460 ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
461 ; CHECK-NEXT: #APP
462 ; CHECK-NEXT: nop
463 ; CHECK-NEXT: #NO_APP
464 ; CHECK-NEXT: vblendvps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
465 ; CHECK-NEXT: retq
286466 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
287467 %2 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a1, <4 x float> %c, <4 x float> %a0)
288468 ret <4 x float> %2
290470 declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
291471
292472 define <8 x float> @stack_fold_blendvps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %c) {
293 ;CHECK-LABEL: stack_fold_blendvps_ymm
294 ;CHECK: vblendvps {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
473 ; CHECK-LABEL: stack_fold_blendvps_ymm:
474 ; CHECK: # %bb.0:
475 ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
476 ; CHECK-NEXT: #APP
477 ; CHECK-NEXT: nop
478 ; CHECK-NEXT: #NO_APP
479 ; CHECK-NEXT: vblendvps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
480 ; CHECK-NEXT: retq
295481 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
296482 %2 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a1, <8 x float> %c, <8 x float> %a0)
297483 ret <8 x float> %2
299485 declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
300486
301487 define <2 x double> @stack_fold_cmppd(<2 x double> %a0, <2 x double> %a1) {
302 ;CHECK-LABEL: stack_fold_cmppd
303 ;CHECK: vcmpeqpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
488 ; CHECK-LABEL: stack_fold_cmppd:
489 ; CHECK: # %bb.0:
490 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
491 ; CHECK-NEXT: #APP
492 ; CHECK-NEXT: nop
493 ; CHECK-NEXT: #NO_APP
494 ; CHECK-NEXT: vcmpeqpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
495 ; CHECK-NEXT: retq
304496 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
305497 %2 = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 0)
306498 ret <2 x double> %2
308500 declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8) nounwind readnone
309501
310502 define <4 x double> @stack_fold_cmppd_ymm(<4 x double> %a0, <4 x double> %a1) {
311 ;CHECK-LABEL: stack_fold_cmppd_ymm
312 ;CHECK: vcmpeqpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
503 ; CHECK-LABEL: stack_fold_cmppd_ymm:
504 ; CHECK: # %bb.0:
505 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
506 ; CHECK-NEXT: #APP
507 ; CHECK-NEXT: nop
508 ; CHECK-NEXT: #NO_APP
509 ; CHECK-NEXT: vcmpeqpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
510 ; CHECK-NEXT: retq
313511 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
314512 %2 = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> %a0, <4 x double> %a1, i8 0)
315513 ret <4 x double> %2
317515 declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
318516
319517 define <4 x float> @stack_fold_cmpps(<4 x float> %a0, <4 x float> %a1) {
320 ;CHECK-LABEL: stack_fold_cmpps
321 ;CHECK: vcmpeqps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
518 ; CHECK-LABEL: stack_fold_cmpps:
519 ; CHECK: # %bb.0:
520 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
521 ; CHECK-NEXT: #APP
522 ; CHECK-NEXT: nop
523 ; CHECK-NEXT: #NO_APP
524 ; CHECK-NEXT: vcmpeqps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
525 ; CHECK-NEXT: retq
322526 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
323527 %2 = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 0)
324528 ret <4 x float> %2
326530 declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone
327531
328532 define <8 x float> @stack_fold_cmpps_ymm(<8 x float> %a0, <8 x float> %a1) {
329 ;CHECK-LABEL: stack_fold_cmpps_ymm
330 ;CHECK: vcmpeqps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
533 ; CHECK-LABEL: stack_fold_cmpps_ymm:
534 ; CHECK: # %bb.0:
535 ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
536 ; CHECK-NEXT: #APP
537 ; CHECK-NEXT: nop
538 ; CHECK-NEXT: #NO_APP
539 ; CHECK-NEXT: vcmpeqps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
540 ; CHECK-NEXT: retq
331541 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
332542 %2 = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 0)
333543 ret <8 x float> %2
335545 declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
336546
337547 define i32 @stack_fold_cmpsd(double %a0, double %a1) {
338 ;CHECK-LABEL: stack_fold_cmpsd
339 ;CHECK: vcmpeqsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
548 ; CHECK-LABEL: stack_fold_cmpsd:
549 ; CHECK: # %bb.0:
550 ; CHECK-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
551 ; CHECK-NEXT: #APP
552 ; CHECK-NEXT: nop
553 ; CHECK-NEXT: #NO_APP
554 ; CHECK-NEXT: vcmpeqsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
555 ; CHECK-NEXT: vmovq %xmm0, %rax
556 ; CHECK-NEXT: andl $1, %eax
557 ; CHECK-NEXT: # kill: def $eax killed $eax killed $rax
558 ; CHECK-NEXT: retq
340559 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
341560 %2 = fcmp oeq double %a0, %a1
342561 %3 = zext i1 %2 to i32
344563 }
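; Scalar vcmpeqsd produces an all-ones/all-zeros 64-bit mask; the zext of the
; i1 is recovered with vmovq plus andl $1.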
345564
346565 define <2 x double> @stack_fold_cmpsd_int(<2 x double> %a0, <2 x double> %a1) {
347 ;CHECK-LABEL: stack_fold_cmpsd_int
348 ;CHECK: vcmpeqsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
566 ; CHECK-LABEL: stack_fold_cmpsd_int:
567 ; CHECK: # %bb.0:
568 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
569 ; CHECK-NEXT: #APP
570 ; CHECK-NEXT: nop
571 ; CHECK-NEXT: #NO_APP
572 ; CHECK-NEXT: vcmpeqsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
573 ; CHECK-NEXT: retq
349574 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
350575 %2 = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 0)
351576 ret <2 x double> %2
353578 declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounwind readnone
354579
355580 define i32 @stack_fold_cmpss(float %a0, float %a1) {
356 ;CHECK-LABEL: stack_fold_cmpss
357 ;CHECK: vcmpeqss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
581 ; CHECK-LABEL: stack_fold_cmpss:
582 ; CHECK: # %bb.0:
583 ; CHECK-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
584 ; CHECK-NEXT: #APP
585 ; CHECK-NEXT: nop
586 ; CHECK-NEXT: #NO_APP
587 ; CHECK-NEXT: vcmpeqss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
588 ; CHECK-NEXT: vmovd %xmm0, %eax
589 ; CHECK-NEXT: andl $1, %eax
590 ; CHECK-NEXT: retq
358591 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
359592 %2 = fcmp oeq float %a0, %a1
360593 %3 = zext i1 %2 to i32
362595 }
363596
364597 define <4 x float> @stack_fold_cmpss_int(<4 x float> %a0, <4 x float> %a1) {
365 ;CHECK-LABEL: stack_fold_cmpss_int
366 ;CHECK: vcmpeqss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
598 ; CHECK-LABEL: stack_fold_cmpss_int:
599 ; CHECK: # %bb.0:
600 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
601 ; CHECK-NEXT: #APP
602 ; CHECK-NEXT: nop
603 ; CHECK-NEXT: #NO_APP
604 ; CHECK-NEXT: vcmpeqss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
605 ; CHECK-NEXT: retq
367606 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
368607 %2 = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 0)
369608 ret <4 x float> %2
373612 ; TODO stack_fold_comisd
374613
375614 define i32 @stack_fold_comisd_int(<2 x double> %a0, <2 x double> %a1) {
376 ;CHECK-LABEL: stack_fold_comisd_int
377 ;CHECK: vcomisd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
615 ; CHECK-LABEL: stack_fold_comisd_int:
616 ; CHECK: # %bb.0:
617 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
618 ; CHECK-NEXT: #APP
619 ; CHECK-NEXT: nop
620 ; CHECK-NEXT: #NO_APP
621 ; CHECK-NEXT: vcomisd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
622 ; CHECK-NEXT: setnp %al
623 ; CHECK-NEXT: sete %cl
624 ; CHECK-NEXT: andb %al, %cl
625 ; CHECK-NEXT: movzbl %cl, %eax
626 ; CHECK-NEXT: retq
378627 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
379628 %2 = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %a1)
380629 ret i32 %2
384633 ; TODO stack_fold_comiss
385634
386635 define i32 @stack_fold_comiss_int(<4 x float> %a0, <4 x float> %a1) {
387 ;CHECK-LABEL: stack_fold_comiss_int
388 ;CHECK: vcomiss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
636 ; CHECK-LABEL: stack_fold_comiss_int:
637 ; CHECK: # %bb.0:
638 ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
639 ; CHECK-NEXT: #APP
640 ; CHECK-NEXT: nop
641 ; CHECK-NEXT: #NO_APP
642 ; CHECK-NEXT: vcomiss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
643 ; CHECK-NEXT: setnp %al
644 ; CHECK-NEXT: sete %cl
645 ; CHECK-NEXT: andb %al, %cl
646 ; CHECK-NEXT: movzbl %cl, %eax
647 ; CHECK-NEXT: retq
389648 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
390649 %2 = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1)
391650 ret i32 %2
393652 declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone
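; vcomisd/vcomiss set ZF=PF=CF=1 on unordered inputs, so "ordered equal"
; requires ZF=1 and PF=0, hence the setnp/sete/andb sequence in the checks.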
394653
395654 define <2 x double> @stack_fold_cvtdq2pd(<4 x i32> %a0) {
396 ;CHECK-LABEL: stack_fold_cvtdq2pd
397 ;CHECK: vcvtdq2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
655 ; CHECK-LABEL: stack_fold_cvtdq2pd:
656 ; CHECK: # %bb.0:
657 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
658 ; CHECK-NEXT: #APP
659 ; CHECK-NEXT: nop
660 ; CHECK-NEXT: #NO_APP
661 ; CHECK-NEXT: vcvtdq2pd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
662 ; CHECK-NEXT: retq
398663 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
399664 %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
400665 %3 = sitofp <2 x i32> %2 to <2 x double>
401666 ret <2 x double> %3
402667 }
403668 define <2 x double> @stack_fold_cvtdq2pd_int(<4 x i32> %a0) {
404 ;CHECK-LABEL: stack_fold_cvtdq2pd_int
405 ;CHECK: vcvtdq2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
669 ; CHECK-LABEL: stack_fold_cvtdq2pd_int:
670 ; CHECK: # %bb.0:
671 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
672 ; CHECK-NEXT: #APP
673 ; CHECK-NEXT: nop
674 ; CHECK-NEXT: #NO_APP
675 ; CHECK-NEXT: vcvtdq2pd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
676 ; CHECK-NEXT: retq
406677 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
407678 %2 = shufflevector <4 x i32> %a0, <4 x i32> %a0, <2 x i32> <i32 0, i32 1>
408679 %cvt = sitofp <2 x i32> %2 to <2 x double>
410681 }
411682
412683 define <4 x double> @stack_fold_cvtdq2pd_ymm(<4 x i32> %a0) {
413 ;CHECK-LABEL: stack_fold_cvtdq2pd_ymm
414 ;CHECK: vcvtdq2pd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
684 ; CHECK-LABEL: stack_fold_cvtdq2pd_ymm:
685 ; CHECK: # %bb.0:
686 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
687 ; CHECK-NEXT: #APP
688 ; CHECK-NEXT: nop
689 ; CHECK-NEXT: #NO_APP
690 ; CHECK-NEXT: vcvtdq2pd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
691 ; CHECK-NEXT: retq
415692 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
416693 %2 = sitofp <4 x i32> %a0 to <4 x double>
417694 ret <4 x double> %2
418695 }
419696
420697 define <4 x double> @stack_fold_cvtdq2pd_ymm_int(<4 x i32> %a0) {
421 ;CHECK-LABEL: stack_fold_cvtdq2pd_ymm_int
422 ;CHECK: vcvtdq2pd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
698 ; CHECK-LABEL: stack_fold_cvtdq2pd_ymm_int:
699 ; CHECK: # %bb.0:
700 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
701 ; CHECK-NEXT: #APP
702 ; CHECK-NEXT: nop
703 ; CHECK-NEXT: #NO_APP
704 ; CHECK-NEXT: vcvtdq2pd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
705 ; CHECK-NEXT: retq
423706 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
424707 %cvt = sitofp <4 x i32> %a0 to <4 x double>
425708 ret <4 x double> %cvt
426709 }
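; The ymm cvtdq2pd variants still spill only an xmm: the <4 x i32> source is
; 16 bytes, so the fold is a 16-byte reload feeding a 256-bit result.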
427710
428711 define <4 x float> @stack_fold_cvtdq2ps(<4 x i32> %a0) {
429 ;CHECK-LABEL: stack_fold_cvtdq2ps
430 ;CHECK: vcvtdq2ps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
712 ; CHECK-LABEL: stack_fold_cvtdq2ps:
713 ; CHECK: # %bb.0:
714 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
715 ; CHECK-NEXT: #APP
716 ; CHECK-NEXT: nop
717 ; CHECK-NEXT: #NO_APP
718 ; CHECK-NEXT: vcvtdq2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
719 ; CHECK-NEXT: retq
431720 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
432721 %2 = sitofp <4 x i32> %a0 to <4 x float>
433722 ret <4 x float> %2
434723 }
435724
436725 define <8 x float> @stack_fold_cvtdq2ps_ymm(<8 x i32> %a0) {
437 ;CHECK-LABEL: stack_fold_cvtdq2ps_ymm
438 ;CHECK: vcvtdq2ps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
726 ; CHECK-LABEL: stack_fold_cvtdq2ps_ymm:
727 ; CHECK: # %bb.0:
728 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
729 ; CHECK-NEXT: #APP
730 ; CHECK-NEXT: nop
731 ; CHECK-NEXT: #NO_APP
732 ; CHECK-NEXT: vcvtdq2ps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
733 ; CHECK-NEXT: retq
439734 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
440735 %2 = sitofp <8 x i32> %a0 to <8 x float>
441736 ret <8 x float> %2
442737 }
443738
444739 define <4 x i32> @stack_fold_cvtpd2dq(<2 x double> %a0) {
445 ;CHECK-LABEL: stack_fold_cvtpd2dq
446 ;CHECK: vcvtpd2dqx {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
740 ; CHECK-LABEL: stack_fold_cvtpd2dq:
741 ; CHECK: # %bb.0:
742 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
743 ; CHECK-NEXT: #APP
744 ; CHECK-NEXT: nop
745 ; CHECK-NEXT: #NO_APP
746 ; CHECK-NEXT: vcvtpd2dqx {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
747 ; CHECK-NEXT: retq
447748 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
448749 %2 = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a0)
449750 ret <4 x i32> %2
451752 declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>) nounwind readnone
452753
453754 define <4 x i32> @stack_fold_cvtpd2dq_ymm(<4 x double> %a0) {
454 ;CHECK-LABEL: stack_fold_cvtpd2dq_ymm
455 ;CHECK: vcvtpd2dqy {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
755 ; CHECK-LABEL: stack_fold_cvtpd2dq_ymm:
756 ; CHECK: # %bb.0:
757 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
758 ; CHECK-NEXT: #APP
759 ; CHECK-NEXT: nop
760 ; CHECK-NEXT: #NO_APP
761 ; CHECK-NEXT: vcvtpd2dqy {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Folded Reload
762 ; CHECK-NEXT: vzeroupper
763 ; CHECK-NEXT: retq
456764 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
457765 %2 = call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %a0)
458766 ret <4 x i32> %2
460768 declare <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double>) nounwind readnone
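; With a memory source the narrowing pd2dq/pd2ps mnemonics are ambiguous, so
; the assembler spells out the width: vcvtpd2dqx for m128, vcvtpd2dqy for m256.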
461769
462770 define <2 x float> @stack_fold_cvtpd2ps(<2 x double> %a0) {
463 ;CHECK-LABEL: stack_fold_cvtpd2ps
464 ;CHECK: vcvtpd2psx {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
771 ; CHECK-LABEL: stack_fold_cvtpd2ps:
772 ; CHECK: # %bb.0:
773 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
774 ; CHECK-NEXT: #APP
775 ; CHECK-NEXT: nop
776 ; CHECK-NEXT: #NO_APP
777 ; CHECK-NEXT: vcvtpd2psx {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
778 ; CHECK-NEXT: retq
465779 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
466780 %2 = fptrunc <2 x double> %a0 to <2 x float>
467781 ret <2 x float> %2
468782 }
469783
470784 define <4 x float> @stack_fold_cvtpd2ps_ymm(<4 x double> %a0) {
471 ;CHECK-LABEL: stack_fold_cvtpd2ps_ymm
472 ;CHECK: vcvtpd2psy {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
785 ; CHECK-LABEL: stack_fold_cvtpd2ps_ymm:
786 ; CHECK: # %bb.0:
787 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
788 ; CHECK-NEXT: #APP
789 ; CHECK-NEXT: nop
790 ; CHECK-NEXT: #NO_APP
791 ; CHECK-NEXT: vcvtpd2psy {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Folded Reload
792 ; CHECK-NEXT: vzeroupper
793 ; CHECK-NEXT: retq
473794 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
474795 %2 = fptrunc <4 x double> %a0 to <4 x float>
475796 ret <4 x float> %2
476797 }
477798
478799 define <4 x float> @stack_fold_cvtph2ps(<8 x i16> %a0) {
479 ;CHECK-LABEL: stack_fold_cvtph2ps
480 ;CHECK: vcvtph2ps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
800 ; CHECK-LABEL: stack_fold_cvtph2ps:
801 ; CHECK: # %bb.0:
802 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
803 ; CHECK-NEXT: #APP
804 ; CHECK-NEXT: nop
805 ; CHECK-NEXT: #NO_APP
806 ; CHECK-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
807 ; CHECK-NEXT: retq
481808 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
482809 %2 = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %a0)
483810 ret <4 x float> %2
485812 declare <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16>) nounwind readonly
486813
487814 define <8 x float> @stack_fold_cvtph2ps_ymm(<8 x i16> %a0) {
488 ;CHECK-LABEL: stack_fold_cvtph2ps_ymm
489 ;CHECK: vcvtph2ps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
815 ; CHECK-LABEL: stack_fold_cvtph2ps_ymm:
816 ; CHECK: # %bb.0:
817 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
818 ; CHECK-NEXT: #APP
819 ; CHECK-NEXT: nop
820 ; CHECK-NEXT: #NO_APP
821 ; CHECK-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
822 ; CHECK-NEXT: retq
490823 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
491824 %2 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %a0)
492825 ret <8 x float> %2
494827 declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readonly
495828
496829 define <4 x i32> @stack_fold_cvtps2dq(<4 x float> %a0) {
497 ;CHECK-LABEL: stack_fold_cvtps2dq
498 ;CHECK: vcvtps2dq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
830 ; CHECK-LABEL: stack_fold_cvtps2dq:
831 ; CHECK: # %bb.0:
832 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
833 ; CHECK-NEXT: #APP
834 ; CHECK-NEXT: nop
835 ; CHECK-NEXT: #NO_APP
836 ; CHECK-NEXT: vcvtps2dq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
837 ; CHECK-NEXT: retq
499838 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
500839 %2 = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %a0)
501840 ret <4 x i32> %2
503842 declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>) nounwind readnone
504843
505844 define <8 x i32> @stack_fold_cvtps2dq_ymm(<8 x float> %a0) {
506 ;CHECK-LABEL: stack_fold_cvtps2dq_ymm
507 ;CHECK: vcvtps2dq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
845 ; CHECK-LABEL: stack_fold_cvtps2dq_ymm:
846 ; CHECK: # %bb.0:
847 ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
848 ; CHECK-NEXT: #APP
849 ; CHECK-NEXT: nop
850 ; CHECK-NEXT: #NO_APP
851 ; CHECK-NEXT: vcvtps2dq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
852 ; CHECK-NEXT: retq
508853 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
509854 %2 = call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %a0)
510855 ret <8 x i32> %2
512857 declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>) nounwind readnone
513858
514859 define <2 x double> @stack_fold_cvtps2pd(<4 x float> %a0) {
515 ;CHECK-LABEL: stack_fold_cvtps2pd
516 ;CHECK: vcvtps2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
860 ; CHECK-LABEL: stack_fold_cvtps2pd:
861 ; CHECK: # %bb.0:
862 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
863 ; CHECK-NEXT: #APP
864 ; CHECK-NEXT: nop
865 ; CHECK-NEXT: #NO_APP
866 ; CHECK-NEXT: vcvtps2pd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
867 ; CHECK-NEXT: retq
517868 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
518869 %2 = shufflevector <4 x float> %a0, <4 x float> undef, <2 x i32> <i32 0, i32 1>
519870 %3 = fpext <2 x float> %2 to <2 x double>
521872 }
522873
523874 define <2 x double> @stack_fold_cvtps2pd_int(<4 x float> %a0) {
524 ;CHECK-LABEL: stack_fold_cvtps2pd_int
525 ;CHECK: vcvtps2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
875 ; CHECK-LABEL: stack_fold_cvtps2pd_int:
876 ; CHECK: # %bb.0:
877 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
878 ; CHECK-NEXT: #APP
879 ; CHECK-NEXT: nop
880 ; CHECK-NEXT: #NO_APP
881 ; CHECK-NEXT: vcvtps2pd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
882 ; CHECK-NEXT: retq
526883 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
527884 %2 = shufflevector <4 x float> %a0, <4 x float> %a0, <2 x i32> <i32 0, i32 1>
528885 %cvtps2pd = fpext <2 x float> %2 to <2 x double>
530887 }
531888
532889 define <4 x double> @stack_fold_cvtps2pd_ymm(<4 x float> %a0) {
533 ;CHECK-LABEL: stack_fold_cvtps2pd_ymm
534 ;CHECK: vcvtps2pd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
890 ; CHECK-LABEL: stack_fold_cvtps2pd_ymm:
891 ; CHECK: # %bb.0:
892 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
893 ; CHECK-NEXT: #APP
894 ; CHECK-NEXT: nop
895 ; CHECK-NEXT: #NO_APP
896 ; CHECK-NEXT: vcvtps2pd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
897 ; CHECK-NEXT: retq
535898 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
536899 %2 = fpext <4 x float> %a0 to <4 x double>
537900 ret <4 x double> %2
538901 }
539902
540903 define <4 x double> @stack_fold_cvtps2pd_ymm_int(<4 x float> %a0) {
541 ;CHECK-LABEL: stack_fold_cvtps2pd_ymm_int
542 ;CHECK: vcvtps2pd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
904 ; CHECK-LABEL: stack_fold_cvtps2pd_ymm_int:
905 ; CHECK: # %bb.0:
906 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
907 ; CHECK-NEXT: #APP
908 ; CHECK-NEXT: nop
909 ; CHECK-NEXT: #NO_APP
910 ; CHECK-NEXT: vcvtps2pd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
911 ; CHECK-NEXT: retq
543912 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
544913 %cvtps2pd = fpext <4 x float> %a0 to <4 x double>
545914 ret <4 x double> %cvtps2pd
546915 }
547916
548917 define <8 x i16> @stack_fold_cvtps2ph_ymm(<8 x float> %a0) {
549 ;CHECK-LABEL: stack_fold_cvtps2ph_ymm
550 ;CHECK: vcvtps2ph $0, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
918 ; CHECK-LABEL: stack_fold_cvtps2ph_ymm:
919 ; CHECK: # %bb.0:
920 ; CHECK-NEXT: vcvtps2ph $0, %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
921 ; CHECK-NEXT: #APP
922 ; CHECK-NEXT: nop
923 ; CHECK-NEXT: #NO_APP
924 ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
925 ; CHECK-NEXT: vzeroupper
926 ; CHECK-NEXT: retq
551927 %1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a0, i32 0)
552928 %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
553929 ret <8 x i16> %1
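; cvtps2ph folds in the opposite direction: the conversion's store goes
; straight to the stack slot (Folded Spill) and a plain vmovaps reloads it.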
557933 ; TODO stack_fold_cvtsd2si
558934
559935 define i32 @stack_fold_cvtsd2si_int(<2 x double> %a0) {
560 ;CHECK-LABEL: stack_fold_cvtsd2si_int
561 ;CHECK: vcvtsd2si {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 16-byte Folded Reload
936 ; CHECK-LABEL: stack_fold_cvtsd2si_int:
937 ; CHECK: # %bb.0:
938 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
939 ; CHECK-NEXT: #APP
940 ; CHECK-NEXT: nop
941 ; CHECK-NEXT: #NO_APP
942 ; CHECK-NEXT: vcvtsd2si {{[-0-9]+}}(%r{{[sb]}}p), %eax # 16-byte Folded Reload
943 ; CHECK-NEXT: retq
562944 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
563945 %2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %a0)
564946 ret i32 %2
568950 ; TODO stack_fold_cvtsd2si64
569951
570952 define i64 @stack_fold_cvtsd2si64_int(<2 x double> %a0) {
571 ;CHECK-LABEL: stack_fold_cvtsd2si64_int
572 ;CHECK: vcvtsd2si {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 16-byte Folded Reload
953 ; CHECK-LABEL: stack_fold_cvtsd2si64_int:
954 ; CHECK: # %bb.0:
955 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
956 ; CHECK-NEXT: #APP
957 ; CHECK-NEXT: nop
958 ; CHECK-NEXT: #NO_APP
959 ; CHECK-NEXT: vcvtsd2si {{[-0-9]+}}(%r{{[sb]}}p), %rax # 16-byte Folded Reload
960 ; CHECK-NEXT: retq
573961 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
574962 %2 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %a0)
575963 ret i64 %2
577965 declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone
578966
579967 define double @stack_fold_cvtsi2sd(i32 %a0) {
580 ;CHECK-LABEL: stack_fold_cvtsi2sd
581 ;CHECK: vcvtsi2sdl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
968 ; CHECK-LABEL: stack_fold_cvtsi2sd:
969 ; CHECK: # %bb.0:
970 ; CHECK-NEXT: pushq %rbp
971 ; CHECK-NEXT: .cfi_def_cfa_offset 16
972 ; CHECK-NEXT: pushq %r15
973 ; CHECK-NEXT: .cfi_def_cfa_offset 24
974 ; CHECK-NEXT: pushq %r14
975 ; CHECK-NEXT: .cfi_def_cfa_offset 32
976 ; CHECK-NEXT: pushq %r13
977 ; CHECK-NEXT: .cfi_def_cfa_offset 40
978 ; CHECK-NEXT: pushq %r12
979 ; CHECK-NEXT: .cfi_def_cfa_offset 48
980 ; CHECK-NEXT: pushq %rbx
981 ; CHECK-NEXT: .cfi_def_cfa_offset 56
982 ; CHECK-NEXT: .cfi_offset %rbx, -56
983 ; CHECK-NEXT: .cfi_offset %r12, -48
984 ; CHECK-NEXT: .cfi_offset %r13, -40
985 ; CHECK-NEXT: .cfi_offset %r14, -32
986 ; CHECK-NEXT: .cfi_offset %r15, -24
987 ; CHECK-NEXT: .cfi_offset %rbp, -16
988 ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
989 ; CHECK-NEXT: #APP
990 ; CHECK-NEXT: nop
991 ; CHECK-NEXT: #NO_APP
992 ; CHECK-NEXT: vcvtsi2sdl {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
993 ; CHECK-NEXT: popq %rbx
994 ; CHECK-NEXT: .cfi_def_cfa_offset 48
995 ; CHECK-NEXT: popq %r12
996 ; CHECK-NEXT: .cfi_def_cfa_offset 40
997 ; CHECK-NEXT: popq %r13
998 ; CHECK-NEXT: .cfi_def_cfa_offset 32
999 ; CHECK-NEXT: popq %r14
1000 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1001 ; CHECK-NEXT: popq %r15
1002 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1003 ; CHECK-NEXT: popq %rbp
1004 ; CHECK-NEXT: .cfi_def_cfa_offset 8
1005 ; CHECK-NEXT: retq
5821006 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
5831007 %2 = sitofp i32 %a0 to double
5841008 ret double %2
5851009 }
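; The GPR-to-FP tests clobber rax-r15 instead of the xmm registers, forcing
; the i32 argument into a 4-byte stack slot; the push/pop pairs and .cfi
; directives come from saving the callee-saved GPRs named in the clobber list.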
5861010
5871011 define <2 x double> @stack_fold_cvtsi2sd_int(i32 %a0) {
588 ;CHECK-LABEL: stack_fold_cvtsi2sd_int
589 ;CHECK: vcvtsi2sdl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
1012 ; CHECK-LABEL: stack_fold_cvtsi2sd_int:
1013 ; CHECK: # %bb.0:
1014 ; CHECK-NEXT: pushq %rbp
1015 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1016 ; CHECK-NEXT: pushq %r15
1017 ; CHECK-NEXT: .cfi_def_cfa_offset 24
1018 ; CHECK-NEXT: pushq %r14
1019 ; CHECK-NEXT: .cfi_def_cfa_offset 32
1020 ; CHECK-NEXT: pushq %r13
1021 ; CHECK-NEXT: .cfi_def_cfa_offset 40
1022 ; CHECK-NEXT: pushq %r12
1023 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1024 ; CHECK-NEXT: pushq %rbx
1025 ; CHECK-NEXT: .cfi_def_cfa_offset 56
1026 ; CHECK-NEXT: .cfi_offset %rbx, -56
1027 ; CHECK-NEXT: .cfi_offset %r12, -48
1028 ; CHECK-NEXT: .cfi_offset %r13, -40
1029 ; CHECK-NEXT: .cfi_offset %r14, -32
1030 ; CHECK-NEXT: .cfi_offset %r15, -24
1031 ; CHECK-NEXT: .cfi_offset %rbp, -16
1032 ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1033 ; CHECK-NEXT: #APP
1034 ; CHECK-NEXT: nop
1035 ; CHECK-NEXT: #NO_APP
1036 ; CHECK-NEXT: vcvtsi2sdl {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
1037 ; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
1038 ; CHECK-NEXT: popq %rbx
1039 ; CHECK-NEXT: .cfi_def_cfa_offset 48
1040 ; CHECK-NEXT: popq %r12