llvm.org GIT mirror llvm / 154874a
[DAGCombiner] If a TokenFactor would be merged into its user, consider the user later. Summary: A number of optimizations are inhibited by single-use TokenFactors not being merged into the TokenFactor using it. This makes us consider if we can do the merge immediately. Most tests changes here are due to the change in visitation causing minor reorderings and associated reassociation of paired memory operations. CodeGen tests with non-reordering changes: X86/aligned-variadic.ll -- memory-based add folded into stored leaq value. X86/constant-combiners.ll -- Optimizes out overlap between stores. X86/pr40631_deadstore_elision -- folds constant byte store into preceding quad word constant store. Reviewers: RKSimon, craig.topper, spatel, efriedma, courbet Reviewed By: courbet Subscribers: dylanmckay, sdardis, nemanjai, jvesely, nhaehnle, javed.absar, eraman, hiraditya, kbarton, jrtc27, atanasyan, jsji, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D59260 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@356068 91177308-0d34-0410-b5e6-96231b3b80d8 Nirav Dave 11 months ago
52 changed file(s) with 498 addition(s) and 494 deletion(s). Raw diff Collapse all Expand all
17081708 if (OptLevel == CodeGenOpt::None)
17091709 return SDValue();
17101710
1711 // If this is used only by a single token factor, we should make sure we have a
1712 // chance to merge them together. This prevents TF chains from inhibiting
1713 // optimizations.
1714 if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::TokenFactor)
1715 AddToWorklist(*(N->use_begin()));
1716
17111717 SmallVector TFs; // List of token factors to visit.
17121718 SmallVector Ops; // Ops for replacing token factor.
17131719 SmallPtrSet SeenOps;
44 ; CHECK: str x30, [sp, #-80]!
55 ; CHECK: add x8, sp, #24
66 ; CHECK: add x0, sp, #24
7 ; CHECK: stp x6, x7, [sp, #64]
8 ; CHECK: stp x4, x5, [sp, #48]
9 ; CHECK: stp x2, x3, [sp, #32]
10 ; CHECK: str x1, [sp, #24]
7 ; CHECK: stp x1, x2, [sp, #24]
8 ; CHECK: stp x3, x4, [sp, #40]
9 ; CHECK: stp x5, x6, [sp, #56]
10 ; CHECK: str x7, [sp, #72]
1111 ; CHECK: str x8, [sp, #8]
1212 ; CHECK: bl other_func
1313 ; CHECK: ldr x30, [sp], #80
4343 ; CHECK: sub sp, sp, #96
4444 ; CHECK: stp x29, x30, [sp, #16]
4545 ; CHECK: add x29, sp, #16
46 ; CHECK: str x1, [x29, #24]
46 ; CHECK: stp x1, x2, [x29, #24]
4747 ; CHECK: add x1, x29, #8
4848 ; CHECK: ldp x29, x30, [sp, #16]
4949 ; CHECK: add sp, sp, #96
7777 ; CHECK: stp x29, x30, [sp, #-16]!
7878 ; CHECK: mov x29, sp
7979 ; CHECK: sub sp, sp, #192
80 ; CHECK: stp q6, q7, [x29, #-96]
80 ; CHECK-DAG: stp q6, q7, [x29, #-96]
8181 ; [...]
82 ; CHECK: stp q0, q1, [x29, #-192]
82 ; CHECK-DAG: stp q0, q1, [x29, #-192]
8383
84 ; CHECK: stp x6, x7, [x29, #-16]
84 ; CHECK-DAG: stp x5, x6, [x29, #-24]
8585 ; [...]
86 ; CHECK: stp x2, x3, [x29, #-48]
86 ; CHECK-DAG: stp x1, x2, [x29, #-56]
8787
8888 ; CHECK-NOFP-ARM64: stp x29, x30, [sp, #-16]!
8989 ; CHECK-NOFP-ARM64: mov x29, sp
9090 ; CHECK-NOFP-ARM64: sub sp, sp, #64
91 ; CHECK-NOFP-ARM64: stp x6, x7, [x29, #-16]
91 ; CHECK-NOFP-ARM64-DAG: stp x5, x6, [x29, #-24]
9292 ; [...]
93 ; CHECK-NOFP-ARM64: stp x4, x5, [x29, #-32]
93 ; CHECK-NOFP-ARM64-DAG: stp x3, x4, [x29, #-40]
9494 ; [...]
95 ; CHECK-NOFP-ARM64: stp x2, x3, [x29, #-48]
95 ; CHECK-NOFP-ARM64-DAG: stp x1, x2, [x29, #-56]
9696 ; [...]
9797 ; CHECK-NOFP-ARM64: mov x8, sp
9898
1515 define i32 @t0() {
1616 entry:
1717 ; CHECK-LABEL: t0:
18 ; CHECK: ldur [[REG0:w[0-9]+]], [x[[BASEREG:[0-9]+]], #7]
19 ; CHECK: stur [[REG0]], [x[[BASEREG2:[0-9]+]], #7]
20 ; CHECK: ldr [[REG2:x[0-9]+]],
21 ; CHECK: str [[REG2]],
18 ; CHECK-DAG: ldur [[REG0:w[0-9]+]], [x[[BASEREG:[0-9]+]], #7]
19 ; CHECK-DAG: stur [[REG0]], [x[[BASEREG2:[0-9]+]], #7]
20 ; CHECK-DAG: ldr [[REG2:x[0-9]+]],
21 ; CHECK-DAG: str [[REG2]],
2222 call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 getelementptr inbounds (%struct.x, %struct.x* @dst, i32 0, i32 0), i8* align 8 getelementptr inbounds (%struct.x, %struct.x* @src, i32 0, i32 0), i32 11, i1 false)
2323 ret i32 0
2424 }
8484 define void @t6() nounwind {
8585 entry:
8686 ; CHECK-LABEL: t6:
87 ; CHECK: ldur [[REG9:x[0-9]+]], [x{{[0-9]+}}, #6]
88 ; CHECK: stur [[REG9]], [x{{[0-9]+}}, #6]
89 ; CHECK: ldr
90 ; CHECK: str
87 ; CHECK-DAG: ldur [[REG9:x[0-9]+]], [x{{[0-9]+}}, #6]
88 ; CHECK-DAG: stur [[REG9]], [x{{[0-9]+}}, #6]
89 ; CHECK-DAG: ldr
90 ; CHECK-DAG: str
9191 call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds ([512 x i8], [512 x i8]* @spool.splbuf, i64 0, i64 0), i8* getelementptr inbounds ([14 x i8], [14 x i8]* @.str6, i64 0, i64 0), i64 14, i1 false)
9292 ret void
9393 }
1313 ; CHECK: adrp x[[VA_LIST_HI:[0-9]+]], var
1414 ; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, :lo12:var
1515
16 ; CHECK: stp x1, x2, [sp, #[[GR_BASE:[0-9]+]]]
16 ; CHECK-DAG: stp x6, x7, [sp, #
1717 ; ... omit middle ones ...
18 ; CHECK: str x7, [sp, #
18 ; CHECK-DAG: str x1, [sp, #[[GR_BASE:[0-9]+]]]
1919
20 ; CHECK: stp q0, q1, [sp]
20 ; CHECK-DAG: stp q0, q1, [sp]
2121 ; ... omit middle ones ...
22 ; CHECK: stp q6, q7, [sp, #
22 ; CHECK-DAG: stp q6, q7, [sp, #
2323
2424 ; CHECK: str [[STACK_TOP]], [x[[VA_LIST]]]
2525
4949 ; CHECK: adrp x[[VA_LIST_HI:[0-9]+]], var
5050 ; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, :lo12:var
5151
52 ; CHECK: stp x3, x4, [sp, #[[GR_BASE:[0-9]+]]]
52 ; CHECK-DAG: stp x6, x7, [sp, #
5353 ; ... omit middle ones ...
54 ; CHECK: str x7, [sp, #
54 ; CHECK-DAG: str x3, [sp, #[[GR_BASE:[0-9]+]]]
5555
56 ; CHECK: stp q1, q2, [sp]
56 ; CHECK-DAG: stp q6, q7, [sp, #80]
5757 ; ... omit middle ones ...
58 ; CHECK: str q7, [sp, #
58 ; CHECK-DAG: str q1, [sp]
5959
6060 ; CHECK: str [[STACK_TOP]], [x[[VA_LIST]]]
6161
9494 ; __stack field should point just past them.
9595 define void @test_offsetstack([8 x i64], [2 x i64], [3 x float], ...) {
9696 ; CHECK-LABEL: test_offsetstack:
97 ; CHECK: stp {{q[0-9]+}}, {{q[0-9]+}}, [sp, #-80]!
98 ; CHECK: add [[STACK_TOP:x[0-9]+]], sp, #96
99 ; CHECK: add x[[VAR:[0-9]+]], {{x[0-9]+}}, :lo12:var
100 ; CHECK: str [[STACK_TOP]], [x[[VAR]]]
97
98 ; CHECK-DAG: stp {{q[0-9]+}}, {{q[0-9]+}}, [sp, #48]
99 ; CHECK-DAG: stp {{q[0-9]+}}, {{q[0-9]+}}, [sp, #16]
100 ; CHECK-DAG: str {{q[0-9]+}}, [sp]
101 ; CHECK-DAG: add [[STACK_TOP:x[0-9]+]], sp, #96
102 ; CHECK-DAG: add x[[VAR:[0-9]+]], {{x[0-9]+}}, :lo12:var
103 ; CHECK-DAG: str [[STACK_TOP]], [x[[VAR]]]
101104
102105 %addr = bitcast %va_list* @var to i8*
103106 call void @llvm.va_start(i8* %addr)
44 ; CHECK: str x30, [sp, #-80]!
55 ; CHECK: add x8, sp, #24
66 ; CHECK: add x0, sp, #24
7 ; CHECK: stp x6, x7, [sp, #64]
8 ; CHECK: stp x4, x5, [sp, #48]
9 ; CHECK: stp x2, x3, [sp, #32]
10 ; CHECK: str x1, [sp, #24]
7 ; CHECK: stp x1, x2, [sp, #24]
8 ; CHECK: stp x3, x4, [sp, #40]
9 ; CHECK: stp x5, x6, [sp, #56]
10 ; CHECK: str x7, [sp, #72]
1111 ; CHECK: str x8, [sp, #8]
1212 ; CHECK: bl other_func
1313 ; CHECK: ldr x30, [sp], #80
7777 ; CHECK-LABEL: copy1:
7878 ; CHECK: sub sp, sp, #80
7979 ; CHECK: add x8, sp, #24
80 ; CHECK: stp x6, x7, [sp, #64]
81 ; CHECK: stp x4, x5, [sp, #48]
82 ; CHECK: stp x2, x3, [sp, #32]
83 ; CHECK: str x1, [sp, #24]
80 ; CHECK: stp x1, x2, [sp, #24]
81 ; CHECK: stp x3, x4, [sp, #40]
82 ; CHECK: stp x5, x6, [sp, #56]
83 ; CHECK: str x7, [sp, #72]
8484 ; CHECK: stp x8, x8, [sp], #80
8585 ; CHECK: ret
8686 define void @copy1(i64 %a0, ...) nounwind {
110110 ; CHECK: mov x19, x2
111111 ; CHECK: mov x20, x1
112112 ; CHECK: mov x21, x0
113 ; CHECK: stp x6, x7, [x29, #48]
114 ; CHECK: stp x4, x5, [x29, #32]
115 ; CHECK: str x3, [x29, #24]
113 ; CHECK: stp x3, x4, [x29, #24]
114 ; CHECK: stp x5, x6, [x29, #40]
115 ; CHECK: str x7, [x29, #56]
116116 ; CHECK: str x8, [sp, #8]
117117 ; CHECK: bl __local_stdio_printf_options
118118 ; CHECK: ldr x8, [x0]
161161 ; CHECK: lsr x15, x8, #4
162162 ; CHECK: mov x19, x1
163163 ; CHECK: mov [[REG2:x[0-9]+]], sp
164 ; CHECK: stp x2, x3, [x29, #16]
165 ; CHECK: stp x4, x5, [x29, #32]
164166 ; CHECK: stp x6, x7, [x29, #48]
165 ; CHECK: stp x4, x5, [x29, #32]
166 ; CHECK: stp x2, x3, [x29, #16]
167167 ; CHECK: bl __chkstk
168168 ; CHECK: mov x8, sp
169169 ; CHECK: sub [[REG:x[0-9]+]], x8, x15, lsl #4
218218 ; CHECK-DAG: mov x19, x2
219219 ; CHECK-DAG: mov x20, x1
220220 ; CHECK-DAG: mov x21, x0
221 ; CHECK-DAG: stp x6, x7, [sp, #80]
222 ; CHECK-DAG: stp x4, x5, [sp, #64]
223 ; CHECK-DAG: str x3, [sp, #56]
221 ; CHECK-DAG: stp x3, x4, [sp, #56]
222 ; CHECK-DAG: stp x5, x6, [sp, #72]
223 ; CHECK-DAG: str x7, [sp, #88]
224224 ; CHECK-DAG: str x8, [sp, #8]
225225 ; CHECK-DAG: bl __local_stdio_printf_options
226226 ; CHECK-DAG: ldr x8, [x0]
751751 ; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:12 ; 4-byte Folded Spill
752752 ; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4
753753 ; GCN: buffer_load_dword v33, off, s[0:3], s5 offset:8
754 ; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4
754755 ; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:8
755 ; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4
756756 ; GCN: s_getpc_b64
757757 ; GCN: buffer_load_dword v33, off, s[0:3], s5 offset:12 ; 4-byte Folded Reload
758758 ; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:16 ; 4-byte Folded Reload
4141
4242 ; CHECK-LABEL: test_byval_8_bytes_alignment_fixed_arg:
4343 ; CHECK-NOT: str r1
44 ; CHECK: str r3, [sp, #12]
45 ; CHECK: str r2, [sp, #8]
44 ; CHECK-DAG: str r3, [sp, #12]
45 ; CHECK-DAG: str r2, [sp, #8]
4646 ; CHECK-NOT: str r1
4747 define void @test_byval_8_bytes_alignment_fixed_arg(i32 %n1, %struct_t* byval %val) nounwind {
4848 entry:
66 declare i32 @printf(i8*, ...)
77
88 ; CHECK-LABEL: test_byval_usage_scheduling:
9 ; CHECK: str r3, [sp, #12]
10 ; CHECK: str r2, [sp, #8]
9 ; CHECK-DAG: str r3, [sp, #12]
10 ; CHECK-DAG: str r2, [sp, #8]
1111 ; CHECK: vldr d16, [sp, #8]
1212 define void @test_byval_usage_scheduling(i32 %n1, i32 %n2, %struct_t* byval %val) nounwind {
1313 entry:
3434 ; CHECK: sub sp, sp, #8
3535 ; CHECK: push {r11, lr}
3636 ; CHECK: add r0, sp, #8
37 ; CHECK: str r3, [sp, #12]
38 ; CHECK: str r2, [sp, #8]
37 ; CHECK-DAG: str r3, [sp, #12]
38 ; CHECK-DAG: str r2, [sp, #8]
3939 ; CHECK: bl usePtr
4040 ; CHECK: pop {r11, lr}
4141 ; CHECK: add sp, sp, #8
6969 ; CHECK: push {r11, lr}
7070 ; CHECK: str r0, [sp, #8]
7171 ; CHECK: add r0, sp, #16
72 ; CHECK: str r3, [sp, #20]
73 ; CHECK: str r2, [sp, #16]
72 ; CHECK-DAG: str r3, [sp, #20]
73 ; CHECK-DAG: str r2, [sp, #16]
7474 ; CHECK: bl usePtr
7575 ; CHECK: pop {r11, lr}
7676 ; CHECK: add sp, sp, #16
2424 ; CHECK-7A: vst1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r2]
2525 ; CHECK-6M-LABEL: t2:
2626 ; CHECK-6M: movs [[REG:r[0-9]+]], #0
27 ; CHECK-6M: str [[REG]], [sp, #20]
28 ; CHECK-6M: str [[REG]], [sp, #16]
29 ; CHECK-6M: str [[REG]], [sp, #12]
30 ; CHECK-6M: str [[REG]], [sp, #8]
31 ; CHECK-6M: str [[REG]], [sp, #4]
32 ; CHECK-6M: str [[REG]], [sp]
27 ; CHECK-6M-DAG: str [[REG]], [sp, #20]
28 ; CHECK-6M-DAG: str [[REG]], [sp, #16]
29 ; CHECK-6M-DAG: str [[REG]], [sp, #12]
30 ; CHECK-6M-DAG: str [[REG]], [sp, #8]
31 ; CHECK-6M-DAG: str [[REG]], [sp, #4]
32 ; CHECK-6M-DAG: str [[REG]], [sp]
3333 %buf = alloca [26 x i8], align 1
3434 %0 = getelementptr inbounds [26 x i8], [26 x i8]* %buf, i32 0, i32 0
3535 call void @llvm.memset.p0i8.i32(i8* %0, i8 0, i32 26, i1 false)
5656
5757 ; Epilogue
5858 ; --------
59 ; CHECK-V4T: ldr [[POP:r[4567]]], [sp, #12]
59 ; CHECK-V4T: ldr [[POP:r[4567]]], [sp, #16]
6060 ; CHECK-V4T-NEXT: mov lr, [[POP]]
6161 ; CHECK-V4T-NEXT: pop {[[SAVED]]}
6262 ; CHECK-V4T-NEXT: add sp, #16
6363 ; CHECK-V4T-NEXT: bx lr
6464 ; CHECK-V5T: lsls r4
6565 ; CHECK-V5T-NEXT: mov sp, r4
66 ; CHECK-V5T: ldr [[POP:r[4567]]], [sp, #12]
66 ; CHECK-V5T: ldr [[POP:r[4567]]], [sp, #16]
6767 ; CHECK-V5T-NEXT: mov lr, [[POP]]
6868 ; CHECK-V5T-NEXT: pop {[[SAVED]]}
6969 ; CHECK-V5T-NEXT: add sp, #16
1212 define void @t(i8* nocapture %a, i8* nocapture %b) nounwind {
1313 entry:
1414 ; EXPANDED-LABEL: t:
15 ; EXPANDED: ldrb [[R2:r[0-9]+]]
16 ; EXPANDED: ldrb [[R3:r[0-9]+]]
17 ; EXPANDED: ldrb [[R12:r[0-9]+]]
18 ; EXPANDED: ldrb [[R1:r[0-9]+]]
19 ; EXPANDED: strb [[R1]]
20 ; EXPANDED: strb [[R12]]
21 ; EXPANDED: strb [[R3]]
22 ; EXPANDED: strb [[R2]]
15 ; EXPANDED-DAG: ldrb [[R2:r[0-9]+]]
16 ; EXPANDED-DAG: ldrb [[R3:r[0-9]+]]
17 ; EXPANDED-DAG: ldrb [[R12:r[0-9]+]]
18 ; EXPANDED-DAG: ldrb [[R1:r[0-9]+]]
19 ; EXPANDED-DAG: strb [[R1]]
20 ; EXPANDED-DAG: strb [[R12]]
21 ; EXPANDED-DAG: strb [[R3]]
22 ; EXPANDED-DAG: strb [[R2]]
2323
2424 ; UNALIGNED-LABEL: t:
2525 ; UNALIGNED: ldr r1
6565
6666 ; CHECK-LABEL: ret_void_args_i64_i64
6767 define void @ret_void_args_i64_i64(i64 %a, i64 %b) {
68 ; CHECK: sts 11, r25
69 ; CHECK-NEXT: sts 10, r24
70 ; CHECK-NEXT: sts 9, r23
71 ; CHECK-NEXT: sts 8, r22
72 ; CHECK-NEXT: sts 7, r21
73 ; CHECK-NEXT: sts 6, r20
74 ; CHECK-NEXT: sts 5, r19
75 ; CHECK-NEXT: sts 4, r18
68 ; CHECK-DAG: sts 11, r25
69 ; CHECK-DAG: sts 10, r24
70 ; CHECK-DAG: sts 9, r23
71 ; CHECK-DAG: sts 8, r22
72 ; CHECK-DAG: sts 7, r21
73 ; CHECK-DAG: sts 6, r20
74 ; CHECK-DAG: sts 5, r19
75 ; CHECK-DAG: sts 4, r18
7676 store volatile i64 %a, i64* inttoptr (i64 4 to i64*)
7777
78 ; CHECK-NEXT: sts 11, r17
79 ; CHECK-NEXT: sts 10, r16
80 ; CHECK-NEXT: sts 9, r15
81 ; CHECK-NEXT: sts 8, r14
82 ; CHECK-NEXT: sts 7, r13
83 ; CHECK-NEXT: sts 6, r12
84 ; CHECK-NEXT: sts 5, r11
85 ; CHECK-NEXT: sts 4, r10
78 ; CHECK-DAG: sts 11, r17
79 ; CHECK-DAG: sts 10, r16
80 ; CHECK-DAG: sts 9, r15
81 ; CHECK-DAG: sts 8, r14
82 ; CHECK-DAG: sts 7, r13
83 ; CHECK-DAG: sts 6, r12
84 ; CHECK-DAG: sts 5, r11
85 ; CHECK-DAG: sts 4, r10
8686 store volatile i64 %b, i64* inttoptr (i64 4 to i64*)
8787 ret void
8888 }
206206 ; CHECK: sbci r23, 255
207207 ; CHECK: sbci r24, 255
208208 ; CHECK: sbci r25, 255
209 ; CHECK: sts long.static+3, r25
210 ; CHECK: sts long.static+2, r24
211 ; CHECK: sts long.static+1, r23
212 ; CHECK: sts long.static, r22
209 ; CHECK-DAG: sts long.static+3, r25
210 ; CHECK-DAG: sts long.static+2, r24
211 ; CHECK-DAG: sts long.static+1, r23
212 ; CHECK-DAG: sts long.static, r22
213213 %1 = load i32, i32* @long.static
214214 %inc = add nsw i32 %1, 1
215215 store i32 %inc, i32* @long.static
308308 ; CHECK: sbci r23, 255
309309 ; CHECK: sbci r24, 255
310310 ; CHECK: sbci r25, 255
311 ; CHECK: sts longlong.static+7, r25
312 ; CHECK: sts longlong.static+6, r24
313 ; CHECK: sts longlong.static+5, r23
314 ; CHECK: sts longlong.static+4, r22
315 ; CHECK: sts longlong.static+3, r21
316 ; CHECK: sts longlong.static+2, r20
317 ; CHECK: sts longlong.static+1, r19
318 ; CHECK: sts longlong.static, r18
311 ; CHECK-DAG: sts longlong.static+7, r25
312 ; CHECK-DAG: sts longlong.static+6, r24
313 ; CHECK-DAG: sts longlong.static+5, r23
314 ; CHECK-DAG: sts longlong.static+4, r22
315 ; CHECK-DAG: sts longlong.static+3, r21
316 ; CHECK-DAG: sts longlong.static+2, r20
317 ; CHECK-DAG: sts longlong.static+1, r19
318 ; CHECK-DAG: sts longlong.static, r18
319319 %1 = load i64, i64* @longlong.static
320320 %inc = add nsw i64 %1, 1
321321 store i64 %inc, i64* @longlong.static
1919 ; CHECK: *(u64 *)(r10 - 8) = r1
2020
2121 ; CHECK: r1 = 0
22 ; CHECK: *(u16 *)(r10 + 24) = r1
23 ; CHECK: *(u16 *)(r10 + 22) = r1
24 ; CHECK: *(u16 *)(r10 + 20) = r1
25 ; CHECK: *(u16 *)(r10 + 18) = r1
26 ; CHECK: *(u16 *)(r10 + 16) = r1
27 ; CHECK: *(u16 *)(r10 + 14) = r1
28 ; CHECK: *(u16 *)(r10 + 12) = r1
29 ; CHECK: *(u16 *)(r10 + 10) = r1
30 ; CHECK: *(u16 *)(r10 + 8) = r1
31 ; CHECK: *(u16 *)(r10 + 6) = r1
32 ; CHECK: *(u16 *)(r10 + 4) = r1
33 ; CHECK: *(u16 *)(r10 + 2) = r1
34 ; CHECK: *(u16 *)(r10 + 0) = r1
35 ; CHECK: *(u16 *)(r10 + 26) = r1
22 ; CHECK-DAG: *(u16 *)(r10 + 24) = r1
23 ; CHECK-DAG: *(u16 *)(r10 + 22) = r1
24 ; CHECK-DAG: *(u16 *)(r10 + 20) = r1
25 ; CHECK-DAG: *(u16 *)(r10 + 18) = r1
26 ; CHECK-DAG: *(u16 *)(r10 + 16) = r1
27 ; CHECK-DAG: *(u16 *)(r10 + 14) = r1
28 ; CHECK-DAG: *(u16 *)(r10 + 12) = r1
29 ; CHECK-DAG: *(u16 *)(r10 + 10) = r1
30 ; CHECK-DAG: *(u16 *)(r10 + 8) = r1
31 ; CHECK-DAG: *(u16 *)(r10 + 6) = r1
32 ; CHECK-DAG: *(u16 *)(r10 + 4) = r1
33 ; CHECK-DAG: *(u16 *)(r10 + 2) = r1
34 ; CHECK-DAG: *(u16 *)(r10 + 0) = r1
35 ; CHECK-DAG: *(u16 *)(r10 + 26) = r1
3636
3737 ; CHECK: r2 = r10
3838 ; CHECK: r2 += -8
165165 ; CHECK: mov r13, &g_i64+2
166166 ; CHECK: mov r12, &g_i64
167167 store volatile i64 %a, i64* @g_i64, align 2
168 ; CHECK: mov 10(r4), &g_i64+6
169 ; CHECK: mov 8(r4), &g_i64+4
170 ; CHECK: mov 6(r4), &g_i64+2
171 ; CHECK: mov 4(r4), &g_i64
168 ; CHECK-DAG: mov 10(r4), &g_i64+6
169 ; CHECK-DAG: mov 8(r4), &g_i64+4
170 ; CHECK-DAG: mov 6(r4), &g_i64+2
171 ; CHECK-DAG: mov 4(r4), &g_i64
172172 store volatile i64 %b, i64* @g_i64, align 2
173173 ret void
174174 }
1414 ; CHECK-NEXT: .cfi_def_cfa_register 30
1515 ; CHECK-NEXT: addiu $1, $zero, -16
1616 ; CHECK-NEXT: and $sp, $sp, $1
17 ; CHECK-NEXT: lw $1, 8($4)
18 ; CHECK-NEXT: lw $2, 4($4)
19 ; CHECK-NEXT: lw $3, 12($4)
20 ; CHECK-NEXT: sw $3, 12($sp)
21 ; CHECK-NEXT: sw $1, 8($sp)
22 ; CHECK-NEXT: sw $2, 4($sp)
23 ; CHECK-NEXT: lw $1, 0($4)
24 ; CHECK-NEXT: sw $1, 0($sp)
25 ; CHECK-NEXT: mtc1 $1, $f0
17 ; CHECK-NEXT: lw $1, 12($4)
18 ; CHECK-NEXT: lw $2, 0($4)
19 ; CHECK-NEXT: lw $3, 8($4)
20 ; CHECK-NEXT: sw $3, 8($sp)
21 ; CHECK-NEXT: sw $1, 12($sp)
22 ; CHECK-NEXT: sw $2, 0($sp)
23 ; CHECK-NEXT: lw $1, 4($4)
24 ; CHECK-NEXT: sw $1, 4($sp)
25 ; CHECK-NEXT: mtc1 $2, $f0
2626 ; CHECK-NEXT: move $sp, $fp
2727 ; CHECK-NEXT: lw $fp, 28($sp) # 4-byte Folded Reload
2828 ; CHECK-NEXT: jr $ra
2929 ; CHECK-NEXT: addiu $sp, $sp, 32
30 ; CHECK-NEXT: .set at
31 ; CHECK-NEXT: .set macro
32 ; CHECK-NEXT: .set reorder
33 ; CHECK-NEXT: .end f
3430 entry:
3531 %m = alloca <8 x i16>
3632 %0 = load <8 x i16>, <8 x i16>* %a
8181 align 16 %a) {
8282 ; CHECK-LABEL: testStruct_03:
8383 ; CHECK: # %bb.0: # %entry
84 ; CHECK-NEXT: std r10, 88(r1)
85 ; CHECK-NEXT: std r9, 80(r1)
86 ; CHECK-NEXT: std r8, 72(r1)
87 ; CHECK-NEXT: std r7, 64(r1)
88 ; CHECK-NEXT: std r6, 56(r1)
89 ; CHECK-NEXT: std r5, 48(r1)
90 ; CHECK-NEXT: std r4, 40(r1)
91 ; CHECK-NEXT: std r3, 32(r1)
84 ; CHECK-DAG: std r10, 88(r1)
85 ; CHECK-DAG: std r9, 80(r1)
86 ; CHECK-DAG: std r8, 72(r1)
87 ; CHECK-DAG: std r7, 64(r1)
88 ; CHECK-DAG: std r6, 56(r1)
89 ; CHECK-DAG: std r5, 48(r1)
90 ; CHECK-DAG: std r4, 40(r1)
91 ; CHECK-DAG: std r3, 32(r1)
9292 ; CHECK-NEXT: lxv v2, 128(r1)
9393 ; CHECK-NEXT: blr
9494
9595 ; CHECK-BE-LABEL: testStruct_03:
9696 ; CHECK-BE: # %bb.0: # %entry
97 ; CHECK-BE-NEXT: std r10, 104(r1)
98 ; CHECK-BE-NEXT: std r9, 96(r1)
99 ; CHECK-BE-NEXT: std r8, 88(r1)
100 ; CHECK-BE-NEXT: std r7, 80(r1)
101 ; CHECK-BE-NEXT: std r6, 72(r1)
102 ; CHECK-BE-NEXT: std r5, 64(r1)
103 ; CHECK-BE-NEXT: std r4, 56(r1)
104 ; CHECK-BE-NEXT: std r3, 48(r1)
97 ; CHECK-BE-DAG: std r10, 104(r1)
98 ; CHECK-BE-DAG: std r9, 96(r1)
99 ; CHECK-BE-DAG: std r8, 88(r1)
100 ; CHECK-BE-DAG: std r7, 80(r1)
101 ; CHECK-BE-DAG: std r6, 72(r1)
102 ; CHECK-BE-DAG: std r5, 64(r1)
103 ; CHECK-BE-DAG: std r4, 56(r1)
104 ; CHECK-BE-DAG: std r3, 48(r1)
105105 ; CHECK-BE-NEXT: lxv v2, 144(r1)
106106 ; CHECK-BE-NEXT: blr
107107 entry:
255255 define fp128 @testNestedAggregate(%struct.MixedC* byval nocapture readonly align 16 %a) {
256256 ; CHECK-LABEL: testNestedAggregate:
257257 ; CHECK: # %bb.0: # %entry
258 ; CHECK-NEXT: std r8, 72(r1)
259 ; CHECK-NEXT: std r7, 64(r1)
260 ; CHECK-NEXT: std r10, 88(r1)
261 ; CHECK-NEXT: std r9, 80(r1)
262 ; CHECK-NEXT: std r6, 56(r1)
263 ; CHECK-NEXT: std r5, 48(r1)
264 ; CHECK-NEXT: std r4, 40(r1)
265 ; CHECK-NEXT: std r3, 32(r1)
258 ; CHECK-DAG: std r10, 88(r1)
259 ; CHECK-DAG: std r9, 80(r1)
260 ; CHECK-DAG: std r8, 72(r1)
261 ; CHECK-DAG: std r7, 64(r1)
262 ; CHECK-DAG: std r6, 56(r1)
263 ; CHECK-DAG: std r5, 48(r1)
264 ; CHECK-DAG: std r4, 40(r1)
265 ; CHECK-DAG: std r3, 32(r1)
266266 ; CHECK-NEXT: lxv v2, 64(r1)
267267 ; CHECK-NEXT: blr
268268
269269 ; CHECK-BE-LABEL: testNestedAggregate:
270270 ; CHECK-BE: # %bb.0: # %entry
271 ; CHECK-BE-NEXT: std r8, 88(r1)
272 ; CHECK-BE-NEXT: std r7, 80(r1)
273 ; CHECK-BE-NEXT: std r10, 104(r1)
274 ; CHECK-BE-NEXT: std r9, 96(r1)
275 ; CHECK-BE-NEXT: std r6, 72(r1)
276 ; CHECK-BE-NEXT: std r5, 64(r1)
277 ; CHECK-BE-NEXT: std r4, 56(r1)
278 ; CHECK-BE-NEXT: std r3, 48(r1)
271 ; CHECK-BE-DAG: std r8, 88(r1)
272 ; CHECK-BE-DAG: std r7, 80(r1)
273 ; CHECK-BE-DAG: std r10, 104(r1)
274 ; CHECK-BE-DAG: std r9, 96(r1)
275 ; CHECK-BE-DAG: std r6, 72(r1)
276 ; CHECK-BE-DAG: std r5, 64(r1)
277 ; CHECK-BE-DAG: std r4, 56(r1)
278 ; CHECK-BE-DAG: std r3, 48(r1)
279279 ; CHECK-BE-NEXT: lxv v2, 80(r1)
280280 ; CHECK-BE-NEXT: blr
281281 entry:
336336 define fp128 @sum_float128(i32 signext %count, ...) {
337337 ; CHECK-LABEL: sum_float128:
338338 ; CHECK: # %bb.0: # %entry
339 ; CHECK-NEXT: std r10, 88(r1)
340 ; CHECK-NEXT: std r9, 80(r1)
341 ; CHECK-NEXT: std r8, 72(r1)
342 ; CHECK-NEXT: std r7, 64(r1)
343 ; CHECK-NEXT: std r6, 56(r1)
344 ; CHECK-NEXT: cmpwi cr0, r3, 1
345 ; CHECK-NEXT: std r4, 40(r1)
346 ; CHECK-NEXT: addis [[REG:r[0-9]+]], r2, .LCPI17_0@toc@ha
347 ; CHECK-NEXT: addi [[REG1:r[0-9]+]], [[REG]], .LCPI17_0@toc@l
348 ; CHECK-NEXT: lxvx v2, 0, [[REG1]]
349 ; CHECK-NEXT: std r5, 48(r1)
339 ; CHECK-DAG: std r10, 88(r1)
340 ; CHECK-DAG: std r9, 80(r1)
341 ; CHECK-DAG: std r8, 72(r1)
342 ; CHECK-DAG: std r7, 64(r1)
343 ; CHECK-DAG: std r6, 56(r1)
344 ; CHECK-DAG: std r4, 40(r1)
345 ; CHECK-DAG: cmpwi cr0, r3, 1
346 ; CHECK-DAG: std r5, 48(r1)
347 ; CHECK-DAG: addis [[REG:r[0-9]+]], r2, .LCPI17_0@toc@ha
348 ; CHECK-DAG: addi [[REG1:r[0-9]+]], [[REG]], .LCPI17_0@toc@l
349 ; CHECK-DAG: lxvx v2, 0, [[REG1]]
350350 ; CHECK-NEXT: bltlr cr0
351351 ; CHECK-NEXT: # %bb.1: # %if.end
352352 ; CHECK-NEXT: addi r3, r1, 40
3333 ret i64 %0
3434 }
3535 ; CHECK-LABEL: @callee2
36 ; CHECK: ld 3, 128(1)
36 ; CHECK: ld {{[0-9]+}}, 128(1)
3737 ; CHECK: blr
3838
3939 declare i64 @test2(%struct.pad* byval, i32 signext, %struct.test* byval align 16)
172172 ; Setup frame pointer
173173 ; CHECK: add r7, sp, #8
174174 ; Register varargs stored via FP
175 ; CHECK: str r3, [r7, #16]
176 ; CHECK-NEXT: str r2, [r7, #12]
177 ; CHECK-NEXT: str r1, [r7, #8]
175 ; CHECK-DAG: str r3, [r7, #16]
176 ; CHECK-DAG: str r2, [r7, #12]
177 ; CHECK-DAG: str r1, [r7, #8]
178178
179179 ; Moving SP, access via SP
180180 ; int test_args_moving_sp(int a, int b, int c, int d, int e) {
193193
194194 define void @test128(i128* %a) {
195195 ; CHECK-LABEL: test128:
196 ; CHECK: ldr r1, [r0, #4]
196 ; CHECK: ldr r1, [r0, #8]
197197 ; CHECK-NEXT: ldr r2, .LCPI8_0
198198 ; CHECK-NEXT: eors r2, r1
199 ; CHECK-NEXT: str r2, [r0, #4]
199 ; CHECK-NEXT: str r2, [r0, #8]
200200 ; CHECK-NEXT: ldr r1, [r0]
201201 ; CHECK-NEXT: ldr r2, .LCPI8_1
202202 ; CHECK-NEXT: eors r2, r1
203203 ; CHECK-NEXT: str r2, [r0]
204 ; CHECK-NEXT: ldr r1, [r0, #8]
204 ; CHECK-NEXT: ldr r1, [r0, #4]
205205 ; CHECK-NEXT: ldr r2, .LCPI8_2
206206 ; CHECK-NEXT: eors r2, r1
207 ; CHECK-NEXT: str r2, [r0, #8]
207 ; CHECK-NEXT: str r2, [r0, #4]
208208 ; CHECK-NEXT: bx lr
209209 ; CHECK-NEXT: .p2align 2
210210 ; CHECK-NEXT: .LCPI8_0:
211 ; CHECK-NEXT: .long 4075008415
211 ; CHECK-NEXT: .long 6692605
212212 ; CHECK-NEXT: .LCPI8_1:
213213 ; CHECK-NEXT: .long 2080661269
214214 ; CHECK-NEXT: .LCPI8_2:
215 ; CHECK-NEXT: .long 6692605
215 ; CHECK-NEXT: .long 4075008415
216216 %x = load i128, i128* %a
217217 %xn = xor i128 %x, 123456789123456789123456789
218218 store i128 %xn, i128* %a
1616 store i8* %overflow_arg_area.next, i8** %overflow_arg_area_p, align 8
1717 ; X32: leal 68(%esp), [[REG:%.*]]
1818 ; X32: movl [[REG]], 16(%esp)
19 ; X64: leaq 232(%rsp), [[REG:%.*]]
19 ; X64: leaq 256(%rsp), [[REG:%.*]]
2020 ; X64: movq [[REG]], 184(%rsp)
2121 ; X64: leaq 176(%rsp), %rdi
2222 call void @qux(%struct.__va_list_tag* %arraydecay)
131131 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
132132 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
133133 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
134 ; X32-NEXT: movl %edi, 12(%esi)
135 ; X32-NEXT: movl %edx, 8(%esi)
134 ; X32-NEXT: movl %edi, 8(%esi)
135 ; X32-NEXT: movl %edx, 12(%esi)
136 ; X32-NEXT: movl %eax, (%esi)
136137 ; X32-NEXT: movl %ecx, 4(%esi)
137 ; X32-NEXT: movl %eax, (%esi)
138138 ; X32-NEXT: movl %esi, %eax
139139 ; X32-NEXT: leal -8(%ebp), %esp
140140 ; X32-NEXT: popl %esi
244244 ; CHECK: # %bb.0:
245245 ; CHECK-NEXT: vmovups (%rsi), %xmm0
246246 ; CHECK-NEXT: vmovups 16(%rsi), %xmm1
247 ; CHECK-NEXT: vmovups %xmm1, 16(%rdi)
247248 ; CHECK-NEXT: vmovups %xmm0, (%rdi)
248 ; CHECK-NEXT: vmovups %xmm1, 16(%rdi)
249249 ; CHECK-NEXT: retq
250250 ;
251251 ; CHECK_O0-LABEL: add8i32:
289289 ; CHECK: # %bb.0:
290290 ; CHECK-NEXT: vmovaps (%rsi), %xmm0
291291 ; CHECK-NEXT: vmovaps 16(%rsi), %xmm1
292 ; CHECK-NEXT: vmovaps %xmm1, 16(%rdi)
292293 ; CHECK-NEXT: vmovaps %xmm0, (%rdi)
293 ; CHECK-NEXT: vmovaps %xmm1, 16(%rdi)
294294 ; CHECK-NEXT: retq
295295 ;
296296 ; CHECK_O0-LABEL: add4i64a16:
858858 ; X86-NEXT: .LBB33_2:
859859 ; X86-NEXT: notl %esi
860860 ; X86-NEXT: notl %edx
861 ; X86-NEXT: andl %edx, (%eax)
861862 ; X86-NEXT: andl %esi, 4(%eax)
862 ; X86-NEXT: andl %edx, (%eax)
863863 ; X86-NEXT: popl %esi
864864 ; X86-NEXT: .cfi_def_cfa_offset 4
865865 ; X86-NEXT: retl
898898 ; X86-NEXT: movl %edx, %esi
899899 ; X86-NEXT: xorl %edx, %edx
900900 ; X86-NEXT: .LBB34_2:
901 ; X86-NEXT: orl %edx, (%eax)
901902 ; X86-NEXT: orl %esi, 4(%eax)
902 ; X86-NEXT: orl %edx, (%eax)
903903 ; X86-NEXT: popl %esi
904904 ; X86-NEXT: .cfi_def_cfa_offset 4
905905 ; X86-NEXT: retl
937937 ; X86-NEXT: movl %edx, %esi
938938 ; X86-NEXT: xorl %edx, %edx
939939 ; X86-NEXT: .LBB35_2:
940 ; X86-NEXT: xorl %edx, (%eax)
940941 ; X86-NEXT: xorl %esi, 4(%eax)
941 ; X86-NEXT: xorl %edx, (%eax)
942942 ; X86-NEXT: popl %esi
943943 ; X86-NEXT: .cfi_def_cfa_offset 4
944944 ; X86-NEXT: retl
7676 ; X86-NEXT: movzbl %bl, %ecx
7777 ; X86-NEXT: subl %ecx, %edx
7878 ; X86-NEXT: sbbl $0, %ebp
79 ; X86-NEXT: movl %esi, (%eax)
7980 ; X86-NEXT: movl %edi, 4(%eax)
80 ; X86-NEXT: movl %esi, (%eax)
8181 ; X86-NEXT: movl %edx, 8(%eax)
8282 ; X86-NEXT: movl %ebp, 12(%eax)
8383 ; X86-NEXT: popl %esi
1818 ; CHECK-NEXT: movd %eax, %xmm0
1919 ; CHECK-NEXT: xorps %xmm1, %xmm1
2020 ; CHECK-NEXT: mulss %xmm0, %xmm1
21 ; CHECK-NEXT: movq $0, (%rdi)
21 ; CHECK-NEXT: movl $0, (%rdi)
2222 ; CHECK-NEXT: movss %xmm1, 4(%rdi)
2323 ; CHECK-NEXT: retq
2424 entry:
77 ; CHECK: # %bb.0:
88 ; CHECK-NEXT: vmovdqa (%rdi), %ymm0
99 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
10 ; CHECK-NEXT: vpaddd 32(%rsi), %ymm1, %ymm1
1011 ; CHECK-NEXT: vpaddd (%rsi), %ymm0, %ymm0
11 ; CHECK-NEXT: vpaddd 32(%rsi), %ymm1, %ymm1
12 ; CHECK-NEXT: vmovdqa %ymm0, (%rdx)
1213 ; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx)
13 ; CHECK-NEXT: vmovdqa %ymm0, (%rdx)
1414 ; CHECK-NEXT: vzeroupper
1515 ; CHECK-NEXT: retq
1616 %d = load <16 x i32>, <16 x i32>* %a
8484 ; CHECK: # %bb.0:
8585 ; CHECK-NEXT: vmovdqa (%rdi), %ymm0
8686 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
87 ; CHECK-NEXT: vpmaddwd 32(%rsi), %ymm1, %ymm1
8788 ; CHECK-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0
88 ; CHECK-NEXT: vpmaddwd 32(%rsi), %ymm1, %ymm1
89 ; CHECK-NEXT: vmovdqa %ymm0, (%rdx)
8990 ; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx)
90 ; CHECK-NEXT: vmovdqa %ymm0, (%rdx)
9191 ; CHECK-NEXT: vzeroupper
9292 ; CHECK-NEXT: retq
9393 %A = load <32 x i16>, <32 x i16>* %APtr
127127 ; CHECK: # %bb.0:
128128 ; CHECK-NEXT: vmovdqa (%rdi), %ymm0
129129 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
130 ; CHECK-NEXT: vpsubusb 32(%rsi), %ymm1, %ymm1
130131 ; CHECK-NEXT: vpsubusb (%rsi), %ymm0, %ymm0
131 ; CHECK-NEXT: vpsubusb 32(%rsi), %ymm1, %ymm1
132 ; CHECK-NEXT: vmovdqa %ymm0, (%rdx)
132133 ; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx)
133 ; CHECK-NEXT: vmovdqa %ymm0, (%rdx)
134134 ; CHECK-NEXT: vzeroupper
135135 ; CHECK-NEXT: retq
136136 %x = load <64 x i8>, <64 x i8>* %xptr
651651 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
652652 ; CHECK-NEXT: vmovdqa (%rsi), %ymm2
653653 ; CHECK-NEXT: vmovdqa 32(%rsi), %ymm3
654 ; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
655 ; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
654 ; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
655 ; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
656656 ; CHECK-NEXT: vpmullw %ymm4, %ymm5, %ymm4
657657 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
658658 ; CHECK-NEXT: vpand %ymm5, %ymm4, %ymm4
659 ; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
660 ; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
661 ; CHECK-NEXT: vpmullw %ymm3, %ymm1, %ymm1
662 ; CHECK-NEXT: vpand %ymm5, %ymm1, %ymm1
663 ; CHECK-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
664 ; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
665 ; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
666 ; CHECK-NEXT: vpmullw %ymm3, %ymm4, %ymm3
667 ; CHECK-NEXT: vpand %ymm5, %ymm3, %ymm3
659668 ; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
660669 ; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
661670 ; CHECK-NEXT: vpmullw %ymm2, %ymm0, %ymm0
662671 ; CHECK-NEXT: vpand %ymm5, %ymm0, %ymm0
663 ; CHECK-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
664 ; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
665 ; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
666 ; CHECK-NEXT: vpmullw %ymm2, %ymm4, %ymm2
667 ; CHECK-NEXT: vpand %ymm5, %ymm2, %ymm2
668 ; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
669 ; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
670 ; CHECK-NEXT: vpmullw %ymm3, %ymm1, %ymm1
671 ; CHECK-NEXT: vpand %ymm5, %ymm1, %ymm1
672 ; CHECK-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
672 ; CHECK-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
673 ; CHECK-NEXT: vmovdqa %ymm0, (%rdx)
673674 ; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx)
674 ; CHECK-NEXT: vmovdqa %ymm0, (%rdx)
675675 ; CHECK-NEXT: vzeroupper
676676 ; CHECK-NEXT: retq
677677 %d = load <64 x i8>, <64 x i8>* %a
5555 ; LINUX-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp)
5656 ; LINUX-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp)
5757 ; LINUX-NEXT: .LBB0_2:
58 ; LINUX-NEXT: movq %rbx, {{[0-9]+}}(%rsp)
59 ; LINUX-NEXT: movq %rbp, {{[0-9]+}}(%rsp)
60 ; LINUX-NEXT: movq %r13, {{[0-9]+}}(%rsp)
61 ; LINUX-NEXT: movq %r12, {{[0-9]+}}(%rsp)
5862 ; LINUX-NEXT: movq %r15, {{[0-9]+}}(%rsp)
59 ; LINUX-NEXT: movq %r12, {{[0-9]+}}(%rsp)
60 ; LINUX-NEXT: movq %r13, {{[0-9]+}}(%rsp)
61 ; LINUX-NEXT: movq %rbp, {{[0-9]+}}(%rsp)
62 ; LINUX-NEXT: movq %rbx, {{[0-9]+}}(%rsp)
6363 ; LINUX-NEXT: leaq {{[0-9]+}}(%rsp), %rax
6464 ; LINUX-NEXT: movq %rax, {{[0-9]+}}(%rsp)
6565 ; LINUX-NEXT: leaq {{[0-9]+}}(%rsp), %rax
149149 ; LINUX-X32-NEXT: movaps %xmm6, {{[0-9]+}}(%esp)
150150 ; LINUX-X32-NEXT: movaps %xmm7, {{[0-9]+}}(%esp)
151151 ; LINUX-X32-NEXT: .LBB0_2:
152 ; LINUX-X32-NEXT: movq %rbx, {{[0-9]+}}(%esp)
153 ; LINUX-X32-NEXT: movq %rbp, {{[0-9]+}}(%esp)
154 ; LINUX-X32-NEXT: movq %r13, {{[0-9]+}}(%esp)
155 ; LINUX-X32-NEXT: movq %r12, {{[0-9]+}}(%esp)
152156 ; LINUX-X32-NEXT: movq %r15, {{[0-9]+}}(%esp)
153 ; LINUX-X32-NEXT: movq %r12, {{[0-9]+}}(%esp)
154 ; LINUX-X32-NEXT: movq %r13, {{[0-9]+}}(%esp)
155 ; LINUX-X32-NEXT: movq %rbp, {{[0-9]+}}(%esp)
156 ; LINUX-X32-NEXT: movq %rbx, {{[0-9]+}}(%esp)
157157 ; LINUX-X32-NEXT: leal {{[0-9]+}}(%rsp), %eax
158158 ; LINUX-X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
159159 ; LINUX-X32-NEXT: leal {{[0-9]+}}(%rsp), %eax
222222 ; WINDOWS-NEXT: movq %r8, %rdi
223223 ; WINDOWS-NEXT: movq %rdx, %rbx
224224 ; WINDOWS-NEXT: movq %rcx, %rbp
225 ; WINDOWS-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
226 ; WINDOWS-NEXT: movq %r8, {{[0-9]+}}(%rsp)
225227 ; WINDOWS-NEXT: movq %r9, {{[0-9]+}}(%rsp)
226 ; WINDOWS-NEXT: movq %r8, {{[0-9]+}}(%rsp)
227 ; WINDOWS-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
228228 ; WINDOWS-NEXT: leaq {{[0-9]+}}(%rsp), %rax
229229 ; WINDOWS-NEXT: movq %rax, {{[0-9]+}}(%rsp)
230230 ; WINDOWS-NEXT: callq get_f
4545 ; CHECK-LABEL: t4:
4646 ; CHECK: incl %[[r:.*]]
4747 ; CHECK: decl %[[n:.*]]
48 ; CHECK: movl %[[r]], {{[0-9]+}}(%esp)
49 ; CHECK: movl %[[n]], {{[0-9]+}}(%esp)
48 ; CHECK-DAG: movl %[[r]], {{[0-9]+}}(%esp)
49 ; CHECK-DAG: movl %[[n]], {{[0-9]+}}(%esp)
5050 ; CHECK: jmpl *%{{.*}}
5151
5252 entry:
7070 ; CHECK: incl %[[r:.*]]
7171 ; CHECK: decl %[[n:.*]]
7272 ; Store them through ebp, since that's the only stable arg pointer.
73 ; CHECK: movl %[[r]], {{[0-9]+}}(%ebp)
74 ; CHECK: movl %[[n]], {{[0-9]+}}(%ebp)
73 ; CHECK-DAG: movl %[[r]], {{[0-9]+}}(%ebp)
74 ; CHECK-DAG: movl %[[n]], {{[0-9]+}}(%ebp)
7575 ; Epilogue.
7676 ; CHECK: leal {{[-0-9]+}}(%ebp), %esp
7777 ; CHECK: popl %esi
145145 ; X32-NEXT: subl $48, %esp
146146 ; X32-NEXT: movl 8(%ebp), %eax
147147 ; X32-NEXT: movl 24(%eax), %ecx
148 ; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
148 ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
149149 ; X32-NEXT: movl 28(%eax), %ecx
150150 ; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill
151151 ; X32-NEXT: movl 16(%eax), %esi
162162 ; X32-NEXT: movl %esi, {{[0-9]+}}(%esp)
163163 ; X32-NEXT: movl (%esp), %eax # 4-byte Reload
164164 ; X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
165 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
165 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
166166 ; X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
167167 ; X32-NEXT: movl 12(%ebp), %eax
168168 ; X32-NEXT: fildll {{[0-9]+}}(%esp)
276276 ; X32-NEXT: adcl 4(%ecx), %edx
277277 ; X32-NEXT: addl 8(%ecx), %edi
278278 ; X32-NEXT: adcl 12(%ecx), %esi
279 ; X32-NEXT: movl %edi, 8(%eax)
279280 ; X32-NEXT: movl %esi, 12(%eax)
280 ; X32-NEXT: movl %edi, 8(%eax)
281 ; X32-NEXT: movl %ebx, (%eax)
281282 ; X32-NEXT: movl %edx, 4(%eax)
282 ; X32-NEXT: movl %ebx, (%eax)
283283 ; X32-NEXT: popl %esi
284284 ; X32-NEXT: popl %edi
285285 ; X32-NEXT: popl %ebx
14961496 define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, <8 x i32>* %q3) nounwind {
14971497 ; SSE2-LABEL: interleave_24i32_in:
14981498 ; SSE2: # %bb.0:
1499 ; SSE2-NEXT: movups (%rsi), %xmm5
1500 ; SSE2-NEXT: movups 16(%rsi), %xmm8
1501 ; SSE2-NEXT: movups (%rdx), %xmm6
1502 ; SSE2-NEXT: movups 16(%rdx), %xmm3
1503 ; SSE2-NEXT: movups (%rcx), %xmm0
1504 ; SSE2-NEXT: movups 16(%rcx), %xmm4
1499 ; SSE2-NEXT: movups (%rsi), %xmm1
1500 ; SSE2-NEXT: movups 16(%rsi), %xmm0
1501 ; SSE2-NEXT: movups (%rdx), %xmm8
1502 ; SSE2-NEXT: movups 16(%rdx), %xmm5
1503 ; SSE2-NEXT: movups (%rcx), %xmm3
1504 ; SSE2-NEXT: movups 16(%rcx), %xmm6
1505 ; SSE2-NEXT: movaps %xmm3, %xmm7
1506 ; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,0],xmm1[1,0]
1507 ; SSE2-NEXT: movaps %xmm1, %xmm9
1508 ; SSE2-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
1509 ; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm7[0,2]
1510 ; SSE2-NEXT: movaps %xmm5, %xmm7
1511 ; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,2],xmm6[3,2]
1512 ; SSE2-NEXT: movaps %xmm6, %xmm4
1513 ; SSE2-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
1514 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm7[0,2]
15051515 ; SSE2-NEXT: movaps %xmm0, %xmm7
1506 ; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,0],xmm5[1,0]
1507 ; SSE2-NEXT: movaps %xmm5, %xmm1
1508 ; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
1509 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm7[0,2]
1510 ; SSE2-NEXT: movaps %xmm5, %xmm7
1511 ; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm6[2,1]
1512 ; SSE2-NEXT: movaps %xmm0, %xmm2
1513 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm6[1,0]
1516 ; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm5[2,1]
1517 ; SSE2-NEXT: movaps %xmm6, %xmm2
1518 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm5[1,0]
15141519 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm7[0,2]
1515 ; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,2],xmm0[3,2]
1516 ; SSE2-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3]
1517 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[0,2]
1518 ; SSE2-NEXT: movaps %xmm4, %xmm5
1519 ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm8[1,0]
1520 ; SSE2-NEXT: movaps %xmm8, %xmm6
1521 ; SSE2-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
1522 ; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[0,2]
1520 ; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,0],xmm0[1,0]
1521 ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
1522 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,2]
15231523 ; SSE2-NEXT: movaps %xmm8, %xmm5
1524 ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[2,1]
1525 ; SSE2-NEXT: movaps %xmm4, %xmm7
1526 ; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm3[1,0]
1527 ; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm5[0,2]
1528 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,2],xmm4[3,2]
1529 ; SSE2-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm8[2],xmm4[3],xmm8[3]
1530 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm3[0,2]
1524 ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,2],xmm3[3,2]
1525 ; SSE2-NEXT: movaps %xmm3, %xmm6
1526 ; SSE2-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3]
1527 ; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm5[0,2]
1528 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm8[2,1]
1529 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm8[1,0]
1530 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[0,2]
1531 ; SSE2-NEXT: movups %xmm3, 16(%rdi)
1532 ; SSE2-NEXT: movups %xmm6, 32(%rdi)
1533 ; SSE2-NEXT: movups %xmm0, 48(%rdi)
1534 ; SSE2-NEXT: movups %xmm2, 64(%rdi)
15311535 ; SSE2-NEXT: movups %xmm4, 80(%rdi)
1532 ; SSE2-NEXT: movups %xmm7, 64(%rdi)
1533 ; SSE2-NEXT: movups %xmm6, 48(%rdi)
1534 ; SSE2-NEXT: movups %xmm0, 32(%rdi)
1535 ; SSE2-NEXT: movups %xmm2, 16(%rdi)
1536 ; SSE2-NEXT: movups %xmm1, (%rdi)
1536 ; SSE2-NEXT: movups %xmm9, (%rdi)
15371537 ; SSE2-NEXT: retq
15381538 ;
15391539 ; SSE42-LABEL: interleave_24i32_in:
15401540 ; SSE42: # %bb.0:
1541 ; SSE42-NEXT: movdqu (%rsi), %xmm5
1542 ; SSE42-NEXT: movdqu 16(%rsi), %xmm2
1543 ; SSE42-NEXT: movdqu (%rdx), %xmm6
1544 ; SSE42-NEXT: movdqu 16(%rdx), %xmm1
1545 ; SSE42-NEXT: movdqu (%rcx), %xmm7
1546 ; SSE42-NEXT: movdqu 16(%rcx), %xmm4
1547 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,1,1]
1548 ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,1,0,1]
1549 ; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3],xmm3[4,5,6,7]
1550 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,1]
1551 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5],xmm3[6,7]
1552 ; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,2,2]
1553 ; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3],xmm3[4,5,6,7]
1554 ; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5],xmm3[6,7]
1555 ; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
1556 ; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,3,2,3]
1557 ; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm5[2,3],xmm7[4,5,6,7]
1558 ; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,2,3,3]
1559 ; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4,5],xmm7[6,7]
1560 ; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,1,1]
1561 ; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,1,0,1]
1562 ; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6,7]
1563 ; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,1,0,1]
1564 ; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5],xmm7[6,7]
1565 ; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,2,2]
1566 ; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm4[2,3],xmm7[4,5,6,7]
1567 ; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4,5],xmm7[6,7]
1568 ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
1569 ; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
1570 ; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3],xmm4[4,5,6,7]
1571 ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
1572 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5],xmm4[6,7]
1573 ; SSE42-NEXT: movdqu %xmm1, 80(%rdi)
1574 ; SSE42-NEXT: movdqu %xmm7, 64(%rdi)
1575 ; SSE42-NEXT: movdqu %xmm6, 48(%rdi)
1576 ; SSE42-NEXT: movdqu %xmm5, 32(%rdi)
1577 ; SSE42-NEXT: movdqu %xmm3, 16(%rdi)
1578 ; SSE42-NEXT: movdqu %xmm0, (%rdi)
1541 ; SSE42-NEXT: movdqu (%rsi), %xmm8
1542 ; SSE42-NEXT: movdqu 16(%rsi), %xmm4
1543 ; SSE42-NEXT: movdqu (%rdx), %xmm2
1544 ; SSE42-NEXT: movdqu 16(%rdx), %xmm5
1545 ; SSE42-NEXT: movdqu (%rcx), %xmm3
1546 ; SSE42-NEXT: movdqu 16(%rcx), %xmm6
1547 ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1]
1548 ; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm8[0,1,0,1]
1549 ; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,3],xmm7[4,5,6,7]
1550 ; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,0,1]
1551 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm1[4,5],xmm7[6,7]
1552 ; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,0,1]
1553 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
1554 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5,6,7]
1555 ; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,2,3,3]
1556 ; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm0[0,1,2,3],xmm7[4,5],xmm0[6,7]
1557 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,2,2]
1558 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5,6,7]
1559 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5],xmm0[6,7]
1560 ; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,1,1]
1561 ; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
1562 ; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6,7]
1563 ; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,1,0,1]
1564 ; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm4[0,1,2,3],xmm5[4,5],xmm4[6,7]
1565 ; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,3,0,1]
1566 ; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
1567 ; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7]
1568 ; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,2,3,3]
1569 ; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5],xmm6[6,7]
1570 ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,2]
1571 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7]
1572 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm8[4,5],xmm2[6,7]
1573 ; SSE42-NEXT: movdqu %xmm2, 16(%rdi)
1574 ; SSE42-NEXT: movdqu %xmm4, 32(%rdi)
1575 ; SSE42-NEXT: movdqu %xmm5, 48(%rdi)
1576 ; SSE42-NEXT: movdqu %xmm0, 64(%rdi)
1577 ; SSE42-NEXT: movdqu %xmm7, 80(%rdi)
1578 ; SSE42-NEXT: movdqu %xmm1, (%rdi)
15791579 ; SSE42-NEXT: retq
15801580 ;
15811581 ; AVX1-LABEL: interleave_24i32_in:
15821582 ; AVX1: # %bb.0:
15831583 ; AVX1-NEXT: vmovupd (%rsi), %ymm0
15841584 ; AVX1-NEXT: vmovupd (%rcx), %ymm1
1585 ; AVX1-NEXT: vmovups (%rdx), %xmm2
1586 ; AVX1-NEXT: vmovups 16(%rdx), %xmm3
1585 ; AVX1-NEXT: vmovups 16(%rcx), %xmm2
1586 ; AVX1-NEXT: vmovups (%rdx), %xmm3
1587 ; AVX1-NEXT: vmovups 16(%rdx), %xmm4
1588 ; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm4[3,0],xmm2[3,0]
1589 ; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,1],xmm5[0,2]
1590 ; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm4[1,0]
1591 ; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,2]
1592 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
1593 ; AVX1-NEXT: vpermilpd {{.*#+}} ymm4 = ymm0[1,1,3,3]
1594 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3]
1595 ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7]
15871596 ; AVX1-NEXT: vmovups (%rsi), %xmm4
1588 ; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,0],xmm2[2,0]
1589 ; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm2[1,1],xmm5[0,2]
1590 ; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,0],xmm4[0,0]
1591 ; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,1]
1592 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
1597 ; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,0],xmm3[2,0]
1598 ; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm3[1,1],xmm5[0,2]
1599 ; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,0],xmm4[0,0]
1600 ; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,1]
1601 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
15931602 ; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = xmm1[0,0]
15941603 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4
1595 ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7]
1596 ; AVX1-NEXT: vmovups 16(%rcx), %xmm4
1597 ; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm3[3,0],xmm4[3,0]
1598 ; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,1],xmm5[0,2]
1599 ; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[1,0]
1600 ; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,0],xmm3[2,2]
1601 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
1602 ; AVX1-NEXT: vpermilpd {{.*#+}} ymm4 = ymm0[1,1,3,3]
1603 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3]
16041604 ; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
16051605 ; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
16061606 ; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,1,2,2]
16081608 ; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = mem[0,0,3,3,4,4,7,7]
16091609 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
16101610 ; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
1611 ; AVX1-NEXT: vmovups %ymm3, 64(%rdi)
1612 ; AVX1-NEXT: vmovups %ymm2, (%rdi)
1611 ; AVX1-NEXT: vmovups %ymm3, (%rdi)
1612 ; AVX1-NEXT: vmovups %ymm2, 64(%rdi)
16131613 ; AVX1-NEXT: vzeroupper
16141614 ; AVX1-NEXT: retq
16151615 ;
16521652 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
16531653 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,3,3,3]
16541654 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
1655 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[1,1,2,2]
1656 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2]
1657 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7]
1658 ; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[0,0,3,3,4,4,7,7]
1659 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7]
16551660 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = [1,0,2,2,1,0,2,2]
1656 ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm4
1657 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,0,2,1]
1658 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7]
1659 ; AVX2-FAST-NEXT: vbroadcastsd (%rcx), %ymm5
1660 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
1661 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
1662 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2]
1663 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7]
1664 ; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
1661 ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1
1662 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1]
1663 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
1664 ; AVX2-FAST-NEXT: vbroadcastsd (%rcx), %ymm1
16651665 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
1666 ; AVX2-FAST-NEXT: vmovups %ymm0, 32(%rdi)
1667 ; AVX2-FAST-NEXT: vmovups %ymm4, (%rdi)
1666 ; AVX2-FAST-NEXT: vmovups %ymm0, (%rdi)
1667 ; AVX2-FAST-NEXT: vmovups %ymm2, 32(%rdi)
16681668 ; AVX2-FAST-NEXT: vmovups %ymm3, 64(%rdi)
16691669 ; AVX2-FAST-NEXT: vzeroupper
16701670 ; AVX2-FAST-NEXT: retq
16731673 ; XOP: # %bb.0:
16741674 ; XOP-NEXT: vmovupd (%rsi), %ymm0
16751675 ; XOP-NEXT: vmovups (%rcx), %ymm1
1676 ; XOP-NEXT: vmovups (%rdx), %xmm2
1677 ; XOP-NEXT: vmovups 16(%rdx), %xmm3
1676 ; XOP-NEXT: vmovups 16(%rcx), %xmm2
1677 ; XOP-NEXT: vmovups (%rdx), %xmm3
1678 ; XOP-NEXT: vmovups 16(%rdx), %xmm4
1679 ; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm4[3,0],xmm2[3,0]
1680 ; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,1],xmm5[0,2]
1681 ; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm4[1,0]
1682 ; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,2]
1683 ; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
1684 ; XOP-NEXT: vpermilpd {{.*#+}} ymm4 = ymm0[1,1,3,3]
1685 ; XOP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3]
1686 ; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7]
16781687 ; XOP-NEXT: vmovups (%rsi), %xmm4
1679 ; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,0],xmm2[2,0]
1680 ; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm2[1,1],xmm5[0,2]
1681 ; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,0],xmm4[0,0]
1682 ; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,1]
1683 ; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
1688 ; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,0],xmm3[2,0]
1689 ; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm3[1,1],xmm5[0,2]
1690 ; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,0],xmm4[0,0]
1691 ; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,1]
1692 ; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
16841693 ; XOP-NEXT: vmovddup {{.*#+}} xmm4 = xmm1[0,0]
16851694 ; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4
1686 ; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7]
1687 ; XOP-NEXT: vmovups 16(%rcx), %xmm4
1688 ; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm3[3,0],xmm4[3,0]
1689 ; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,1],xmm5[0,2]
1690 ; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[1,0]
1691 ; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,0],xmm3[2,2]
1692 ; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
1693 ; XOP-NEXT: vpermilpd {{.*#+}} ymm4 = ymm0[1,1,3,3]
1694 ; XOP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3]
16951695 ; XOP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
16961696 ; XOP-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm1[2],ymm0[3],ymm1[2,3],ymm0[4],ymm1[5,4],ymm0[5]
16971697 ; XOP-NEXT: vpermilps {{.*#+}} ymm1 = mem[0,0,3,3,4,4,7,7]
16981698 ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
16991699 ; XOP-NEXT: vmovups %ymm0, 32(%rdi)
1700 ; XOP-NEXT: vmovups %ymm3, 64(%rdi)
1701 ; XOP-NEXT: vmovups %ymm2, (%rdi)
1700 ; XOP-NEXT: vmovups %ymm3, (%rdi)
1701 ; XOP-NEXT: vmovups %ymm2, 64(%rdi)
17021702 ; XOP-NEXT: vzeroupper
17031703 ; XOP-NEXT: retq
17041704 %s1 = load <8 x i32>, <8 x i32>* %q1, align 4
1111 ; CHECK-NEXT: movq (%rdi), %rax
1212 ; CHECK-NEXT: xorps %xmm0, %xmm0
1313 ; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
14 ; CHECK-NEXT: movq $0, {{[0-9]+}}(%rsp)
14 ; CHECK-NEXT: movq $170, {{[0-9]+}}(%rsp)
1515 ; CHECK-NEXT: movaps {{.*#+}} xmm0 = [12297829382473034410,12297829382473034410]
1616 ; CHECK-NEXT: movaps %xmm0, (%rsp)
1717 ; CHECK-NEXT: movabsq $-6148914691236517206, %rcx # imm = 0xAAAAAAAAAAAAAAAA
1818 ; CHECK-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
1919 ; CHECK-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
20 ; CHECK-NEXT: movb $-86, {{[0-9]+}}(%rsp)
2120 ; CHECK-NEXT: movzwl 2(%rax), %ecx
2221 ; CHECK-NEXT: andl $8191, %ecx # imm = 0x1FFF
2322 ; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%rsp)
571571 ; X86-NEXT: movl %edx, %esi
572572 ; X86-NEXT: shldl $31, %ecx, %esi
573573 ; X86-NEXT: shldl $31, %edx, %ecx
574 ; X86-NEXT: movl %esi, (%eax)
574575 ; X86-NEXT: movl %ecx, 4(%eax)
575 ; X86-NEXT: movl %esi, (%eax)
576576 ; X86-NEXT: popl %esi
577577 ; X86-NEXT: retl
578578 ;
243243 ; X86-NEXT: .cfi_offset %ebp, -8
244244 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
245245 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
246 ; X86-NEXT: movl (%eax), %edx
246 ; X86-NEXT: movl (%eax), %esi
247247 ; X86-NEXT: movl 4(%eax), %ebx
248 ; X86-NEXT: movl %edx, %esi
249 ; X86-NEXT: shll %cl, %esi
248 ; X86-NEXT: movl %esi, %edx
249 ; X86-NEXT: shll %cl, %edx
250250 ; X86-NEXT: movl %ebx, %edi
251 ; X86-NEXT: shldl %cl, %edx, %edi
251 ; X86-NEXT: shldl %cl, %esi, %edi
252252 ; X86-NEXT: testb $32, %cl
253253 ; X86-NEXT: je .LBB6_2
254254 ; X86-NEXT: # %bb.1:
255 ; X86-NEXT: movl %esi, %edi
256 ; X86-NEXT: xorl %esi, %esi
255 ; X86-NEXT: movl %edx, %edi
256 ; X86-NEXT: xorl %edx, %edx
257257 ; X86-NEXT: .LBB6_2:
258258 ; X86-NEXT: negb %cl
259259 ; X86-NEXT: movl %ebx, %ebp
260260 ; X86-NEXT: shrl %cl, %ebp
261 ; X86-NEXT: shrdl %cl, %ebx, %edx
261 ; X86-NEXT: shrdl %cl, %ebx, %esi
262262 ; X86-NEXT: testb $32, %cl
263263 ; X86-NEXT: je .LBB6_4
264264 ; X86-NEXT: # %bb.3:
265 ; X86-NEXT: movl %ebp, %edx
265 ; X86-NEXT: movl %ebp, %esi
266266 ; X86-NEXT: xorl %ebp, %ebp
267267 ; X86-NEXT: .LBB6_4:
268 ; X86-NEXT: orl %esi, %edx
268269 ; X86-NEXT: orl %ebp, %edi
269 ; X86-NEXT: orl %edx, %esi
270 ; X86-NEXT: movl %edx, (%eax)
270271 ; X86-NEXT: movl %edi, 4(%eax)
271 ; X86-NEXT: movl %esi, (%eax)
272272 ; X86-NEXT: popl %esi
273273 ; X86-NEXT: .cfi_def_cfa_offset 16
274274 ; X86-NEXT: popl %edi
335335 ; X86-NEXT: movl %ebp, %esi
336336 ; X86-NEXT: xorl %ebp, %ebp
337337 ; X86-NEXT: .LBB7_4:
338 ; X86-NEXT: orl %ebp, %edi
338339 ; X86-NEXT: orl %esi, %edx
339 ; X86-NEXT: orl %ebp, %edi
340 ; X86-NEXT: movl %edi, (%eax)
340341 ; X86-NEXT: movl %edx, 4(%eax)
341 ; X86-NEXT: movl %edi, (%eax)
342342 ; X86-NEXT: popl %esi
343343 ; X86-NEXT: .cfi_def_cfa_offset 16
344344 ; X86-NEXT: popl %edi
459459 ; AVX1: # %bb.0:
460460 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
461461 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
462 ; AVX1-NEXT: vpaddsw 16(%rsi), %xmm1, %xmm1
462463 ; AVX1-NEXT: vpaddsw (%rsi), %xmm0, %xmm0
463 ; AVX1-NEXT: vpaddsw 16(%rsi), %xmm1, %xmm1
464 ; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
464465 ; AVX1-NEXT: vmovq %xmm1, 16(%rdx)
465 ; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
466466 ; AVX1-NEXT: retq
467467 ;
468468 ; AVX2-LABEL: v12i16:
143143 ; X32-NEXT: movl %esi, %edx
144144 ; X32-NEXT: xorl %esi, %esi
145145 ; X32-NEXT: .LBB5_2:
146 ; X32-NEXT: movl %edx, (%eax)
146147 ; X32-NEXT: movl %esi, 4(%eax)
147 ; X32-NEXT: movl %edx, (%eax)
148148 ; X32-NEXT: popl %esi
149149 ; X32-NEXT: popl %edi
150150 ; X32-NEXT: retl
745745 ; X86-SSE-NEXT: pmulhuw %xmm0, %xmm4
746746 ; X86-SSE-NEXT: pmullw %xmm0, %xmm2
747747 ; X86-SSE-NEXT: movdqa %xmm2, %xmm0
748 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
749 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
748 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
749 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
750750 ; X86-SSE-NEXT: movdqa %xmm3, %xmm4
751751 ; X86-SSE-NEXT: pmulhuw %xmm1, %xmm4
752752 ; X86-SSE-NEXT: pmullw %xmm1, %xmm3
753753 ; X86-SSE-NEXT: movdqa %xmm3, %xmm1
754 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
755 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
756 ; X86-SSE-NEXT: movdqu %xmm3, 48(%esi,%ecx,4)
757 ; X86-SSE-NEXT: movdqu %xmm1, 32(%esi,%ecx,4)
758 ; X86-SSE-NEXT: movdqu %xmm2, 16(%esi,%ecx,4)
759 ; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4)
754 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
755 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
756 ; X86-SSE-NEXT: movdqu %xmm3, 32(%esi,%ecx,4)
757 ; X86-SSE-NEXT: movdqu %xmm1, 48(%esi,%ecx,4)
758 ; X86-SSE-NEXT: movdqu %xmm2, (%esi,%ecx,4)
759 ; X86-SSE-NEXT: movdqu %xmm0, 16(%esi,%ecx,4)
760760 ; X86-SSE-NEXT: popl %esi
761761 ; X86-SSE-NEXT: retl
762762 ;
817817 ; X64-SSE-NEXT: pmulhuw %xmm0, %xmm4
818818 ; X64-SSE-NEXT: pmullw %xmm0, %xmm2
819819 ; X64-SSE-NEXT: movdqa %xmm2, %xmm0
820 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
821 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
820 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
821 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
822822 ; X64-SSE-NEXT: movdqa %xmm3, %xmm4
823823 ; X64-SSE-NEXT: pmulhuw %xmm1, %xmm4
824824 ; X64-SSE-NEXT: pmullw %xmm1, %xmm3
825825 ; X64-SSE-NEXT: movdqa %xmm3, %xmm1
826 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
827 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
828 ; X64-SSE-NEXT: movdqu %xmm3, 48(%rax,%rdx,4)
829 ; X64-SSE-NEXT: movdqu %xmm1, 32(%rax,%rdx,4)
830 ; X64-SSE-NEXT: movdqu %xmm2, 16(%rax,%rdx,4)
831 ; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4)
826 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
827 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
828 ; X64-SSE-NEXT: movdqu %xmm3, 32(%rax,%rdx,4)
829 ; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4)
830 ; X64-SSE-NEXT: movdqu %xmm2, (%rax,%rdx,4)
831 ; X64-SSE-NEXT: movdqu %xmm0, 16(%rax,%rdx,4)
832832 ; X64-SSE-NEXT: retq
833833 ;
834834 ; X64-AVX1-LABEL: mul_16xi16:
12611261 ; X86-SSE-NEXT: pmulhw %xmm0, %xmm4
12621262 ; X86-SSE-NEXT: pmullw %xmm0, %xmm2
12631263 ; X86-SSE-NEXT: movdqa %xmm2, %xmm0
1264 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
1265 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
1264 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
1265 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
12661266 ; X86-SSE-NEXT: movdqa %xmm3, %xmm4
12671267 ; X86-SSE-NEXT: pmulhw %xmm1, %xmm4
12681268 ; X86-SSE-NEXT: pmullw %xmm1, %xmm3
12691269 ; X86-SSE-NEXT: movdqa %xmm3, %xmm1
1270 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
1271 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1272 ; X86-SSE-NEXT: movdqu %xmm3, 48(%esi,%ecx,4)
1273 ; X86-SSE-NEXT: movdqu %xmm1, 32(%esi,%ecx,4)
1274 ; X86-SSE-NEXT: movdqu %xmm2, 16(%esi,%ecx,4)
1275 ; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4)
1270 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
1271 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
1272 ; X86-SSE-NEXT: movdqu %xmm3, 32(%esi,%ecx,4)
1273 ; X86-SSE-NEXT: movdqu %xmm1, 48(%esi,%ecx,4)
1274 ; X86-SSE-NEXT: movdqu %xmm2, (%esi,%ecx,4)
1275 ; X86-SSE-NEXT: movdqu %xmm0, 16(%esi,%ecx,4)
12761276 ; X86-SSE-NEXT: popl %esi
12771277 ; X86-SSE-NEXT: retl
12781278 ;
13331333 ; X64-SSE-NEXT: pmulhw %xmm0, %xmm4
13341334 ; X64-SSE-NEXT: pmullw %xmm0, %xmm2
13351335 ; X64-SSE-NEXT: movdqa %xmm2, %xmm0
1336 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
1337 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
1336 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
1337 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
13381338 ; X64-SSE-NEXT: movdqa %xmm3, %xmm4
13391339 ; X64-SSE-NEXT: pmulhw %xmm1, %xmm4
13401340 ; X64-SSE-NEXT: pmullw %xmm1, %xmm3
13411341 ; X64-SSE-NEXT: movdqa %xmm3, %xmm1
1342 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
1343 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1344 ; X64-SSE-NEXT: movdqu %xmm3, 48(%rax,%rdx,4)
1345 ; X64-SSE-NEXT: movdqu %xmm1, 32(%rax,%rdx,4)
1346 ; X64-SSE-NEXT: movdqu %xmm2, 16(%rax,%rdx,4)
1347 ; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4)
1342 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
1343 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
1344 ; X64-SSE-NEXT: movdqu %xmm3, 32(%rax,%rdx,4)
1345 ; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4)
1346 ; X64-SSE-NEXT: movdqu %xmm2, (%rax,%rdx,4)
1347 ; X64-SSE-NEXT: movdqu %xmm0, 16(%rax,%rdx,4)
13481348 ; X64-SSE-NEXT: retq
13491349 ;
13501350 ; X64-AVX1-LABEL: mul_16xi16_sext:
739739 ; X86-SSE-NEXT: pmulhuw %xmm0, %xmm4
740740 ; X86-SSE-NEXT: pmullw %xmm0, %xmm2
741741 ; X86-SSE-NEXT: movdqa %xmm2, %xmm0
742 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
743 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
742 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
743 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
744744 ; X86-SSE-NEXT: movdqa %xmm3, %xmm4
745745 ; X86-SSE-NEXT: pmulhuw %xmm1, %xmm4
746746 ; X86-SSE-NEXT: pmullw %xmm1, %xmm3
747747 ; X86-SSE-NEXT: movdqa %xmm3, %xmm1
748 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
749 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
750 ; X86-SSE-NEXT: movdqu %xmm3, 48(%esi,%ecx,4)
751 ; X86-SSE-NEXT: movdqu %xmm1, 32(%esi,%ecx,4)
752 ; X86-SSE-NEXT: movdqu %xmm2, 16(%esi,%ecx,4)
753 ; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4)
748 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
749 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
750 ; X86-SSE-NEXT: movdqu %xmm3, 32(%esi,%ecx,4)
751 ; X86-SSE-NEXT: movdqu %xmm1, 48(%esi,%ecx,4)
752 ; X86-SSE-NEXT: movdqu %xmm2, (%esi,%ecx,4)
753 ; X86-SSE-NEXT: movdqu %xmm0, 16(%esi,%ecx,4)
754754 ; X86-SSE-NEXT: popl %esi
755755 ; X86-SSE-NEXT: retl
756756 ;
811811 ; X64-SSE-NEXT: pmulhuw %xmm0, %xmm4
812812 ; X64-SSE-NEXT: pmullw %xmm0, %xmm2
813813 ; X64-SSE-NEXT: movdqa %xmm2, %xmm0
814 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
815 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
814 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
815 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
816816 ; X64-SSE-NEXT: movdqa %xmm3, %xmm4
817817 ; X64-SSE-NEXT: pmulhuw %xmm1, %xmm4
818818 ; X64-SSE-NEXT: pmullw %xmm1, %xmm3
819819 ; X64-SSE-NEXT: movdqa %xmm3, %xmm1
820 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
821 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
822 ; X64-SSE-NEXT: movdqu %xmm3, 48(%rax,%rdx,4)
823 ; X64-SSE-NEXT: movdqu %xmm1, 32(%rax,%rdx,4)
824 ; X64-SSE-NEXT: movdqu %xmm2, 16(%rax,%rdx,4)
825 ; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4)
820 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
821 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
822 ; X64-SSE-NEXT: movdqu %xmm3, 32(%rax,%rdx,4)
823 ; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4)
824 ; X64-SSE-NEXT: movdqu %xmm2, (%rax,%rdx,4)
825 ; X64-SSE-NEXT: movdqu %xmm0, 16(%rax,%rdx,4)
826826 ; X64-SSE-NEXT: retq
827827 ;
828828 ; X64-AVX1-LABEL: mul_16xi16:
12391239 ; X86-SSE-NEXT: pmulhw %xmm0, %xmm4
12401240 ; X86-SSE-NEXT: pmullw %xmm0, %xmm2
12411241 ; X86-SSE-NEXT: movdqa %xmm2, %xmm0
1242 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
1243 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
1242 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
1243 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
12441244 ; X86-SSE-NEXT: movdqa %xmm3, %xmm4
12451245 ; X86-SSE-NEXT: pmulhw %xmm1, %xmm4
12461246 ; X86-SSE-NEXT: pmullw %xmm1, %xmm3
12471247 ; X86-SSE-NEXT: movdqa %xmm3, %xmm1
1248 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
1249 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1250 ; X86-SSE-NEXT: movdqu %xmm3, 48(%esi,%ecx,4)
1251 ; X86-SSE-NEXT: movdqu %xmm1, 32(%esi,%ecx,4)
1252 ; X86-SSE-NEXT: movdqu %xmm2, 16(%esi,%ecx,4)
1253 ; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4)
1248 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
1249 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
1250 ; X86-SSE-NEXT: movdqu %xmm3, 32(%esi,%ecx,4)
1251 ; X86-SSE-NEXT: movdqu %xmm1, 48(%esi,%ecx,4)
1252 ; X86-SSE-NEXT: movdqu %xmm2, (%esi,%ecx,4)
1253 ; X86-SSE-NEXT: movdqu %xmm0, 16(%esi,%ecx,4)
12541254 ; X86-SSE-NEXT: popl %esi
12551255 ; X86-SSE-NEXT: retl
12561256 ;
13111311 ; X64-SSE-NEXT: pmulhw %xmm0, %xmm4
13121312 ; X64-SSE-NEXT: pmullw %xmm0, %xmm2
13131313 ; X64-SSE-NEXT: movdqa %xmm2, %xmm0
1314 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
1315 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
1314 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
1315 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
13161316 ; X64-SSE-NEXT: movdqa %xmm3, %xmm4
13171317 ; X64-SSE-NEXT: pmulhw %xmm1, %xmm4
13181318 ; X64-SSE-NEXT: pmullw %xmm1, %xmm3
13191319 ; X64-SSE-NEXT: movdqa %xmm3, %xmm1
1320 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
1321 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1322 ; X64-SSE-NEXT: movdqu %xmm3, 48(%rax,%rdx,4)
1323 ; X64-SSE-NEXT: movdqu %xmm1, 32(%rax,%rdx,4)
1324 ; X64-SSE-NEXT: movdqu %xmm2, 16(%rax,%rdx,4)
1325 ; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4)
1320 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
1321 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
1322 ; X64-SSE-NEXT: movdqu %xmm3, 32(%rax,%rdx,4)
1323 ; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4)
1324 ; X64-SSE-NEXT: movdqu %xmm2, (%rax,%rdx,4)
1325 ; X64-SSE-NEXT: movdqu %xmm0, 16(%rax,%rdx,4)
13261326 ; X64-SSE-NEXT: retq
13271327 ;
13281328 ; X64-AVX1-LABEL: mul_16xi16_sext:
459459 ; AVX1: # %bb.0:
460460 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
461461 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
462 ; AVX1-NEXT: vpsubsw 16(%rsi), %xmm1, %xmm1
462463 ; AVX1-NEXT: vpsubsw (%rsi), %xmm0, %xmm0
463 ; AVX1-NEXT: vpsubsw 16(%rsi), %xmm1, %xmm1
464 ; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
464465 ; AVX1-NEXT: vmovq %xmm1, 16(%rdx)
465 ; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
466466 ; AVX1-NEXT: retq
467467 ;
468468 ; AVX2-LABEL: v12i16:
459459 ; AVX1: # %bb.0:
460460 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
461461 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
462 ; AVX1-NEXT: vpaddusw 16(%rsi), %xmm1, %xmm1
462463 ; AVX1-NEXT: vpaddusw (%rsi), %xmm0, %xmm0
463 ; AVX1-NEXT: vpaddusw 16(%rsi), %xmm1, %xmm1
464 ; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
464465 ; AVX1-NEXT: vmovq %xmm1, 16(%rdx)
465 ; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
466466 ; AVX1-NEXT: retq
467467 ;
468468 ; AVX2-LABEL: v12i16:
459459 ; AVX1: # %bb.0:
460460 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
461461 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
462 ; AVX1-NEXT: vpsubusw 16(%rsi), %xmm1, %xmm1
462463 ; AVX1-NEXT: vpsubusw (%rsi), %xmm0, %xmm0
463 ; AVX1-NEXT: vpsubusw 16(%rsi), %xmm1, %xmm1
464 ; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
464465 ; AVX1-NEXT: vmovq %xmm1, 16(%rdx)
465 ; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
466466 ; AVX1-NEXT: retq
467467 ;
468468 ; AVX2-LABEL: v12i16:
2020 ; CHECK-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp)
2121 ; CHECK-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp)
2222 ; CHECK-NEXT: LBB0_2: ## %entry
23 ; CHECK-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
24 ; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
25 ; CHECK-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
26 ; CHECK-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
2327 ; CHECK-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
24 ; CHECK-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
25 ; CHECK-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
26 ; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
27 ; CHECK-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
2828 ; CHECK-NEXT: xorl %eax, %eax
2929 ; CHECK-NEXT: testl $512, %edi ## imm = 0x200
3030 ; CHECK-NEXT: je LBB0_4
185185 ; X32-SSE: # %bb.0: # %entry
186186 ; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
187187 ; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
188 ; X32-SSE-NEXT: cvtps2pd (%ecx), %xmm0 # encoding: [0x0f,0x5a,0x01]
189 ; X32-SSE-NEXT: cvtps2pd 8(%ecx), %xmm1 # encoding: [0x0f,0x5a,0x49,0x08]
190 ; X32-SSE-NEXT: cvtps2pd 16(%ecx), %xmm2 # encoding: [0x0f,0x5a,0x51,0x10]
191 ; X32-SSE-NEXT: cvtps2pd 24(%ecx), %xmm3 # encoding: [0x0f,0x5a,0x59,0x18]
192 ; X32-SSE-NEXT: movups %xmm3, 48(%eax) # encoding: [0x0f,0x11,0x58,0x30]
193 ; X32-SSE-NEXT: movups %xmm2, 32(%eax) # encoding: [0x0f,0x11,0x50,0x20]
194 ; X32-SSE-NEXT: movups %xmm1, 16(%eax) # encoding: [0x0f,0x11,0x48,0x10]
195 ; X32-SSE-NEXT: movups %xmm0, (%eax) # encoding: [0x0f,0x11,0x00]
188 ; X32-SSE-NEXT: cvtps2pd 8(%ecx), %xmm0 # encoding: [0x0f,0x5a,0x41,0x08]
189 ; X32-SSE-NEXT: cvtps2pd (%ecx), %xmm1 # encoding: [0x0f,0x5a,0x09]
190 ; X32-SSE-NEXT: cvtps2pd 24(%ecx), %xmm2 # encoding: [0x0f,0x5a,0x51,0x18]
191 ; X32-SSE-NEXT: cvtps2pd 16(%ecx), %xmm3 # encoding: [0x0f,0x5a,0x59,0x10]
192 ; X32-SSE-NEXT: movups %xmm3, 32(%eax) # encoding: [0x0f,0x11,0x58,0x20]
193 ; X32-SSE-NEXT: movups %xmm2, 48(%eax) # encoding: [0x0f,0x11,0x50,0x30]
194 ; X32-SSE-NEXT: movups %xmm1, (%eax) # encoding: [0x0f,0x11,0x08]
195 ; X32-SSE-NEXT: movups %xmm0, 16(%eax) # encoding: [0x0f,0x11,0x40,0x10]
196196 ; X32-SSE-NEXT: retl # encoding: [0xc3]
197197 ;
198198 ; X32-AVX-LABEL: fpext_frommem8:
217217 ;
218218 ; X64-SSE-LABEL: fpext_frommem8:
219219 ; X64-SSE: # %bb.0: # %entry
220 ; X64-SSE-NEXT: cvtps2pd (%rdi), %xmm0 # encoding: [0x0f,0x5a,0x07]
221 ; X64-SSE-NEXT: cvtps2pd 8(%rdi), %xmm1 # encoding: [0x0f,0x5a,0x4f,0x08]
222 ; X64-SSE-NEXT: cvtps2pd 16(%rdi), %xmm2 # encoding: [0x0f,0x5a,0x57,0x10]
223 ; X64-SSE-NEXT: cvtps2pd 24(%rdi), %xmm3 # encoding: [0x0f,0x5a,0x5f,0x18]
224 ; X64-SSE-NEXT: movups %xmm3, 48(%rsi) # encoding: [0x0f,0x11,0x5e,0x30]
225 ; X64-SSE-NEXT: movups %xmm2, 32(%rsi) # encoding: [0x0f,0x11,0x56,0x20]
226 ; X64-SSE-NEXT: movups %xmm1, 16(%rsi) # encoding: [0x0f,0x11,0x4e,0x10]
227 ; X64-SSE-NEXT: movups %xmm0, (%rsi) # encoding: [0x0f,0x11,0x06]
220 ; X64-SSE-NEXT: cvtps2pd 8(%rdi), %xmm0 # encoding: [0x0f,0x5a,0x47,0x08]
221 ; X64-SSE-NEXT: cvtps2pd (%rdi), %xmm1 # encoding: [0x0f,0x5a,0x0f]
222 ; X64-SSE-NEXT: cvtps2pd 24(%rdi), %xmm2 # encoding: [0x0f,0x5a,0x57,0x18]
223 ; X64-SSE-NEXT: cvtps2pd 16(%rdi), %xmm3 # encoding: [0x0f,0x5a,0x5f,0x10]
224 ; X64-SSE-NEXT: movups %xmm3, 32(%rsi) # encoding: [0x0f,0x11,0x5e,0x20]
225 ; X64-SSE-NEXT: movups %xmm2, 48(%rsi) # encoding: [0x0f,0x11,0x56,0x30]
226 ; X64-SSE-NEXT: movups %xmm1, (%rsi) # encoding: [0x0f,0x11,0x0e]
227 ; X64-SSE-NEXT: movups %xmm0, 16(%rsi) # encoding: [0x0f,0x11,0x46,0x10]
228228 ; X64-SSE-NEXT: retq # encoding: [0xc3]
229229 ;
230230 ; X64-AVX-LABEL: fpext_frommem8:
2020 ; CHECK-NEXT: movdqa 16(%edx,%eax), %xmm2
2121 ; CHECK-NEXT: psubw %xmm0, %xmm1
2222 ; CHECK-NEXT: psubw %xmm0, %xmm2
23 ; CHECK-NEXT: movd %xmm2, 16(%ecx,%eax)
24 ; CHECK-NEXT: pextrd $1, %xmm2, 20(%ecx,%eax)
2325 ; CHECK-NEXT: pextrd $2, %xmm2, 24(%ecx,%eax)
24 ; CHECK-NEXT: pextrd $1, %xmm2, 20(%ecx,%eax)
25 ; CHECK-NEXT: movd %xmm2, 16(%ecx,%eax)
2626 ; CHECK-NEXT: movdqa %xmm1, (%ecx,%eax)
2727 ; CHECK-NEXT: incl (%esp)
2828 ; CHECK-NEXT: cmpl $3, (%esp)
4646 ; X86-NEXT: pinsrd $1, 4(%ecx), %xmm1
4747 ; X86-NEXT: pinsrd $2, 8(%ecx), %xmm1
4848 ; X86-NEXT: paddd %xmm0, %xmm1
49 ; X86-NEXT: pextrd $1, %xmm1, 4(%eax)
4950 ; X86-NEXT: pextrd $2, %xmm1, 8(%eax)
50 ; X86-NEXT: pextrd $1, %xmm1, 4(%eax)
5151 ; X86-NEXT: movd %xmm1, (%eax)
5252 ; X86-NEXT: retl $4
5353 ;
8080 ; X86-NEXT: movdqa 16(%edx), %xmm1
8181 ; X86-NEXT: paddd (%ecx), %xmm0
8282 ; X86-NEXT: paddd 16(%ecx), %xmm1
83 ; X86-NEXT: movd %xmm1, 16(%eax)
84 ; X86-NEXT: pextrd $1, %xmm1, 20(%eax)
8385 ; X86-NEXT: pextrd $2, %xmm1, 24(%eax)
84 ; X86-NEXT: pextrd $1, %xmm1, 20(%eax)
85 ; X86-NEXT: movd %xmm1, 16(%eax)
8686 ; X86-NEXT: movdqa %xmm0, (%eax)
8787 ; X86-NEXT: retl $4
8888 ;
9393 ; X64-NEXT: movdqa 16(%rsi), %xmm1
9494 ; X64-NEXT: paddd (%rdx), %xmm0
9595 ; X64-NEXT: paddd 16(%rdx), %xmm1
96 ; X64-NEXT: movq %xmm1, 16(%rdi)
9697 ; X64-NEXT: pextrd $2, %xmm1, 24(%rdi)
97 ; X64-NEXT: movq %xmm1, 16(%rdi)
9898 ; X64-NEXT: movdqa %xmm0, (%rdi)
9999 ; X64-NEXT: retq
100100 %a = load %i32vec7, %i32vec7* %ap, align 16
115115 ; X86-NEXT: movdqa (%edx), %xmm1
116116 ; X86-NEXT: movdqa 16(%edx), %xmm2
117117 ; X86-NEXT: paddd (%ecx), %xmm1
118 ; X86-NEXT: paddd 32(%ecx), %xmm0
118119 ; X86-NEXT: paddd 16(%ecx), %xmm2
119 ; X86-NEXT: paddd 32(%ecx), %xmm0
120 ; X86-NEXT: movdqa %xmm2, 16(%eax)
120121 ; X86-NEXT: movdqa %xmm0, 32(%eax)
121 ; X86-NEXT: movdqa %xmm2, 16(%eax)
122122 ; X86-NEXT: movdqa %xmm1, (%eax)
123123 ; X86-NEXT: retl $4
124124 ;
129129 ; X64-NEXT: movdqa 16(%rsi), %xmm1
130130 ; X64-NEXT: movdqa 32(%rsi), %xmm2
131131 ; X64-NEXT: paddd (%rdx), %xmm0
132 ; X64-NEXT: paddd 32(%rdx), %xmm2
132133 ; X64-NEXT: paddd 16(%rdx), %xmm1
133 ; X64-NEXT: paddd 32(%rdx), %xmm2
134 ; X64-NEXT: movdqa %xmm1, 16(%rdi)
134135 ; X64-NEXT: movdqa %xmm2, 32(%rdi)
135 ; X64-NEXT: movdqa %xmm1, 16(%rdi)
136136 ; X64-NEXT: movdqa %xmm0, (%rdi)
137137 ; X64-NEXT: retq
138138 %a = load %i32vec12, %i32vec12* %ap, align 16
224224 ; X86-NEXT: movdqa 16(%edx), %xmm1
225225 ; X86-NEXT: paddw (%ecx), %xmm0
226226 ; X86-NEXT: paddw 16(%ecx), %xmm1
227 ; X86-NEXT: movd %xmm1, 16(%eax)
227228 ; X86-NEXT: pextrd $1, %xmm1, 20(%eax)
228 ; X86-NEXT: movd %xmm1, 16(%eax)
229229 ; X86-NEXT: movdqa %xmm0, (%eax)
230230 ; X86-NEXT: retl $4
231231 ;
257257 ; X86-NEXT: movdqa (%edx), %xmm1
258258 ; X86-NEXT: movdqa 16(%edx), %xmm2
259259 ; X86-NEXT: paddw (%ecx), %xmm1
260 ; X86-NEXT: paddw 32(%ecx), %xmm0
260261 ; X86-NEXT: paddw 16(%ecx), %xmm2
261 ; X86-NEXT: paddw 32(%ecx), %xmm0
262 ; X86-NEXT: movdqa %xmm2, 16(%eax)
262263 ; X86-NEXT: movd %xmm0, 32(%eax)
263 ; X86-NEXT: movdqa %xmm2, 16(%eax)
264264 ; X86-NEXT: movdqa %xmm1, (%eax)
265265 ; X86-NEXT: retl $4
266266 ;
271271 ; X64-NEXT: movdqa 16(%rsi), %xmm1
272272 ; X64-NEXT: movdqa 32(%rsi), %xmm2
273273 ; X64-NEXT: paddw (%rdx), %xmm0
274 ; X64-NEXT: paddw 32(%rdx), %xmm2
274275 ; X64-NEXT: paddw 16(%rdx), %xmm1
275 ; X64-NEXT: paddw 32(%rdx), %xmm2
276 ; X64-NEXT: movdqa %xmm1, 16(%rdi)
276277 ; X64-NEXT: movd %xmm2, 32(%rdi)
277 ; X64-NEXT: movdqa %xmm1, 16(%rdi)
278278 ; X64-NEXT: movdqa %xmm0, (%rdi)
279279 ; X64-NEXT: retq
280280 %a = load %i16vec18, %i16vec18* %ap, align 16
330330 ; X86-NEXT: movdqa 16(%edx), %xmm1
331331 ; X86-NEXT: paddb (%ecx), %xmm0
332332 ; X86-NEXT: paddb 16(%ecx), %xmm1
333 ; X86-NEXT: movd %xmm1, 16(%eax)
334 ; X86-NEXT: pextrd $1, %xmm1, 20(%eax)
335 ; X86-NEXT: pextrd $2, %xmm1, 24(%eax)
336 ; X86-NEXT: pextrw $6, %xmm1, 28(%eax)
333337 ; X86-NEXT: pextrb $14, %xmm1, 30(%eax)
334 ; X86-NEXT: pextrw $6, %xmm1, 28(%eax)
335 ; X86-NEXT: pextrd $2, %xmm1, 24(%eax)
336 ; X86-NEXT: pextrd $1, %xmm1, 20(%eax)
337 ; X86-NEXT: movd %xmm1, 16(%eax)
338338 ; X86-NEXT: movdqa %xmm0, (%eax)
339339 ; X86-NEXT: retl $4
340340 ;
345345 ; X64-NEXT: movdqa 16(%rsi), %xmm1
346346 ; X64-NEXT: paddb (%rdx), %xmm0
347347 ; X64-NEXT: paddb 16(%rdx), %xmm1
348 ; X64-NEXT: movq %xmm1, 16(%rdi)
349 ; X64-NEXT: pextrd $2, %xmm1, 24(%rdi)
350 ; X64-NEXT: pextrw $6, %xmm1, 28(%rdi)
348351 ; X64-NEXT: pextrb $14, %xmm1, 30(%rdi)
349 ; X64-NEXT: pextrw $6, %xmm1, 28(%rdi)
350 ; X64-NEXT: pextrd $2, %xmm1, 24(%rdi)
351 ; X64-NEXT: movq %xmm1, 16(%rdi)
352352 ; X64-NEXT: movdqa %xmm0, (%rdi)
353353 ; X64-NEXT: retq
354354 %a = load %i8vec31, %i8vec31* %ap, align 16
2828 ; ALL-NEXT: movq %rsp, %rbp
2929 ; ALL-NEXT: .seh_setframe 5, 0
3030 ; ALL-NEXT: .seh_endprologue
31 ; ALL-NEXT: movq %rdx, 32(%rbp)
32 ; ALL-NEXT: movq %r8, 40(%rbp)
3133 ; ALL-NEXT: movq %r9, 48(%rbp)
32 ; ALL-NEXT: movq %r8, 40(%rbp)
33 ; ALL-NEXT: movq %rdx, 32(%rbp)
3434 ; ALL-NEXT: leaq 32(%rbp), %rax
3535 ; ALL-NEXT: movq %rax, (%rbp)
3636 ; ALL-NEXT: addq $8, %rsp
55 define void @average_va(i32 %count, ...) nounwind {
66 entry:
77 ; CHECK: pushq
8 ; CHECK: movq %r9, 40(%rsp)
9 ; CHECK: movq %r8, 32(%rsp)
10 ; CHECK: movq %rdx, 24(%rsp)
8 ; CHECK-DAG: movq %r9, 40(%rsp)
9 ; CHECK-DAG: movq %r8, 32(%rsp)
10 ; CHECK-DAG: movq %rdx, 24(%rsp)
1111 ; CHECK: leaq 24(%rsp), %rax
1212
1313 %ap = alloca i8*, align 8 ; [#uses=1]
55 define win64cc void @average_va(i32 %count, ...) nounwind {
66 entry:
77 ; CHECK: pushq
8 ; CHECK: movq %r9, 40(%rsp)
9 ; CHECK: movq %r8, 32(%rsp)
10 ; CHECK: movq %rdx, 24(%rsp)
8 ; CHECK-DAG: movq %r9, 40(%rsp)
9 ; CHECK-DAG: movq %r8, 32(%rsp)
10 ; CHECK-DAG: movq %rdx, 24(%rsp)
1111 ; CHECK: leaq 24(%rsp), %rax
1212
1313 %ap = alloca i8*, align 8 ; [#uses=1]
5858
5959 ; CHECK-LABEL: copy1:
6060 ; CHECK: leaq 32(%rsp), [[REG_copy1:%[a-z]+]]
61 ; CHECK: movq [[REG_copy1]], 8(%rsp)
62 ; CHECK: movq [[REG_copy1]], (%rsp)
61 ; CHECK-DAG: movq [[REG_copy1]], 8(%rsp)
62 ; CHECK-DAG: movq [[REG_copy1]], (%rsp)
6363 ; CHECK: ret
6464 define win64cc void @copy1(i64 %a0, ...) nounwind {
6565 entry:
3838 ; CHECK: extsp 4
3939 ; CHECK: stw lr, sp[1]
4040 ; CHECK: mov r11, r1
41 ; CHECK: stw r2, sp[3]
42 ; CHECK: stw r3, sp[4]
41 ; CHECK-DAG: stw r2, sp[3]
42 ; CHECK-DAG: stw r3, sp[4]
4343 ; CHECK: ldw r0, r0[0]
4444 ; CHECK: stw r0, sp[2]
4545 ; CHECK: ldaw r1, sp[2]