llvm.org GIT mirror: llvm / be2cd40

[X86][SSE] Propagate undef upper elements from scalar_to_vector during shuffle combining

Only do this for integer types currently - float types (in particular insertps) load folding often fails with this.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@295208 91177308-0d34-0410-b5e6-96231b3b80d8
Simon Pilgrim, 3 years ago
8 changed files with 37 additions and 47 deletions.
@@ -5617,10 +5617,16 @@
     }

     // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
+    // TODO: We currently only set UNDEF for integer types - floats use the same
+    // registers as vectors and many of the scalar folded loads rely on the
+    // SCALAR_TO_VECTOR pattern.
     if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
         (Size % V.getValueType().getVectorNumElements()) == 0) {
       int Scale = Size / V.getValueType().getVectorNumElements();
-      if (((M / Scale) == 0) && X86::isZeroNode(V.getOperand(0)))
+      int Idx = M / Scale;
+      if (Idx != 0 && !VT.isFloatingPoint())
+        Mask[i] = SM_SentinelUndef;
+      else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
         Mask[i] = SM_SentinelZero;
       continue;
     }
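The core of the change is easiest to see in isolation. Below is a minimal standalone C++ sketch of the mask rewrite (the helper name and toy types are hypothetical, not the LLVM API): a SCALAR_TO_VECTOR node defines only element 0 of its result, so any mask reference to a higher source element can be turned into an undef sentinel, and a reference to element 0 of a known-zero scalar into a zero sentinel. The float case is deliberately skipped, matching the TODO above.

#include <cassert>
#include <cstdio>
#include <vector>

enum : int { SM_SentinelUndef = -1, SM_SentinelZero = -2 };

// Toy model of the transform above (hypothetical helper, not the LLVM API).
// Mask indexes a vector whose operand is a SCALAR_TO_VECTOR node with
// NumSrcElts elements; only source element 0 is defined.
static void propagateScalarToVector(std::vector<int> &Mask, int NumSrcElts,
                                    bool ScalarIsZero, bool IsFloatVT) {
  assert(int(Mask.size()) % NumSrcElts == 0 && "mask size must be a multiple");
  int Scale = int(Mask.size()) / NumSrcElts;
  for (int &M : Mask) {
    if (M < 0)
      continue;                      // already a sentinel
    int Idx = M / Scale;             // which source element M reads
    if (Idx != 0 && !IsFloatVT)
      M = SM_SentinelUndef;          // upper elements were never defined
    else if (Idx == 0 && ScalarIsZero)
      M = SM_SentinelZero;           // element 0 is a known zero
  }
}

int main() {
  // v4i32 t = scalar_to_vector(x); a shuffle of t with mask <0,1,2,3>
  // becomes <0,undef,undef,undef>, so a cheaper shuffle can be picked.
  std::vector<int> Mask = {0, 1, 2, 3};
  propagateScalarToVector(Mask, /*NumSrcElts=*/4, /*ScalarIsZero=*/false,
                          /*IsFloatVT=*/false);
  for (int M : Mask)
    std::printf("%d ", M);           // prints: 0 -1 -1 -1
  std::printf("\n");
  return 0;
}

This is why the test checks below flip from full-width shuffles such as punpcklwd and vpshufb to cheaper ones such as pshuflw, or drop a vpshufhw entirely: once the upper lanes are known undef, shuffle lowering only has to get the defined lanes right.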
@@ -448,7 +448,7 @@
 ; CHECK: # BB#0: # %entry
 ; CHECK-NEXT: movq {{.*}}(%rip), %rax
 ; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
 ; CHECK-NEXT: psrad $16, %xmm0
 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
 ; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
@@ -834,7 +834,7 @@
 ; CHECK: # BB#0: # %entry
 ; CHECK-NEXT: movq {{.*}}(%rip), %rax
 ; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
 ; CHECK-NEXT: psrad $16, %xmm0
 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
 ; CHECK-NEXT: movl $32768, %ecx # imm = 0x8000
@@ -149,7 +149,7 @@
 ; X32-NEXT: subl $8, %esp
 ; X32-NEXT: movq %mm0, (%esp)
 ; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,0,1]
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
 ; X32-NEXT: movd %xmm0, %eax
 ; X32-NEXT: movl %ebp, %esp
 ; X32-NEXT: popl %ebp
@@ -2660,7 +2660,7 @@
 ; SSE-LABEL: sitofp_load_2i16_to_2f64:
 ; SSE: # BB#0:
 ; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
 ; SSE-NEXT: psrad $16, %xmm0
 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
 ; SSE-NEXT: retq
@@ -2821,7 +2821,7 @@
 ; AVX1-NEXT: shlq $32, %rdx
 ; AVX1-NEXT: orq %rcx, %rdx
 ; AVX1-NEXT: vmovq %rdx, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: cvt_4f32_to_8i16_undef:
@@ -2846,7 +2846,7 @@
 ; AVX2-NEXT: shlq $32, %rdx
 ; AVX2-NEXT: orq %rcx, %rdx
 ; AVX2-NEXT: vmovq %rdx, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: cvt_4f32_to_8i16_undef:
@@ -2872,7 +2872,7 @@
 ; AVX512F-NEXT: shlq $32, %rdx
 ; AVX512F-NEXT: orq %rcx, %rdx
 ; AVX512F-NEXT: vmovq %rdx, %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-LABEL: cvt_4f32_to_8i16_undef:
@@ -2898,7 +2898,6 @@
 ; AVX512VL-NEXT: orq %rcx, %rdx
 ; AVX512VL-NEXT: vmovq %rdx, %xmm0
 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; AVX512VL-NEXT: retq
 %1 = fptrunc <4 x float> %a0 to <4 x half>
@@ -2930,7 +2929,7 @@
 ; AVX1-NEXT: shlq $32, %rdx
 ; AVX1-NEXT: orq %rcx, %rdx
 ; AVX1-NEXT: vmovq %rdx, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: cvt_4f32_to_8i16_zero:
@@ -2955,7 +2954,7 @@
 ; AVX2-NEXT: shlq $32, %rdx
 ; AVX2-NEXT: orq %rcx, %rdx
 ; AVX2-NEXT: vmovq %rdx, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: cvt_4f32_to_8i16_zero:
@@ -2981,7 +2980,7 @@
 ; AVX512F-NEXT: shlq $32, %rdx
 ; AVX512F-NEXT: orq %rcx, %rdx
 ; AVX512F-NEXT: vmovq %rdx, %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-LABEL: cvt_4f32_to_8i16_zero:
@@ -3007,7 +3006,6 @@
 ; AVX512VL-NEXT: orq %rcx, %rdx
 ; AVX512VL-NEXT: vmovq %rdx, %xmm0
 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
 ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512VL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
@@ -3630,7 +3628,7 @@
 ; AVX1-NEXT: shlq $32, %rdx
 ; AVX1-NEXT: orq %rcx, %rdx
 ; AVX1-NEXT: vmovq %rdx, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX1-NEXT: vmovdqa %xmm0, (%rdi)
 ; AVX1-NEXT: retq
 ;
@@ -3656,7 +3654,7 @@
 ; AVX2-NEXT: shlq $32, %rdx
 ; AVX2-NEXT: orq %rcx, %rdx
 ; AVX2-NEXT: vmovq %rdx, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX2-NEXT: vmovdqa %xmm0, (%rdi)
 ; AVX2-NEXT: retq
 ;
@@ -3683,7 +3681,7 @@
 ; AVX512F-NEXT: shlq $32, %rdx
 ; AVX512F-NEXT: orq %rcx, %rdx
 ; AVX512F-NEXT: vmovq %rdx, %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX512F-NEXT: vmovdqa %xmm0, (%rdi)
 ; AVX512F-NEXT: retq
 ;
@@ -3710,7 +3708,6 @@
 ; AVX512VL-NEXT: orq %rcx, %rdx
 ; AVX512VL-NEXT: vmovq %rdx, %xmm0
 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; AVX512VL-NEXT: vmovdqa %xmm0, (%rdi)
 ; AVX512VL-NEXT: retq
@@ -3744,7 +3741,7 @@
 ; AVX1-NEXT: shlq $32, %rdx
 ; AVX1-NEXT: orq %rcx, %rdx
 ; AVX1-NEXT: vmovq %rdx, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX1-NEXT: vmovdqa %xmm0, (%rdi)
 ; AVX1-NEXT: retq
 ;
@@ -3770,7 +3767,7 @@
 ; AVX2-NEXT: shlq $32, %rdx
 ; AVX2-NEXT: orq %rcx, %rdx
 ; AVX2-NEXT: vmovq %rdx, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT: vmovdqa %xmm0, (%rdi)
 ; AVX2-NEXT: retq
 ;
@@ -3797,7 +3794,7 @@
 ; AVX512F-NEXT: shlq $32, %rdx
 ; AVX512F-NEXT: orq %rcx, %rdx
 ; AVX512F-NEXT: vmovq %rdx, %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512F-NEXT: vmovdqa %xmm0, (%rdi)
 ; AVX512F-NEXT: retq
 ;
@@ -3824,7 +3821,6 @@
 ; AVX512VL-NEXT: orq %rcx, %rdx
 ; AVX512VL-NEXT: vmovq %rdx, %xmm0
 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
 ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512VL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
@@ -4476,7 +4472,7 @@
 ; AVX1-NEXT: shlq $32, %rax
 ; AVX1-NEXT: orq %r14, %rax
 ; AVX1-NEXT: vmovq %rax, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX1-NEXT: addq $40, %rsp
 ; AVX1-NEXT: popq %rbx
 ; AVX1-NEXT: popq %r14
@@ -4514,7 +4510,7 @@
 ; AVX2-NEXT: shlq $32, %rax
 ; AVX2-NEXT: orq %r14, %rax
 ; AVX2-NEXT: vmovq %rax, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX2-NEXT: addq $40, %rsp
 ; AVX2-NEXT: popq %rbx
 ; AVX2-NEXT: popq %r14
@@ -4549,7 +4545,7 @@
 ; AVX512F-NEXT: shlq $32, %rax
 ; AVX512F-NEXT: orq %r14, %rax
 ; AVX512F-NEXT: vmovq %rax, %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX512F-NEXT: addq $40, %rsp
 ; AVX512F-NEXT: popq %rbx
 ; AVX512F-NEXT: popq %r14
@@ -4585,7 +4581,6 @@
 ; AVX512VL-NEXT: orq %r14, %rax
 ; AVX512VL-NEXT: vmovq %rax, %xmm0
 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; AVX512VL-NEXT: addq $40, %rsp
 ; AVX512VL-NEXT: popq %rbx
@@ -4630,7 +4625,7 @@
 ; AVX1-NEXT: shlq $32, %rax
 ; AVX1-NEXT: orq %r14, %rax
 ; AVX1-NEXT: vmovq %rax, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX1-NEXT: addq $40, %rsp
 ; AVX1-NEXT: popq %rbx
 ; AVX1-NEXT: popq %r14
@@ -4668,7 +4663,7 @@
 ; AVX2-NEXT: shlq $32, %rax
 ; AVX2-NEXT: orq %r14, %rax
 ; AVX2-NEXT: vmovq %rax, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT: addq $40, %rsp
 ; AVX2-NEXT: popq %rbx
 ; AVX2-NEXT: popq %r14
@@ -4703,7 +4698,7 @@
 ; AVX512F-NEXT: shlq $32, %rax
 ; AVX512F-NEXT: orq %r14, %rax
 ; AVX512F-NEXT: vmovq %rax, %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512F-NEXT: addq $40, %rsp
 ; AVX512F-NEXT: popq %rbx
 ; AVX512F-NEXT: popq %r14
@@ -4739,7 +4734,6 @@
 ; AVX512VL-NEXT: orq %r14, %rax
 ; AVX512VL-NEXT: vmovq %rax, %xmm0
 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
 ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512VL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
@@ -5249,7 +5243,7 @@
 ; AVX1-NEXT: shlq $32, %rax
 ; AVX1-NEXT: orq %rbx, %rax
 ; AVX1-NEXT: vmovq %rax, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX1-NEXT: vmovdqa %xmm0, (%r14)
 ; AVX1-NEXT: addq $32, %rsp
 ; AVX1-NEXT: popq %rbx
@@ -5291,7 +5285,7 @@
 ; AVX2-NEXT: shlq $32, %rax
 ; AVX2-NEXT: orq %rbx, %rax
 ; AVX2-NEXT: vmovq %rax, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX2-NEXT: vmovdqa %xmm0, (%r14)
 ; AVX2-NEXT: addq $32, %rsp
 ; AVX2-NEXT: popq %rbx
@@ -5330,7 +5324,7 @@
 ; AVX512F-NEXT: shlq $32, %rax
 ; AVX512F-NEXT: orq %rbx, %rax
 ; AVX512F-NEXT: vmovq %rax, %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX512F-NEXT: vmovdqa %xmm0, (%r14)
 ; AVX512F-NEXT: addq $32, %rsp
 ; AVX512F-NEXT: popq %rbx
@@ -5370,7 +5364,6 @@
 ; AVX512VL-NEXT: orq %rbx, %rax
 ; AVX512VL-NEXT: vmovq %rax, %xmm0
 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; AVX512VL-NEXT: vmovdqa %xmm0, (%r14)
 ; AVX512VL-NEXT: addq $32, %rsp
@@ -5420,7 +5413,7 @@
 ; AVX1-NEXT: shlq $32, %rax
 ; AVX1-NEXT: orq %rbx, %rax
 ; AVX1-NEXT: vmovq %rax, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX1-NEXT: vmovdqa %xmm0, (%r14)
 ; AVX1-NEXT: addq $32, %rsp
 ; AVX1-NEXT: popq %rbx
@@ -5462,7 +5455,7 @@
 ; AVX2-NEXT: shlq $32, %rax
 ; AVX2-NEXT: orq %rbx, %rax
 ; AVX2-NEXT: vmovq %rax, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT: vmovdqa %xmm0, (%r14)
 ; AVX2-NEXT: addq $32, %rsp
 ; AVX2-NEXT: popq %rbx
@@ -5501,7 +5494,7 @@
 ; AVX512F-NEXT: shlq $32, %rax
 ; AVX512F-NEXT: orq %rbx, %rax
 ; AVX512F-NEXT: vmovq %rax, %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512F-NEXT: vmovdqa %xmm0, (%r14)
 ; AVX512F-NEXT: addq $32, %rsp
 ; AVX512F-NEXT: popq %rbx
@@ -5541,7 +5534,6 @@
 ; AVX512VL-NEXT: orq %rbx, %rax
 ; AVX512VL-NEXT: vmovq %rax, %xmm0
 ; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
 ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512VL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
@@ -4434,7 +4434,7 @@
 ; SSE2-LABEL: load_sext_2i16_to_2i64:
 ; SSE2: # BB#0: # %entry
 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
 ; SSE2-NEXT: movdqa %xmm0, %xmm1
 ; SSE2-NEXT: psrad $31, %xmm1
 ; SSE2-NEXT: psrad $16, %xmm0
@@ -4444,7 +4444,7 @@
 ; SSSE3-LABEL: load_sext_2i16_to_2i64:
 ; SSSE3: # BB#0: # %entry
 ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
 ; SSSE3-NEXT: movdqa %xmm0, %xmm1
 ; SSSE3-NEXT: psrad $31, %xmm1
 ; SSSE3-NEXT: psrad $16, %xmm0
@@ -105,8 +105,6 @@
 ; X64-SSE2: # BB#0: # %entry
 ; X64-SSE2-NEXT: movzwl (%rsi), %eax
 ; X64-SSE2-NEXT: movd %rax, %xmm0
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
 ; X64-SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
@@ -131,8 +129,6 @@
 ; X64-SSE42-NEXT: movzbl 2(%rsi), %eax
 ; X64-SSE42-NEXT: movzwl (%rsi), %ecx
 ; X64-SSE42-NEXT: movd %rcx, %xmm0
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; X64-SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; X64-SSE42-NEXT: pinsrd $2, %eax, %xmm0
 ; X64-SSE42-NEXT: pslld $24, %xmm0
@@ -130,8 +130,6 @@
 ; X64-SSE2: # BB#0: # %entry
 ; X64-SSE2-NEXT: movzwl (%rsi), %eax
 ; X64-SSE2-NEXT: movd %rax, %xmm0
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
 ; X64-SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
@@ -156,8 +154,6 @@
 ; X64-SSE42-NEXT: movzbl 2(%rsi), %eax
 ; X64-SSE42-NEXT: movzwl (%rsi), %ecx
 ; X64-SSE42-NEXT: movd %rcx, %xmm0
-; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; X64-SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; X64-SSE42-NEXT: pinsrd $2, %eax, %xmm0
 ; X64-SSE42-NEXT: pand {{.*}}(%rip), %xmm0