llvm.org GIT mirror llvm / b419ca4
[X86][SSE41] Avoid variable blend for constant v8i16 shifts

The SSE41 v8i16 shift lowering using (v)pblendvb is great for non-constant shift amounts, but if it is constant then we can efficiently reduce the VSELECT to shuffles with the pre-SSE41 lowering.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@263383 91177308-0d34-0410-b5e6-96231b3b80d8

Simon Pilgrim, 3 years ago
7 changed file(s) with 66 addition(s) and 131 deletion(s).
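For context, a minimal intrinsics sketch (my own illustration, not part of the commit) of the instruction-level difference the commit message describes: with a variable selection mask a v8i16 select has to go through _mm_blendv_epi8 (PBLENDVB), which needs the mask in a register, while a compile-time-constant mask lets the same word-granular select be expressed as _mm_blend_epi16 (PBLENDW) with an 8-bit immediate and no mask operand at all.

#include <smmintrin.h> // SSE4.1 intrinsics

// Variable mask: per-byte select; the mask is a runtime value and the
// non-VEX PBLENDVB encoding additionally pins it to xmm0.
__m128i select_v8i16_variable(__m128i a, __m128i b, __m128i mask) {
  return _mm_blendv_epi8(a, b, mask);
}

// Constant mask: the same select folds to a single PBLENDW with an
// immediate (here lanes 1-3 come from b, the rest from a; the lane
// pattern is just an example).
__m128i select_v8i16_constant(__m128i a, __m128i b) {
  return _mm_blend_epi16(a, b, 0x0E);
}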

   if (VT == MVT::v8i16) {
     unsigned ShiftOpcode = Op->getOpcode();
 
+    // If we have a constant shift amount, the non-SSE41 path is best as
+    // avoiding bitcasts make it easier to constant fold and reduce to PBLENDW.
+    bool UseSSE41 = Subtarget.hasSSE41() &&
+                    !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
+
     auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
       // On SSE41 targets we make use of the fact that VSELECT lowers
       // to PBLENDVB which selects bytes based just on the sign bit.
-      if (Subtarget.hasSSE41()) {
+      if (UseSSE41) {
         MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
         V0 = DAG.getBitcast(ExtVT, V0);
         V1 = DAG.getBitcast(ExtVT, V1);
...
     };
 
     // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
-    if (Subtarget.hasSSE41()) {
+    if (UseSSE41) {
       // On SSE41 targets we need to replicate the shift mask in both
       // bytes for PBLENDVB.
       Amt = DAG.getNode(
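The lowering touched above implements the v8i16 shift bit-serially: each lane's 4-bit amount is consumed one bit at a time, conditionally applying shifts of 8, 4, 2 and 1 via a VSELECT keyed off the lane's sign bit. A scalar model of that loop (my own sketch, with illustrative names) shows why constant amounts help: every per-lane "take the shifted value or not" decision is known at compile time, so each VSELECT folds to a fixed lane blend (PBLENDW) instead of a PBLENDVB fed by a loaded mask, which is exactly the case the new UseSSE41 flag routes onto the pre-SSE41 path.

#include <cstdint>

// Scalar model of the bit-serial left shift for one 16-bit lane.
// ShiftOpcode in the real lowering also covers the right-shift cases;
// only the shift direction changes.
uint16_t shl16_bit_serial(uint16_t v, unsigned amt) {
  const unsigned Steps[] = {8, 4, 2, 1};
  for (unsigned Step : Steps) {
    // In the DAG this branch is a vector select on the lane's sign bit
    // after the amount has been shifted into position (a = a << 12).
    if (amt & Step)
      v = static_cast<uint16_t>(v << Step);
  }
  return v;
}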

 ;
 ; SSE41-LABEL: constant_rotate_v8i16:
 ; SSE41: # BB#0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
-; SSE41-NEXT: pmullw %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: psrlw $8, %xmm3
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [256,61680,57568,53456,49344,45232,41120,37008]
-; SSE41-NEXT: pblendvb %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: psrlw $4, %xmm3
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [512,57824,49600,41376,33152,24928,16704,8480]
-; SSE41-NEXT: pblendvb %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: psrlw $2, %xmm3
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [1024,50112,33664,17216,768,49856,33408,16960]
-; SSE41-NEXT: pblendvb %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: psrlw $1, %xmm3
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [2048,34688,1792,34432,1536,34176,1280,33920]
-; SSE41-NEXT: pblendvb %xmm3, %xmm1
-; SSE41-NEXT: por %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
+; SSE41-NEXT: pmullw %xmm0, %xmm1
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psrlw $8, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: psrlw $4, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3,4],xmm2[5,6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: psrlw $2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2],xmm0[3,4],xmm2[5,6],xmm0[7]
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: psrlw $1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
+; SSE41-NEXT: por %xmm1, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: constant_rotate_v8i16:
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [256,61680,57568,53456,49344,45232,41120,37008]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [512,57824,49600,41376,33152,24928,16704,8480]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4],xmm0[5,6,7]
 ; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1024,50112,33664,17216,768,49856,33408,16960]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3,4],xmm2[5,6],xmm0[7]
 ; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2048,34688,1792,34432,1536,34176,1280,33920]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
 ; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT: retq
 ;

 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm3
 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [32896,28784,24672,20560,16448,12336,8224,4112]
-; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7]
 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [256,57568,49344,41120,32896,24672,16448,8224]
-; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3,4],xmm2[5,6,7]
 ; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [512,49600,33152,16704,256,49344,32896,16448]
-; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3,4],xmm3[5,6],xmm2[7]
 ; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1024,33664,768,33408,512,33152,256,32896]
-; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4],xmm3[5],xmm2[6],xmm3[7]
 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [256,61680,57568,53456,49344,45232,41120,37008]
-; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3,4,5,6,7]
 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [512,57824,49600,41376,33152,24928,16704,8480]
-; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3,4],xmm0[5,6,7]
 ; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1024,50112,33664,17216,768,49856,33408,16960]
-; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3,4],xmm3[5,6],xmm0[7]
 ; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [2048,34688,1792,34432,1536,34176,1280,33920]
-; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT: retq
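The rotate tests above exercise the same machinery: the constant rotate is lowered as a left shift via pmullw against powers of two plus the bit-serial right shift, with por/vorps combining the halves. As a scalar reference (my own sketch; the per-lane amounts 0..7 are inferred from the [1,2,4,8,16,32,64,128] multiplier in the checks, so treat the IR in the actual test as authoritative), each lane computes roughly:

#include <array>
#include <cstdint>

// Reference result for an 8 x i16 rotate-left by the constant amounts 0..7.
std::array<uint16_t, 8> constant_rotate_v8i16_ref(std::array<uint16_t, 8> x) {
  for (unsigned i = 0; i < 8; ++i) {
    unsigned c = i; // assumed per-lane rotate amount
    uint16_t v = x[i];
    // The left half maps to the pmullw by (1 << c); the right half is the
    // psrlw/pblendw cascade; por merges the two.
    x[i] = c ? static_cast<uint16_t>((v << c) | (v >> (16 - c))) : v;
  }
  return x;
}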

 ; SSE41-LABEL: constant_shift_v8i16:
 ; SSE41: # BB#0:
 ; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $8, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,4112,8224,12336,16448,20560,24672,28784]
-; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $4, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,8224,16448,24672,32896,41120,49344,57568]
-; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: psraw $4, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
 ; SSE41-NEXT: movdqa %xmm1, %xmm2
 ; SSE41-NEXT: psraw $2, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,16448,32896,49344,256,16704,33152,49600]
-; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $1, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,32896,256,33152,512,33408,768,33664]
-; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: psraw $1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: constant_shift_v8i16:
 ; AVX1: # BB#0:
-; AVX1-NEXT: vpsraw $8, %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4112,8224,12336,16448,20560,24672,28784]
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,8224,16448,24672,32896,41120,49344,57568]
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
 ; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,16448,32896,49344,256,16704,33152,49600]
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
 ; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,32896,256,33152,512,33408,768,33664]
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: constant_shift_v8i16:

 ; AVX1-LABEL: constant_shift_v16i16:
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpsraw $8, %xmm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32896,37008,41120,45232,49344,53456,57568,61680]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
 ; AVX1-NEXT: vpsraw $4, %xmm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [256,8480,16704,24928,33152,41376,49600,57824]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
 ; AVX1-NEXT: vpsraw $2, %xmm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [512,16960,33408,49856,768,17216,33664,50112]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
 ; AVX1-NEXT: vpsraw $1, %xmm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1024,33920,1280,34176,1536,34432,1792,34688]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsraw $8, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,4112,8224,12336,16448,20560,24672,28784]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
 ; AVX1-NEXT: vpsraw $4, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,8224,16448,24672,32896,41120,49344,57568]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
 ; AVX1-NEXT: vpsraw $2, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,16448,32896,49344,256,16704,33152,49600]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
 ; AVX1-NEXT: vpsraw $1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,32896,256,33152,512,33408,768,33664]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;

 ; SSE41-LABEL: constant_shift_v8i16:
 ; SSE41: # BB#0:
 ; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $8, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,4112,8224,12336,16448,20560,24672,28784]
-; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $4, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,8224,16448,24672,32896,41120,49344,57568]
-; SSE41-NEXT: pblendvb %xmm2, %xmm1
+; SSE41-NEXT: psrlw $4, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
 ; SSE41-NEXT: movdqa %xmm1, %xmm2
 ; SSE41-NEXT: psrlw $2, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,16448,32896,49344,256,16704,33152,49600]
-; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $1, %xmm2
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,32896,256,33152,512,33408,768,33664]
-; SSE41-NEXT: pblendvb %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: psrlw $1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: constant_shift_v8i16:
 ; AVX1: # BB#0:
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4112,8224,12336,16448,20560,24672,28784]
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,8224,16448,24672,32896,41120,49344,57568]
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
 ; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,16448,32896,49344,256,16704,33152,49600]
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
 ; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,32896,256,33152,512,33408,768,33664]
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: constant_shift_v8i16:

 ; AVX1-LABEL: constant_shift_v16i16:
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32896,37008,41120,45232,49344,53456,57568,61680]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [256,8480,16704,24928,33152,41376,49600,57824]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
 ; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [512,16960,33408,49856,768,17216,33664,50112]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1024,33920,1280,34176,1536,34432,1792,34688]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,4112,8224,12336,16448,20560,24672,28784]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,8224,16448,24672,32896,41120,49344,57568]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
 ; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,16448,32896,49344,256,16704,33152,49600]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
 ; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,32896,256,33152,512,33408,768,33664]
-; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;