llvm.org GIT mirror llvm / 78049a4
[X86][AVX] createVariablePermute - use 2xVPERMIL+PCMPGT+SELECT for v8i32/v8f32 and v4i64/v4f64 variable permutes As VPERMILPS/VPERMILPD only selects elements based on the bits[1:0]/bit[1] then we can permute both the (repeated) lo/hi 128-bit vectors in each case and then select between these results based on whether the index was for lo/hi. For v4i64/v4f64 this avoids some rather nasty v4i64 multiplies on the AVX2 implementation, which seems to be worse than the extra port5 pressure from the additional shuffles/blends. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@327239 91177308-0d34-0410-b5e6-96231b3b80d8 Simon Pilgrim 2 years ago
2 changed file(s) with 158 addition(s) and 251 deletion(s). Raw diff Collapse all Expand all
80348034 case MVT::v8i32:
80358035 if (Subtarget.hasAVX2())
80368036 Opcode = X86ISD::VPERMV;
8037 else if (Subtarget.hasXOP()) {
8037 else if (Subtarget.hasAVX()) {
80388038 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
80398039 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
80408040 {0, 1, 2, 3, 0, 1, 2, 3});
80418041 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
80428042 {4, 5, 6, 7, 4, 5, 6, 7});
8043 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32,
8044 LoLo, HiHi, IndicesVec,
8045 DAG.getConstant(0, DL, MVT::i8)));
8043 if (Subtarget.hasXOP())
8044 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32,
8045 LoLo, HiHi, IndicesVec,
8046 DAG.getConstant(0, DL, MVT::i8)));
8047 // Permute Lo and Hi and then select based on index range.
8048 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
8049 SDValue Res = DAG.getSelectCC(
8050 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
8051 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
8052 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
8053 ISD::CondCode::SETGT);
8054 return DAG.getBitcast(VT, Res);
80468055 }
80478056 break;
80488057 case MVT::v4i64:
80598068 return extract256BitVector(Res, 0, DAG, DL);
80608069 }
80618070 Opcode = X86ISD::VPERMV;
8062 } else if (Subtarget.hasXOP()) {
8071 } else if (Subtarget.hasAVX()) {
80638072 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
80648073 SDValue LoLo =
80658074 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
80678076 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
80688077 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
80698078 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8070 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64,
8071 LoLo, HiHi, IndicesVec,
8072 DAG.getConstant(0, DL, MVT::i8)));
8073 } else if (Subtarget.hasAVX2()) {
8074 Opcode = X86ISD::VPERMV;
8075 ShuffleVT = MVT::v8f32;
8079 if (Subtarget.hasXOP())
8080 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64,
8081 LoLo, HiHi, IndicesVec,
8082 DAG.getConstant(0, DL, MVT::i8)));
8083 // Permute Lo and Hi and then select based on index range.
8084 // This works as VPERMILPD only uses index bit[1] to permute elements.
8085 SDValue Res = DAG.getSelectCC(
8086 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
8087 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
8088 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
8089 ISD::CondCode::SETGT);
8090 return DAG.getBitcast(VT, Res);
80768091 }
80778092 break;
80788093 case MVT::v64i8:
2222 ;
2323 ; AVX1-LABEL: var_shuffle_v4i64:
2424 ; AVX1: # %bb.0:
25 ; AVX1-NEXT: pushq %rbp
26 ; AVX1-NEXT: movq %rsp, %rbp
27 ; AVX1-NEXT: andq $-32, %rsp
28 ; AVX1-NEXT: subq $64, %rsp
29 ; AVX1-NEXT: vmovq %xmm1, %rax
30 ; AVX1-NEXT: andl $3, %eax
31 ; AVX1-NEXT: vpextrq $1, %xmm1, %rcx
32 ; AVX1-NEXT: andl $3, %ecx
33 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
34 ; AVX1-NEXT: vmovq %xmm1, %rdx
35 ; AVX1-NEXT: andl $3, %edx
36 ; AVX1-NEXT: vpextrq $1, %xmm1, %rsi
37 ; AVX1-NEXT: andl $3, %esi
38 ; AVX1-NEXT: vmovaps %ymm0, (%rsp)
39 ; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
40 ; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
41 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
42 ; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
43 ; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
44 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
45 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
46 ; AVX1-NEXT: movq %rbp, %rsp
47 ; AVX1-NEXT: popq %rbp
25 ; AVX1-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,2,2]
26 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
27 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
28 ; AVX1-NEXT: vpaddq %xmm4, %xmm4, %xmm4
29 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
30 ; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1
31 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2
32 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
33 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
34 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
35 ; AVX1-NEXT: vpermilpd %ymm1, %ymm3, %ymm3
36 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
37 ; AVX1-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
38 ; AVX1-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
4839 ; AVX1-NEXT: retq
4940 ;
5041 ; AVX2-LABEL: var_shuffle_v4i64:
5142 ; AVX2: # %bb.0:
43 ; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1
5244 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
53 ; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm2
54 ; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
55 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [8589934594,8589934594,8589934594,8589934594]
56 ; AVX2-NEXT: vpmuludq %ymm4, %ymm3, %ymm3
57 ; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
58 ; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
59 ; AVX2-NEXT: vpmuludq %ymm4, %ymm1, %ymm1
60 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [4294967296,4294967296,4294967296,4294967296]
61 ; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
62 ; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
63 ; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
45 ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
46 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3]
47 ; AVX2-NEXT: vpermilpd %ymm1, %ymm3, %ymm3
48 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
49 ; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
50 ; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
6451 ; AVX2-NEXT: retq
6552 ;
6653 ; AVX512F-LABEL: var_shuffle_v4i64:
113100 ;
114101 ; AVX1-LABEL: var_shuffle_v8i32:
115102 ; AVX1: # %bb.0:
116 ; AVX1-NEXT: pushq %rbp
117 ; AVX1-NEXT: movq %rsp, %rbp
118 ; AVX1-NEXT: andq $-32, %rsp
119 ; AVX1-NEXT: subq $64, %rsp
120 ; AVX1-NEXT: vmovd %xmm1, %r8d
121 ; AVX1-NEXT: vpextrd $1, %xmm1, %r9d
122 ; AVX1-NEXT: vpextrd $2, %xmm1, %r10d
123 ; AVX1-NEXT: vpextrd $3, %xmm1, %esi
124 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
125 ; AVX1-NEXT: vmovd %xmm1, %edi
126 ; AVX1-NEXT: vpextrd $1, %xmm1, %eax
127 ; AVX1-NEXT: vpextrd $2, %xmm1, %ecx
128 ; AVX1-NEXT: vpextrd $3, %xmm1, %edx
129 ; AVX1-NEXT: vmovaps %ymm0, (%rsp)
130 ; AVX1-NEXT: andl $7, %r8d
131 ; AVX1-NEXT: andl $7, %r9d
132 ; AVX1-NEXT: andl $7, %r10d
133 ; AVX1-NEXT: andl $7, %esi
134 ; AVX1-NEXT: andl $7, %edi
135 ; AVX1-NEXT: andl $7, %eax
136 ; AVX1-NEXT: andl $7, %ecx
137 ; AVX1-NEXT: andl $7, %edx
138 ; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
139 ; AVX1-NEXT: vpinsrd $1, (%rsp,%rax,4), %xmm0, %xmm0
140 ; AVX1-NEXT: vpinsrd $2, (%rsp,%rcx,4), %xmm0, %xmm0
141 ; AVX1-NEXT: vpinsrd $3, (%rsp,%rdx,4), %xmm0, %xmm0
142 ; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
143 ; AVX1-NEXT: vpinsrd $1, (%rsp,%r9,4), %xmm1, %xmm1
144 ; AVX1-NEXT: vpinsrd $2, (%rsp,%r10,4), %xmm1, %xmm1
145 ; AVX1-NEXT: vpinsrd $3, (%rsp,%rsi,4), %xmm1, %xmm1
146 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
147 ; AVX1-NEXT: movq %rbp, %rsp
148 ; AVX1-NEXT: popq %rbp
103 ; AVX1-NEXT: vmovdqa {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3]
104 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
105 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
106 ; AVX1-NEXT: vpcmpgtd %xmm3, %xmm4, %xmm3
107 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
108 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
109 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
110 ; AVX1-NEXT: vpermilps %ymm1, %ymm3, %ymm3
111 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
112 ; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0
113 ; AVX1-NEXT: vblendvps %ymm2, %ymm3, %ymm0, %ymm0
149114 ; AVX1-NEXT: retq
150115 ;
151116 ; INT256-LABEL: var_shuffle_v8i32:
13251290 ;
13261291 ; AVX1-LABEL: var_shuffle_v4f64:
13271292 ; AVX1: # %bb.0:
1328 ; AVX1-NEXT: pushq %rbp
1329 ; AVX1-NEXT: movq %rsp, %rbp
1330 ; AVX1-NEXT: andq $-32, %rsp
1331 ; AVX1-NEXT: subq $64, %rsp
1332 ; AVX1-NEXT: vmovq %xmm1, %rax
1333 ; AVX1-NEXT: andl $3, %eax
1334 ; AVX1-NEXT: vpextrq $1, %xmm1, %rcx
1335 ; AVX1-NEXT: andl $3, %ecx
1336 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1337 ; AVX1-NEXT: vmovq %xmm1, %rdx
1338 ; AVX1-NEXT: andl $3, %edx
1339 ; AVX1-NEXT: vpextrq $1, %xmm1, %rsi
1340 ; AVX1-NEXT: andl $3, %esi
1341 ; AVX1-NEXT: vmovaps %ymm0, (%rsp)
1342 ; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
1343 ; AVX1-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
1344 ; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
1345 ; AVX1-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
1346 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1347 ; AVX1-NEXT: movq %rbp, %rsp
1348 ; AVX1-NEXT: popq %rbp
1293 ; AVX1-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,2,2]
1294 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
1295 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
1296 ; AVX1-NEXT: vpaddq %xmm4, %xmm4, %xmm4
1297 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
1298 ; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1
1299 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2
1300 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
1301 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
1302 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
1303 ; AVX1-NEXT: vpermilpd %ymm1, %ymm3, %ymm3
1304 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1305 ; AVX1-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
1306 ; AVX1-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
13491307 ; AVX1-NEXT: retq
13501308 ;
13511309 ; AVX2-LABEL: var_shuffle_v4f64:
13521310 ; AVX2: # %bb.0:
1311 ; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1
13531312 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
1354 ; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm2
1355 ; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
1356 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [8589934594,8589934594,8589934594,8589934594]
1357 ; AVX2-NEXT: vpmuludq %ymm4, %ymm3, %ymm3
1358 ; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
1359 ; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
1360 ; AVX2-NEXT: vpmuludq %ymm4, %ymm1, %ymm1
1361 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [4294967296,4294967296,4294967296,4294967296]
1362 ; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
1363 ; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
1364 ; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
1313 ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
1314 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3]
1315 ; AVX2-NEXT: vpermilpd %ymm1, %ymm3, %ymm3
1316 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
1317 ; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
1318 ; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
13651319 ; AVX2-NEXT: retq
13661320 ;
13671321 ; AVX512F-LABEL: var_shuffle_v4f64:
14141368 ;
14151369 ; AVX1-LABEL: var_shuffle_v8f32:
14161370 ; AVX1: # %bb.0:
1417 ; AVX1-NEXT: pushq %rbp
1418 ; AVX1-NEXT: movq %rsp, %rbp
1419 ; AVX1-NEXT: andq $-32, %rsp
1420 ; AVX1-NEXT: subq $64, %rsp
1421 ; AVX1-NEXT: vmovd %xmm1, %esi
1422 ; AVX1-NEXT: vpextrd $1, %xmm1, %r8d
1423 ; AVX1-NEXT: vpextrd $2, %xmm1, %r9d
1424 ; AVX1-NEXT: vpextrd $3, %xmm1, %r10d
1425 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1426 ; AVX1-NEXT: vmovd %xmm1, %edx
1427 ; AVX1-NEXT: vpextrd $1, %xmm1, %edi
1428 ; AVX1-NEXT: vpextrd $2, %xmm1, %eax
1429 ; AVX1-NEXT: vpextrd $3, %xmm1, %ecx
1430 ; AVX1-NEXT: vmovaps %ymm0, (%rsp)
1431 ; AVX1-NEXT: andl $7, %esi
1432 ; AVX1-NEXT: andl $7, %r8d
1433 ; AVX1-NEXT: andl $7, %r9d
1434 ; AVX1-NEXT: andl $7, %r10d
1435 ; AVX1-NEXT: andl $7, %edx
1436 ; AVX1-NEXT: andl $7, %edi
1437 ; AVX1-NEXT: andl $7, %eax
1438 ; AVX1-NEXT: andl $7, %ecx
1439 ; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1440 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
1441 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
1442 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
1443 ; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1444 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
1445 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
1446 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
1447 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1448 ; AVX1-NEXT: movq %rbp, %rsp
1449 ; AVX1-NEXT: popq %rbp
1371 ; AVX1-NEXT: vmovdqa {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3]
1372 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
1373 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
1374 ; AVX1-NEXT: vpcmpgtd %xmm3, %xmm4, %xmm3
1375 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
1376 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
1377 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
1378 ; AVX1-NEXT: vpermilps %ymm1, %ymm3, %ymm3
1379 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1380 ; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0
1381 ; AVX1-NEXT: vblendvps %ymm2, %ymm3, %ymm0, %ymm0
14501382 ; AVX1-NEXT: retq
14511383 ;
14521384 ; INT256-LABEL: var_shuffle_v8f32:
14991431 ;
15001432 ; AVX1-LABEL: var_shuffle_v4i64_from_v2i64:
15011433 ; AVX1: # %bb.0:
1502 ; AVX1-NEXT: vmovq %xmm1, %rax
1503 ; AVX1-NEXT: andl $1, %eax
1504 ; AVX1-NEXT: vpextrq $1, %xmm1, %rcx
1505 ; AVX1-NEXT: andl $1, %ecx
1506 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1507 ; AVX1-NEXT: vmovq %xmm1, %rdx
1508 ; AVX1-NEXT: andl $1, %edx
1509 ; AVX1-NEXT: vpextrq $1, %xmm1, %rsi
1510 ; AVX1-NEXT: andl $1, %esi
1511 ; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1512 ; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
1513 ; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
1514 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1515 ; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
1516 ; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
1517 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
1518 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1434 ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1435 ; AVX1-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,2,2]
1436 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
1437 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
1438 ; AVX1-NEXT: vpaddq %xmm4, %xmm4, %xmm4
1439 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
1440 ; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1
1441 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2
1442 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
1443 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
1444 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
1445 ; AVX1-NEXT: vpermilpd %ymm1, %ymm3, %ymm3
1446 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1447 ; AVX1-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
1448 ; AVX1-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
15191449 ; AVX1-NEXT: retq
15201450 ;
15211451 ; AVX2-LABEL: var_shuffle_v4i64_from_v2i64:
15221452 ; AVX2: # %bb.0:
15231453 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1454 ; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1
15241455 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
1525 ; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm2
1526 ; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
1527 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [8589934594,8589934594,8589934594,8589934594]
1528 ; AVX2-NEXT: vpmuludq %ymm4, %ymm3, %ymm3
1529 ; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
1530 ; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
1531 ; AVX2-NEXT: vpmuludq %ymm4, %ymm1, %ymm1
1532 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [4294967296,4294967296,4294967296,4294967296]
1533 ; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
1534 ; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
1535 ; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
1456 ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
1457 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3]
1458 ; AVX2-NEXT: vpermilpd %ymm1, %ymm3, %ymm3
1459 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
1460 ; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
1461 ; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
15361462 ; AVX2-NEXT: retq
15371463 ;
15381464 ; AVX512F-LABEL: var_shuffle_v4i64_from_v2i64:
15881514 ;
15891515 ; AVX1-LABEL: var_shuffle_v8i32_from_v4i32:
15901516 ; AVX1: # %bb.0: # %entry
1591 ; AVX1-NEXT: vmovd %xmm1, %r8d
1592 ; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1593 ; AVX1-NEXT: andl $3, %r8d
1594 ; AVX1-NEXT: vpextrd $1, %xmm1, %r9d
1595 ; AVX1-NEXT: andl $3, %r9d
1596 ; AVX1-NEXT: vpextrd $2, %xmm1, %r10d
1597 ; AVX1-NEXT: andl $3, %r10d
1598 ; AVX1-NEXT: vpextrd $3, %xmm1, %esi
1599 ; AVX1-NEXT: andl $3, %esi
1600 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
1601 ; AVX1-NEXT: vmovd %xmm0, %edi
1602 ; AVX1-NEXT: andl $3, %edi
1603 ; AVX1-NEXT: vpextrd $1, %xmm0, %eax
1604 ; AVX1-NEXT: andl $3, %eax
1605 ; AVX1-NEXT: vpextrd $2, %xmm0, %ecx
1606 ; AVX1-NEXT: andl $3, %ecx
1607 ; AVX1-NEXT: vpextrd $3, %xmm0, %edx
1608 ; AVX1-NEXT: andl $3, %edx
1609 ; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1610 ; AVX1-NEXT: vpinsrd $1, -24(%rsp,%rax,4), %xmm0, %xmm0
1611 ; AVX1-NEXT: vpinsrd $2, -24(%rsp,%rcx,4), %xmm0, %xmm0
1612 ; AVX1-NEXT: vpinsrd $3, -24(%rsp,%rdx,4), %xmm0, %xmm0
1613 ; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1614 ; AVX1-NEXT: vpinsrd $1, -24(%rsp,%r9,4), %xmm1, %xmm1
1615 ; AVX1-NEXT: vpinsrd $2, -24(%rsp,%r10,4), %xmm1, %xmm1
1616 ; AVX1-NEXT: vpinsrd $3, -24(%rsp,%rsi,4), %xmm1, %xmm1
1617 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1517 ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1518 ; AVX1-NEXT: vmovdqa {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3]
1519 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
1520 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
1521 ; AVX1-NEXT: vpcmpgtd %xmm3, %xmm4, %xmm3
1522 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
1523 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
1524 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
1525 ; AVX1-NEXT: vpermilps %ymm1, %ymm3, %ymm3
1526 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1527 ; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0
1528 ; AVX1-NEXT: vblendvps %ymm2, %ymm3, %ymm0, %ymm0
16181529 ; AVX1-NEXT: retq
16191530 ;
16201531 ; INT256-LABEL: var_shuffle_v8i32_from_v4i32:
27372648 ;
27382649 ; AVX1-LABEL: var_shuffle_v4f64_from_v2f64:
27392650 ; AVX1: # %bb.0:
2740 ; AVX1-NEXT: vmovq %xmm1, %rax
2741 ; AVX1-NEXT: andl $1, %eax
2742 ; AVX1-NEXT: vpextrq $1, %xmm1, %rcx
2743 ; AVX1-NEXT: andl $1, %ecx
2744 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2745 ; AVX1-NEXT: vmovq %xmm1, %rdx
2746 ; AVX1-NEXT: andl $1, %edx
2747 ; AVX1-NEXT: vpextrq $1, %xmm1, %rsi
2748 ; AVX1-NEXT: andl $1, %esi
2749 ; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
2750 ; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
2751 ; AVX1-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
2752 ; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
2753 ; AVX1-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
2754 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
2651 ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
2652 ; AVX1-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,2,2]
2653 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
2654 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
2655 ; AVX1-NEXT: vpaddq %xmm4, %xmm4, %xmm4
2656 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
2657 ; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1
2658 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2
2659 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
2660 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
2661 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
2662 ; AVX1-NEXT: vpermilpd %ymm1, %ymm3, %ymm3
2663 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
2664 ; AVX1-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
2665 ; AVX1-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
27552666 ; AVX1-NEXT: retq
27562667 ;
27572668 ; AVX2-LABEL: var_shuffle_v4f64_from_v2f64:
27582669 ; AVX2: # %bb.0:
27592670 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
2671 ; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1
27602672 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
2761 ; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm2
2762 ; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
2763 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [8589934594,8589934594,8589934594,8589934594]
2764 ; AVX2-NEXT: vpmuludq %ymm4, %ymm3, %ymm3
2765 ; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
2766 ; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
2767 ; AVX2-NEXT: vpmuludq %ymm4, %ymm1, %ymm1
2768 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [4294967296,4294967296,4294967296,4294967296]
2769 ; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
2770 ; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
2771 ; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
2673 ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
2674 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3]
2675 ; AVX2-NEXT: vpermilpd %ymm1, %ymm3, %ymm3
2676 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
2677 ; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
2678 ; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
27722679 ; AVX2-NEXT: retq
27732680 ;
27742681 ; AVX512F-LABEL: var_shuffle_v4f64_from_v2f64:
28242731 ;
28252732 ; AVX1-LABEL: var_shuffle_v8f32_from_v4f32:
28262733 ; AVX1: # %bb.0: # %entry
2827 ; AVX1-NEXT: vmovd %xmm1, %r8d
2828 ; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
2829 ; AVX1-NEXT: andl $3, %r8d
2830 ; AVX1-NEXT: vpextrd $1, %xmm1, %r9d
2831 ; AVX1-NEXT: andl $3, %r9d
2832 ; AVX1-NEXT: vpextrd $2, %xmm1, %r10d
2833 ; AVX1-NEXT: andl $3, %r10d
2834 ; AVX1-NEXT: vpextrd $3, %xmm1, %esi
2835 ; AVX1-NEXT: andl $3, %esi
2836 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
2837 ; AVX1-NEXT: vmovd %xmm0, %edi
2838 ; AVX1-NEXT: andl $3, %edi
2839 ; AVX1-NEXT: vpextrd $1, %xmm0, %eax
2840 ; AVX1-NEXT: andl $3, %eax
2841 ; AVX1-NEXT: vpextrd $2, %xmm0, %ecx
2842 ; AVX1-NEXT: andl $3, %ecx
2843 ; AVX1-NEXT: vpextrd $3, %xmm0, %edx
2844 ; AVX1-NEXT: andl $3, %edx
2845 ; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2846 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
2847 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
2848 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
2849 ; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
2850 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
2851 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
2852 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
2853 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
2734 ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
2735 ; AVX1-NEXT: vmovdqa {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3]
2736 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
2737 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
2738 ; AVX1-NEXT: vpcmpgtd %xmm3, %xmm4, %xmm3
2739 ; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
2740 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
2741 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
2742 ; AVX1-NEXT: vpermilps %ymm1, %ymm3, %ymm3
2743 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
2744 ; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0
2745 ; AVX1-NEXT: vblendvps %ymm2, %ymm3, %ymm0, %ymm0
28542746 ; AVX1-NEXT: retq
28552747 ;
28562748 ; INT256-LABEL: var_shuffle_v8f32_from_v4f32: