llvm.org GIT mirror llvm / 75aa1cd
[X86][SSE] Use pblendw for v4i32/v2i64 during isel.

Summary: Previously we used BLENDPS/BLENDPD, but that puts the blend in the FP domain. Under optsize, the two-address instruction pass can cause blendps/blendpd to commute to movss/movsd. We probably shouldn't do that if the original type was an integer, so use pblendw instead.

Reviewers: spatel, RKSimon

Reviewed By: RKSimon

Subscribers: jdoerfert, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D58574

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@354755 91177308-0d34-0410-b5e6-96231b3b80d8

Craig Topper, 1 year, 1 month ago
10 changed file(s) with 142 addition(s) and 182 deletion(s).
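To make the summary above concrete, here is a minimal standalone C++ sketch (not LLVM code; widenBlendMask is an invented name) of how a BLENDPS/BLENDPD control immediate, which carries one bit per 32- or 64-bit element, is widened into the one-bit-per-16-bit-word immediate that PBLENDW expects: each selected dword contributes two mask bits (0x3), each selected qword contributes four (0xF), and the commuted patterns additionally invert all eight bits.

// Standalone illustration (not LLVM code) of widening a v4i32/v2i64 blend
// immediate into the v8i16 immediate used by PBLENDW.
#include <cassert>
#include <cstdint>
#include <cstdio>

// eltBits is 32 for a v4i32 blend and 64 for a v2i64 blend; each element
// covers eltBits/16 words of the 128-bit vector.
static uint8_t widenBlendMask(uint8_t imm, unsigned eltBits) {
  const unsigned wordsPerElt = eltBits / 16;
  const unsigned numElts = 128 / eltBits;
  uint8_t wide = 0;
  for (unsigned i = 0; i != numElts; ++i)
    if (imm & (1u << i))
      wide |= ((1u << wordsPerElt) - 1) << (i * wordsPerElt);
  return wide;
}

int main() {
  // blendps $0b0101 (take elements 0 and 2 from the second source)
  // corresponds to pblendw $0x33.
  assert(widenBlendMask(0x5, 32) == 0x33);
  // blendpd $0b10 (take the upper qword from the second source) corresponds
  // to pblendw $0xf0 -- the "[0,1,2,3],...[4,5,6,7]" masks in the tests below.
  assert(widenBlendMask(0x2, 64) == 0xf0);
  // Commuted patterns swap the sources, so the widened mask is also inverted.
  assert((widenBlendMask(0x2, 64) ^ 0xff) == 0x0f);
  std::printf("blend immediate widening examples hold\n");
  return 0;
}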
63966396 return getI8Imm(Imm ^ 0xff, SDLoc(N));
63976397 }]>;
63986398
6399 // Turn a 4-bit blendi immediate to 8-bit for use with pblendw.
6400 def BlendScaleImm4 : SDNodeXForm<imm, [{
6401 uint8_t Imm = N->getZExtValue();
6402 uint8_t NewImm = 0;
6403 for (unsigned i = 0; i != 4; ++i) {
6404 if (Imm & (1 << i))
6405 NewImm |= 0x3 << (i * 2);
6406 }
6407 return getI8Imm(NewImm, SDLoc(N));
6408 }]>;
6409
6410 // Turn a 2-bit blendi immediate to 8-bit for use with pblendw.
6411 def BlendScaleImm2 : SDNodeXForm<imm, [{
6412 uint8_t Imm = N->getZExtValue();
6413 uint8_t NewImm = 0;
6414 for (unsigned i = 0; i != 2; ++i) {
6415 if (Imm & (1 << i))
6416 NewImm |= 0xf << (i * 4);
6417 }
6418 return getI8Imm(NewImm, SDLoc(N));
6419 }]>;
6420
6421 // Turn a 4-bit blendi immediate to 8-bit for use with pblendw and invert it.
6422 def BlendScaleCommuteImm4 : SDNodeXForm<imm, [{
6423 uint8_t Imm = N->getZExtValue();
6424 uint8_t NewImm = 0;
6425 for (unsigned i = 0; i != 4; ++i) {
6426 if (Imm & (1 << i))
6427 NewImm |= 0x3 << (i * 2);
6428 }
6429 return getI8Imm(NewImm ^ 0xff, SDLoc(N));
6430 }]>;
6431
6432 // Turn a 2-bit blendi immediate to 8-bit for use with pblendw and invert it.
6433 def BlendScaleCommuteImm2 : SDNodeXForm<imm, [{
6434 uint8_t Imm = N->getZExtValue();
6435 uint8_t NewImm = 0;
6436 for (unsigned i = 0; i != 2; ++i) {
6437 if (Imm & (1 << i))
6438 NewImm |= 0xf << (i * 4);
6439 }
6440 return getI8Imm(NewImm ^ 0xff, SDLoc(N));
6441 }]>;
6442
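As a quick cross-check of the four XForms just added, the following standalone C++ sketch (a plain re-transcription, not LLVM code; the function names are invented) enumerates every 2-bit and 4-bit blendi immediate and prints the corresponding pblendw mask, which makes the masks in the updated tests below easy to decode; it also confirms that the Commute variants are simply the scaled mask xor'ed with 0xff, matching the convention the existing BlendCommuteImm* transforms use for the FP blends.

// Re-transcription of BlendScaleImm{2,4} / BlendScaleCommuteImm{2,4} outside
// TableGen, checking the relationship between the plain and commuted masks.
#include <cassert>
#include <cstdint>
#include <cstdio>

static uint8_t blendScale(uint8_t imm, unsigned numElts) {
  const unsigned scale = 8 / numElts; // words per element in the pblendw mask
  uint8_t out = 0;
  for (unsigned i = 0; i != numElts; ++i)
    if (imm & (1u << i))
      out |= ((1u << scale) - 1) << (i * scale);
  return out;
}

static uint8_t blendScaleCommute(uint8_t imm, unsigned numElts) {
  return blendScale(imm, numElts) ^ 0xff; // same loop, result inverted
}

int main() {
  const unsigned eltCounts[] = {2, 4}; // v2i64 and v4i32 blends
  for (unsigned numElts : eltCounts) {
    for (unsigned imm = 0; imm < (1u << numElts); ++imm) {
      assert(blendScaleCommute(imm, numElts) ==
             (uint8_t)(blendScale(imm, numElts) ^ 0xff));
      std::printf("v%ui%u imm %#x -> pblendw %#04x (commuted %#04x)\n", numElts,
                  128 / numElts, imm, (unsigned)blendScale(imm, numElts),
                  (unsigned)blendScaleCommute(imm, numElts));
    }
  }
  return 0;
}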
63996443 let Predicates = [HasAVX] in {
64006444 let isCommutable = 0 in {
64016445 defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
65066550 VEX_4V, VEX_L, VEX_WIG;
65076551 }
65086552
6509 // Emulate vXi32/vXi64 blends with vXf32/vXf64.
6553 // Emulate vXi32/vXi64 blends with vXf32/vXf64 or pblendw.
65106554 // ExecutionDomainFixPass will cleanup domains later on.
65116555 let Predicates = [HasAVX] in {
65126556 def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), imm:$src3),
65166560 def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, imm:$src3),
65176561 (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 imm:$src3))>;
65186562
6563 // Use pblendw for 128-bit integer to keep it in the integer domain and prevent
6564 // it from becoming movsd via commuting under optsize.
65196565 def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3),
6520 (VBLENDPDrri VR128:$src1, VR128:$src2, imm:$src3)>;
6566 (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 imm:$src3))>;
65216567 def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), imm:$src3),
6522 (VBLENDPDrmi VR128:$src1, addr:$src2, imm:$src3)>;
6568 (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 imm:$src3))>;
65236569 def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, imm:$src3),
6524 (VBLENDPDrmi VR128:$src1, addr:$src2, (BlendCommuteImm2 imm:$src3))>;
6570 (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 imm:$src3))>;
65256571 }
65266572
65276573 let Predicates = [HasAVX1Only] in {
65326578 def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, imm:$src3),
65336579 (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 imm:$src3))>;
65346580
6581 // Use pblendw for 128-bit integer to keep it in the integer domain and prevent
6582 // it from becoming movss via commuting under optsize.
65356583 def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), imm:$src3),
6536 (VBLENDPSrri VR128:$src1, VR128:$src2, imm:$src3)>;
6584 (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 imm:$src3))>;
65376585 def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), imm:$src3),
6538 (VBLENDPSrmi VR128:$src1, addr:$src2, imm:$src3)>;
6586 (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>;
65396587 def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, imm:$src3),
6540 (VBLENDPSrmi VR128:$src1, addr:$src2, (BlendCommuteImm4 imm:$src3))>;
6588 (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>;
65416589 }
65426590
65436591 defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
65516599 SchedWriteBlend.XMM, BlendCommuteImm8>;
65526600
65536601 let Predicates = [UseSSE41] in {
6602 // Use pblendw for 128-bit integer to keep it in the integer domain and prevent
6603 // it from becoming movss via commuting under optsize.
65546604 def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3),
6555 (BLENDPDrri VR128:$src1, VR128:$src2, imm:$src3)>;
6605 (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 imm:$src3))>;
65566606 def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), imm:$src3),
6557 (BLENDPDrmi VR128:$src1, addr:$src2, imm:$src3)>;
6607 (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 imm:$src3))>;
65586608 def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, imm:$src3),
6559 (BLENDPDrmi VR128:$src1, addr:$src2, (BlendCommuteImm2 imm:$src3))>;
6609 (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 imm:$src3))>;
65606610
65616611 def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), imm:$src3),
6562 (BLENDPSrri VR128:$src1, VR128:$src2, imm:$src3)>;
6612 (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 imm:$src3))>;
65636613 def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), imm:$src3),
6564 (BLENDPSrmi VR128:$src1, addr:$src2, imm:$src3)>;
6614 (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>;
65656615 def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, imm:$src3),
6566 (BLENDPSrmi VR128:$src1, addr:$src2, (BlendCommuteImm4 imm:$src3))>;
6616 (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>;
65676617 }
65686618
65696619 // For insertion into the zero index (low half) of a 256-bit vector, it is
19121912 ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mem_mask1:
19131913 ; CHECK: # %bb.0:
19141914 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
1915 ; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
1915 ; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],mem[4,5,6,7]
19161916 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
19171917 ; CHECK-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1}
19181918 ; CHECK-NEXT: retq
19271927 ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mem_mask1:
19281928 ; CHECK: # %bb.0:
19291929 ; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1
1930 ; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
1930 ; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],mem[4,5,6,7]
19311931 ; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
19321932 ; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
19331933 ; CHECK-NEXT: retq
25632563 ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0:
25642564 ; CHECK: # %bb.0:
25652565 ; CHECK-NEXT: vmovdqa 32(%rdi), %xmm2
2566 ; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
2566 ; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],mem[4,5,6,7]
25672567 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
25682568 ; CHECK-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1}
25692569 ; CHECK-NEXT: retq
25782578 ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0:
25792579 ; CHECK: # %bb.0:
25802580 ; CHECK-NEXT: vmovdqa 32(%rdi), %xmm1
2581 ; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
2581 ; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],mem[4,5,6,7]
25822582 ; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
25832583 ; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
25842584 ; CHECK-NEXT: retq
15791579 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,2305843009213693952]
15801580 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
15811581 ; AVX2-NEXT: vpsubq %xmm2, %xmm1, %xmm1
1582 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1582 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
15831583 ; AVX2-NEXT: retq
15841584 ;
15851585 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
15921592 ; AVX512F-NEXT: vpsrlvq {{.*}}(%rip), %xmm2, %xmm2
15931593 ; AVX512F-NEXT: vpaddq %xmm2, %xmm0, %xmm2
15941594 ; AVX512F-NEXT: vpsravq %zmm1, %zmm2, %zmm1
1595 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1595 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
15961596 ; AVX512F-NEXT: vzeroupper
15971597 ; AVX512F-NEXT: retq
15981598 ;
16051605 ; AVX512BW-NEXT: vmovq %rax, %xmm2
16061606 ; AVX512BW-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
16071607 ; AVX512BW-NEXT: vpsravq %xmm2, %xmm1, %xmm1
1608 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1608 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
16091609 ; AVX512BW-NEXT: retq
16101610 ;
16111611 ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
24972497 ; AVX2-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
24982498 ; AVX2-NEXT: vpsraw $1, %xmm0, %xmm1
24992499 ; AVX2-NEXT: vpsraw $2, %xmm0, %xmm2
2500 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2500 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
25012501 ; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
25022502 ; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0
25032503 ; AVX2-NEXT: retq
25072507 ; AVX512F-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
25082508 ; AVX512F-NEXT: vpsraw $1, %xmm0, %xmm1
25092509 ; AVX512F-NEXT: vpsraw $2, %xmm0, %xmm2
2510 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2510 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
25112511 ; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm0
25122512 ; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0
25132513 ; AVX512F-NEXT: retq
25792579 ; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0
25802580 ; AVX2-NEXT: vpsraw $8, %xmm0, %xmm1
25812581 ; AVX2-NEXT: vpsraw $4, %xmm0, %xmm2
2582 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2582 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
25832583 ; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
25842584 ; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0
25852585 ; AVX2-NEXT: retq
25902590 ; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0
25912591 ; AVX512F-NEXT: vpsraw $8, %xmm0, %xmm1
25922592 ; AVX512F-NEXT: vpsraw $4, %xmm0, %xmm2
2593 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2593 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
25942594 ; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm0
25952595 ; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0
25962596 ; AVX512F-NEXT: retq
26642664 ; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm0
26652665 ; AVX2-NEXT: vpsraw $8, %xmm0, %xmm1
26662666 ; AVX2-NEXT: vpsraw $4, %xmm0, %xmm2
2667 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2667 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
26682668 ; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
26692669 ; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0
26702670 ; AVX2-NEXT: retq
26752675 ; AVX512F-NEXT: vpsubw %xmm0, %xmm1, %xmm0
26762676 ; AVX512F-NEXT: vpsraw $8, %xmm0, %xmm1
26772677 ; AVX512F-NEXT: vpsraw $4, %xmm0, %xmm2
2678 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2678 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
26792679 ; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm0
26802680 ; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0
26812681 ; AVX512F-NEXT: retq
29482948 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
29492949 ; SSE41-NEXT: retq
29502950 ;
2951 ; AVX1-LABEL: combine_vec_sdiv_nonuniform7:
2952 ; AVX1: # %bb.0:
2953 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
2954 ; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm1
2955 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
2956 ; AVX1-NEXT: retq
2957 ;
2958 ; AVX2ORLATER-LABEL: combine_vec_sdiv_nonuniform7:
2959 ; AVX2ORLATER: # %bb.0:
2960 ; AVX2ORLATER-NEXT: vpxor %xmm1, %xmm1, %xmm1
2961 ; AVX2ORLATER-NEXT: vpsubw %xmm0, %xmm1, %xmm1
2962 ; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2963 ; AVX2ORLATER-NEXT: retq
2964 ;
2965 ; XOP-LABEL: combine_vec_sdiv_nonuniform7:
2966 ; XOP: # %bb.0:
2967 ; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
2968 ; XOP-NEXT: vpsubw %xmm0, %xmm1, %xmm1
2969 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
2970 ; XOP-NEXT: retq
2951 ; AVX-LABEL: combine_vec_sdiv_nonuniform7:
2952 ; AVX: # %bb.0:
2953 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
2954 ; AVX-NEXT: vpsubw %xmm0, %xmm1, %xmm1
2955 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
2956 ; AVX-NEXT: retq
29712957 %1 = sdiv <8 x i16> %x,
29722958 ret <8 x i16> %1
29732959 }
5353 define void @baz(<2 x i64>* %arg, %struct.spam* %arg1) optsize {
5454 ; CHECK-LABEL: baz:
5555 ; CHECK: # %bb.0: # %bb
56 ; CHECK-NEXT: movapd (%rdi), %xmm0
57 ; CHECK-NEXT: movapd {{.*#+}} xmm1 = [3,3]
58 ; CHECK-NEXT: andpd %xmm0, %xmm1
59 ; CHECK-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
60 ; CHECK-NEXT: movupd %xmm1, (%rsi)
56 ; CHECK-NEXT: movaps (%rdi), %xmm0
57 ; CHECK-NEXT: movaps {{.*#+}} xmm1 = [3,3]
58 ; CHECK-NEXT: andps %xmm0, %xmm1
59 ; CHECK-NEXT: blendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
60 ; CHECK-NEXT: movups %xmm1, (%rsi)
6161 ; CHECK-NEXT: retq
6262 bb:
6363 %tmp = load <2 x i64>, <2 x i64>* %arg, align 16
2929 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
3030 ; SSE41-NEXT: retq
3131 ;
32 ; AVX1-LABEL: insert_v2i64_x1:
33 ; AVX1: # %bb.0:
34 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
35 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
36 ; AVX1-NEXT: retq
37 ;
38 ; AVX2-LABEL: insert_v2i64_x1:
39 ; AVX2: # %bb.0:
40 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
41 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
42 ; AVX2-NEXT: retq
43 ;
44 ; AVX512-LABEL: insert_v2i64_x1:
45 ; AVX512: # %bb.0:
46 ; AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
47 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
48 ; AVX512-NEXT: retq
32 ; AVX-LABEL: insert_v2i64_x1:
33 ; AVX: # %bb.0:
34 ; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
35 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
36 ; AVX-NEXT: retq
4937 %1 = insertelement <2 x i64> %a, i64 -1, i32 0
5038 ret <2 x i64> %1
5139 }
4242 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
4343 ; SSE-NEXT: retq
4444 ;
45 ; AVX1-LABEL: test2:
46 ; AVX1: # %bb.0:
47 ; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
48 ; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
49 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
50 ; AVX1-NEXT: retq
51 ;
52 ; AVX2-LABEL: test2:
53 ; AVX2: # %bb.0:
54 ; AVX2-NEXT: vpsrlw $2, %xmm0, %xmm1
55 ; AVX2-NEXT: vpsrlw $3, %xmm0, %xmm0
56 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
57 ; AVX2-NEXT: retq
45 ; AVX-LABEL: test2:
46 ; AVX: # %bb.0:
47 ; AVX-NEXT: vpsrlw $2, %xmm0, %xmm1
48 ; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
49 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
50 ; AVX-NEXT: retq
5851 %lshr = lshr <8 x i16> %a,
5952 ret <8 x i16> %lshr
6053 }
142135 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
143136 ; SSE-NEXT: retq
144137 ;
145 ; AVX1-LABEL: test6:
146 ; AVX1: # %bb.0:
147 ; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1
148 ; AVX1-NEXT: vpsraw $3, %xmm0, %xmm0
149 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
150 ; AVX1-NEXT: retq
151 ;
152 ; AVX2-LABEL: test6:
153 ; AVX2: # %bb.0:
154 ; AVX2-NEXT: vpsraw $2, %xmm0, %xmm1
155 ; AVX2-NEXT: vpsraw $3, %xmm0, %xmm0
156 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
157 ; AVX2-NEXT: retq
138 ; AVX-LABEL: test6:
139 ; AVX: # %bb.0:
140 ; AVX-NEXT: vpsraw $2, %xmm0, %xmm1
141 ; AVX-NEXT: vpsraw $3, %xmm0, %xmm0
142 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
143 ; AVX-NEXT: retq
158144 %lshr = ashr <8 x i16> %a,
159145 ret <8 x i16> %lshr
160146 }
693693 ; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
694694 ; X64-SSE-NEXT: retq
695695 ;
696 ; X64-AVX1-LABEL: PR19721:
697 ; X64-AVX1: # %bb.0:
698 ; X64-AVX1-NEXT: vmovq %xmm0, %rax
699 ; X64-AVX1-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
700 ; X64-AVX1-NEXT: andq %rax, %rcx
701 ; X64-AVX1-NEXT: vmovq %rcx, %xmm1
702 ; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
703 ; X64-AVX1-NEXT: retq
704 ;
705 ; X64-AVX512-LABEL: PR19721:
706 ; X64-AVX512: # %bb.0:
707 ; X64-AVX512-NEXT: vmovq %xmm0, %rax
708 ; X64-AVX512-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
709 ; X64-AVX512-NEXT: andq %rax, %rcx
710 ; X64-AVX512-NEXT: vmovq %rcx, %xmm1
711 ; X64-AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
712 ; X64-AVX512-NEXT: retq
696 ; X64-AVX-LABEL: PR19721:
697 ; X64-AVX: # %bb.0:
698 ; X64-AVX-NEXT: vmovq %xmm0, %rax
699 ; X64-AVX-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
700 ; X64-AVX-NEXT: andq %rax, %rcx
701 ; X64-AVX-NEXT: vmovq %rcx, %xmm1
702 ; X64-AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
703 ; X64-AVX-NEXT: retq
713704 %bc = bitcast <4 x i32> %i to i128
714705 %insert = and i128 %bc, -4294967296
715706 %bc2 = bitcast i128 %insert to <4 x i32>
111111 ; SSE-NEXT: movaps %xmm2, %xmm0
112112 ; SSE-NEXT: retq
113113 ;
114 ; AVX1-LABEL: PR39893:
115 ; AVX1: # %bb.0:
116 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
117 ; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm0
118 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2],zero,xmm0[3],zero,xmm0[2],zero,xmm0[3],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero
119 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
120 ; AVX1-NEXT: retq
121 ;
122 ; AVX2-LABEL: PR39893:
123 ; AVX2: # %bb.0:
124 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
125 ; AVX2-NEXT: vpsubd %xmm0, %xmm2, %xmm0
126 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2],zero,xmm0[3],zero,xmm0[2],zero,xmm0[3],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero
127 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
128 ; AVX2-NEXT: retq
129 ;
130 ; AVX512-LABEL: PR39893:
131 ; AVX512: # %bb.0:
132 ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
133 ; AVX512-NEXT: vpsubd %xmm0, %xmm2, %xmm0
134 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2],zero,xmm0[3],zero,xmm0[2],zero,xmm0[3],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero
135 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
136 ; AVX512-NEXT: retq
114 ; AVX-LABEL: PR39893:
115 ; AVX: # %bb.0:
116 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
117 ; AVX-NEXT: vpsubd %xmm0, %xmm2, %xmm0
118 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2],zero,xmm0[3],zero,xmm0[2],zero,xmm0[3],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero
119 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
120 ; AVX-NEXT: retq
137121 %sub = sub <2 x i32> , %x
138122 %bc = bitcast <2 x i32> %sub to <8 x i8>
139123 %shuffle = shufflevector <8 x i8> %y, <8 x i8> %bc, <2 x i32>
21352135 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
21362136 ; SSE41-NEXT: retq
21372137 ;
2138 ; AVX1-LABEL: insert_reg_lo_v4i32:
2139 ; AVX1: # %bb.0:
2140 ; AVX1-NEXT: vmovq %rdi, %xmm1
2141 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
2142 ; AVX1-NEXT: retq
2143 ;
2144 ; AVX2OR512VL-LABEL: insert_reg_lo_v4i32:
2145 ; AVX2OR512VL: # %bb.0:
2146 ; AVX2OR512VL-NEXT: vmovq %rdi, %xmm1
2147 ; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2148 ; AVX2OR512VL-NEXT: retq
2138 ; AVX-LABEL: insert_reg_lo_v4i32:
2139 ; AVX: # %bb.0:
2140 ; AVX-NEXT: vmovq %rdi, %xmm1
2141 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
2142 ; AVX-NEXT: retq
21492143 %a.cast = bitcast i64 %a to <2 x i32>
21502144 %v = shufflevector <2 x i32> %a.cast, <2 x i32> undef, <4 x i32>
21512145 %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32>
11541154 ; AVX2-SLOW: # %bb.0:
11551155 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,7]
11561156 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
1157 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1157 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
11581158 ; AVX2-SLOW-NEXT: retq
11591159 ;
11601160 ; AVX2-FAST-LABEL: shuffle_v8i16_0213cedf:
11681168 ; AVX512VL-SLOW: # %bb.0:
11691169 ; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,7]
11701170 ; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
1171 ; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1171 ; AVX512VL-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
11721172 ; AVX512VL-SLOW-NEXT: retq
11731173 ;
11741174 ; AVX512VL-FAST-LABEL: shuffle_v8i16_0213cedf:
12651265 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,10,11,0,1,10,11,0,1,2,3]
12661266 ; SSE41-NEXT: retq
12671267 ;
1268 ; AVX1-LABEL: shuffle_v8i16_032dXXXX:
1269 ; AVX1: # %bb.0:
1270 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1271 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,10,11,0,1,10,11,0,1,2,3]
1272 ; AVX1-NEXT: retq
1273 ;
1274 ; AVX2OR512VL-LABEL: shuffle_v8i16_032dXXXX:
1275 ; AVX2OR512VL: # %bb.0:
1276 ; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1277 ; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,10,11,0,1,10,11,0,1,2,3]
1278 ; AVX2OR512VL-NEXT: retq
1268 ; AVX-LABEL: shuffle_v8i16_032dXXXX:
1269 ; AVX: # %bb.0:
1270 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1271 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,10,11,0,1,10,11,0,1,2,3]
1272 ; AVX-NEXT: retq
12791273 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32>
12801274 ret <8 x i16> %shuffle
12811275 }
14231417 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7]
14241418 ; SSE41-NEXT: retq
14251419 ;
1426 ; AVX1-LABEL: shuffle_v8i16_012dcde3:
1427 ; AVX1: # %bb.0:
1428 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1429 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7]
1430 ; AVX1-NEXT: retq
1431 ;
1432 ; AVX2OR512VL-LABEL: shuffle_v8i16_012dcde3:
1433 ; AVX2OR512VL: # %bb.0:
1434 ; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1435 ; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7]
1436 ; AVX2OR512VL-NEXT: retq
1420 ; AVX-LABEL: shuffle_v8i16_012dcde3:
1421 ; AVX: # %bb.0:
1422 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1423 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7]
1424 ; AVX-NEXT: retq
14371425 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32>
14381426 ret <8 x i16> %shuffle
14391427 }
15601548 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
15611549 ; SSE41-NEXT: retq
15621550 ;
1563 ; AVX1-LABEL: shuffle_v8i16_XX4X8acX:
1564 ; AVX1: # %bb.0:
1565 ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5]
1566 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
1567 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1568 ; AVX1-NEXT: retq
1569 ;
1570 ; AVX2OR512VL-LABEL: shuffle_v8i16_XX4X8acX:
1571 ; AVX2OR512VL: # %bb.0:
1572 ; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5]
1573 ; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
1574 ; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1575 ; AVX2OR512VL-NEXT: retq
1551 ; AVX-LABEL: shuffle_v8i16_XX4X8acX:
1552 ; AVX: # %bb.0:
1553 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5]
1554 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
1555 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1556 ; AVX-NEXT: retq
15761557 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32>
15771558 ret <8 x i16> %shuffle
15781559 }