llvm.org GIT mirror llvm / 4ad0654
[X86][SSE] Enable commutation for SSE immediate blend instructions Patch to allow (v)blendps, (v)blendpd, (v)pblendw and vpblendd instructions to be commuted - swaps the src registers and inverts the blend mask. This is primarily to improve memory folding (see new tests), but it also improves the quality of shuffles (see modified tests). Differential Revision: http://reviews.llvm.org/D6015 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@221313 91177308-0d34-0410-b5e6-96231b3b80d8 Simon Pilgrim 5 years ago
8 changed file(s) with 247 addition(s) and 110 deletion(s). Raw diff Collapse all Expand all
24192419 MI->getOperand(3).setImm(Size-Amt);
24202420 return TargetInstrInfo::commuteInstruction(MI, NewMI);
24212421 }
2422 case X86::BLENDPDrri:
2423 case X86::BLENDPSrri:
2424 case X86::PBLENDWrri:
2425 case X86::VBLENDPDrri:
2426 case X86::VBLENDPSrri:
2427 case X86::VBLENDPDYrri:
2428 case X86::VBLENDPSYrri:
2429 case X86::VPBLENDDrri:
2430 case X86::VPBLENDWrri:
2431 case X86::VPBLENDDYrri:
2432 case X86::VPBLENDWYrri:{
2433 unsigned Mask;
2434 switch (MI->getOpcode()) {
2435 default: llvm_unreachable("Unreachable!");
2436 case X86::BLENDPDrri: Mask = 0x03; break;
2437 case X86::BLENDPSrri: Mask = 0x0F; break;
2438 case X86::PBLENDWrri: Mask = 0xFF; break;
2439 case X86::VBLENDPDrri: Mask = 0x03; break;
2440 case X86::VBLENDPSrri: Mask = 0x0F; break;
2441 case X86::VBLENDPDYrri: Mask = 0x0F; break;
2442 case X86::VBLENDPSYrri: Mask = 0xFF; break;
2443 case X86::VPBLENDDrri: Mask = 0x0F; break;
2444 case X86::VPBLENDWrri: Mask = 0xFF; break;
2445 case X86::VPBLENDDYrri: Mask = 0xFF; break;
2446 case X86::VPBLENDWYrri: Mask = 0xFF; break;
2447 }
2448 unsigned Imm = MI->getOperand(3).getImm();
2449 if (NewMI) {
2450 MachineFunction &MF = *MI->getParent()->getParent();
2451 MI = MF.CloneMachineInstr(MI);
2452 NewMI = false;
2453 }
2454 MI->getOperand(3).setImm(Mask ^ Imm);
2455 return TargetInstrInfo::commuteInstruction(MI, NewMI);
2456 }
24222457 case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr:
24232458 case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr:
24242459 case X86::CMOVE16rr: case X86::CMOVE32rr: case X86::CMOVE64rr:
25032538 bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
25042539 unsigned &SrcOpIdx2) const {
25052540 switch (MI->getOpcode()) {
2541 case X86::BLENDPDrri:
2542 case X86::BLENDPSrri:
2543 case X86::PBLENDWrri:
2544 case X86::VBLENDPDrri:
2545 case X86::VBLENDPSrri:
2546 case X86::VBLENDPDYrri:
2547 case X86::VBLENDPSYrri:
2548 case X86::VPBLENDDrri:
2549 case X86::VPBLENDDYrri:
2550 case X86::VPBLENDWrri:
2551 case X86::VPBLENDWYrri:
2552 SrcOpIdx1 = 1;
2553 SrcOpIdx2 = 2;
2554 return true;
25062555 case X86::VFMADDPDr231r:
25072556 case X86::VFMADDPSr231r:
25082557 case X86::VFMADDSDr231r:
75367536
75377537 let Predicates = [HasAVX] in {
75387538 let isCommutable = 0 in {
7539 let ExeDomain = SSEPackedSingle in {
7540 defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps,
7541 VR128, loadv4f32, f128mem, 0,
7542 DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
7543 defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps",
7544 int_x86_avx_blend_ps_256, VR256, loadv8f32,
7545 f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>,
7546 VEX_4V, VEX_L;
7547 }
7548 let ExeDomain = SSEPackedDouble in {
7549 defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd,
7550 VR128, loadv2f64, f128mem, 0,
7551 DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
7552 defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd",
7553 int_x86_avx_blend_pd_256,VR256, loadv4f64,
7554 f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>,
7555 VEX_4V, VEX_L;
7556 }
7539 defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
7540 VR128, loadv2i64, i128mem, 0,
7541 DEFAULT_ITINS_MPSADSCHED>, VEX_4V;
7542 }
7543
7544 let ExeDomain = SSEPackedSingle in {
7545 defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps,
7546 VR128, loadv4f32, f128mem, 0,
7547 DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
7548 defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps",
7549 int_x86_avx_blend_ps_256, VR256, loadv8f32,
7550 f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>,
7551 VEX_4V, VEX_L;
7552 }
7553 let ExeDomain = SSEPackedDouble in {
7554 defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd,
7555 VR128, loadv2f64, f128mem, 0,
7556 DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
7557 defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd",
7558 int_x86_avx_blend_pd_256,VR256, loadv4f64,
7559 f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>,
7560 VEX_4V, VEX_L;
7561 }
75577562 defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw,
75587563 VR128, loadv2i64, i128mem, 0,
75597564 DEFAULT_ITINS_BLENDSCHED>, VEX_4V;
7560 defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
7561 VR128, loadv2i64, i128mem, 0,
7562 DEFAULT_ITINS_MPSADSCHED>, VEX_4V;
7563 }
7565
75647566 let ExeDomain = SSEPackedSingle in
75657567 defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
75667568 VR128, loadv4f32, f128mem, 0,
75887590
75897591 let Constraints = "$src1 = $dst" in {
75907592 let isCommutable = 0 in {
7593 defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
7594 VR128, memopv2i64, i128mem,
7595 1, SSE_MPSADBW_ITINS>;
7596 }
75917597 let ExeDomain = SSEPackedSingle in
75927598 defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps,
75937599 VR128, memopv4f32, f128mem,
75997605 defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw,
76007606 VR128, memopv2i64, i128mem,
76017607 1, SSE_INTALU_ITINS_BLEND_P>;
7602 defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
7603 VR128, memopv2i64, i128mem,
7604 1, SSE_MPSADBW_ITINS>;
7605 }
76067608 let ExeDomain = SSEPackedSingle in
76077609 defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
76087610 VR128, memopv4f32, f128mem, 1,
88268828 Sched<[WriteBlendLd, ReadAfterLd]>, VEX_4V;
88278829 }
88288830
8829 let isCommutable = 0 in {
88308831 defm VPBLENDD : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_128,
88318832 VR128, loadv2i64, i128mem>;
88328833 defm VPBLENDDY : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_256,
88338834 VR256, loadv4i64, i256mem>, VEX_L;
8834 }
88358835
88368836 def : Pat<(v4i32 (X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2),
88378837 imm:$mask)),
1818 define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b) {
1919 ; CHECK-LABEL: test2:
2020 ; CHECK: # BB#0:
21 ; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
22 ; CHECK-NEXT: movdqa %xmm1, %xmm0
21 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
2322 ; CHECK-NEXT: retq
2423 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32>
2524 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32>
3130 define <2 x i64> @test3(<2 x i64> %a, <2 x i64> %b) {
3231 ; CHECK-LABEL: test3:
3332 ; CHECK: # BB#0:
34 ; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
35 ; CHECK-NEXT: movdqa %xmm1, %xmm0
33 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
3634 ; CHECK-NEXT: retq
3735 %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32>
3836 %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32>
4442 define <4 x i32> @test4(<4 x i32> %a, <4 x i32> %b) {
4543 ; CHECK-LABEL: test4:
4644 ; CHECK: # BB#0:
47 ; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
48 ; CHECK-NEXT: movdqa %xmm1, %xmm0
45 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
4946 ; CHECK-NEXT: retq
5047 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32>
5148 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32>
105102 define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) {
106103 ; CHECK-LABEL: test9:
107104 ; CHECK: # BB#0:
108 ; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
109 ; CHECK-NEXT: movdqa %xmm1, %xmm0
105 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
110106 ; CHECK-NEXT: retq
111107 %and1 = and <4 x i32> %a,
112108 %and2 = and <4 x i32> %b,
118114 define <2 x i64> @test10(<2 x i64> %a, <2 x i64> %b) {
119115 ; CHECK-LABEL: test10:
120116 ; CHECK: # BB#0:
121 ; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
122 ; CHECK-NEXT: movdqa %xmm1, %xmm0
117 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
123118 ; CHECK-NEXT: retq
124119 %and1 = and <2 x i64> %a,
125120 %and2 = and <2 x i64> %b,
131126 define <4 x i32> @test11(<4 x i32> %a, <4 x i32> %b) {
132127 ; CHECK-LABEL: test11:
133128 ; CHECK: # BB#0:
134 ; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
135 ; CHECK-NEXT: movdqa %xmm1, %xmm0
129 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
136130 ; CHECK-NEXT: retq
137131 %and1 = and <4 x i32> %a,
138132 %and2 = and <4 x i32> %b,
229223 ; CHECK-LABEL: test18:
230224 ; CHECK: # BB#0:
231225 ; CHECK-NEXT: xorps %xmm2, %xmm2
232 ; CHECK-NEXT: xorps %xmm3, %xmm3
233 ; CHECK-NEXT: blendps {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3]
234 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,1,1]
235 ; CHECK-NEXT: blendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
236 ; CHECK-NEXT: orps %xmm0, %xmm2
237 ; CHECK-NEXT: movaps %xmm2, %xmm0
226 ; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
227 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
228 ; CHECK-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
229 ; CHECK-NEXT: por %xmm1, %xmm0
238230 ; CHECK-NEXT: retq
239231 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32>
240232 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32>
294286 define <4 x i8> @test_crash(<4 x i8> %a, <4 x i8> %b) {
295287 ; CHECK-LABEL: test_crash:
296288 ; CHECK: # BB#0:
297 ; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
298 ; CHECK-NEXT: movdqa %xmm1, %xmm0
289 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
299290 ; CHECK-NEXT: retq
300291 %shuf1 = shufflevector <4 x i8> %a, <4 x i8> zeroinitializer, <4 x i32>
301292 %shuf2 = shufflevector <4 x i8> %b, <4 x i8> zeroinitializer, <4 x i32>
0 ; RUN: llc -O3 -mtriple=x86_64-unknown -mcpu=core-avx2 -mattr=avx2 < %s | FileCheck %s
1
; The intrinsic puts the loaded value in operand 1; commuting the blend
; (mask 0x11 -> 0xEE) lets the load fold as the memory operand.
; Fix: ';LABEL:' is not a FileCheck directive - use ';CHECK-LABEL:'.
define <8 x i16> @commute_fold_vpblendw_128(<8 x i16> %a, <8 x i16>* %b) #0 {
  %1 = load <8 x i16>* %b
  %2 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %1, <8 x i16> %a, i8 17)
  ret <8 x i16> %2

;CHECK-LABEL: commute_fold_vpblendw_128
;CHECK: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
;CHECK-NEXT: retq
}
declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone
12
; 256-bit vpblendw: the 8-bit mask is applied per 128-bit lane, so mask 0x11
; selects elements 0,4 in each lane. Commutation inverts it to fold the load.
; Fix: ';LABEL:' is not a FileCheck directive - use ';CHECK-LABEL:'.
define <16 x i16> @commute_fold_vpblendw_256(<16 x i16> %a, <16 x i16>* %b) #0 {
  %1 = load <16 x i16>* %b
  %2 = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %1, <16 x i16> %a, i8 17)
  ret <16 x i16> %2

;CHECK-LABEL: commute_fold_vpblendw_256
;CHECK: vpblendw {{.*#+}} ymm0 = ymm0[0],mem[1,2,3],ymm0[4],mem[5,6,7],ymm0[8],mem[9,10,11],ymm0[12],mem[13,14,15]
;CHECK-NEXT: retq
}
declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i8) nounwind readnone
23
; vpblendd with mask 1 keeps element 0 of the loaded value; after commutation
; (mask 1 -> 0xE) the load folds and %a supplies element 0.
; Fix: ';LABEL:' is not a FileCheck directive - use ';CHECK-LABEL:'.
define <4 x i32> @commute_fold_vpblendd_128(<4 x i32> %a, <4 x i32>* %b) #0 {
  %1 = load <4 x i32>* %b
  %2 = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %1, <4 x i32> %a, i8 1)
  ret <4 x i32> %2

;CHECK-LABEL: commute_fold_vpblendd_128
;CHECK: vpblendd {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
;CHECK-NEXT: retq
}
declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i8) nounwind readnone
34
; 256-bit vpblendd: mask 129 (0x81) keeps elements 0 and 7 of the loaded
; value; commutation inverts the mask so the load folds.
; Fix: ';LABEL:' is not a FileCheck directive - use ';CHECK-LABEL:'.
define <8 x i32> @commute_fold_vpblendd_256(<8 x i32> %a, <8 x i32>* %b) #0 {
  %1 = load <8 x i32>* %b
  %2 = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %1, <8 x i32> %a, i8 129)
  ret <8 x i32> %2

;CHECK-LABEL: commute_fold_vpblendd_256
;CHECK: vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6],ymm0[7]
;CHECK-NEXT: retq
}
declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i8) nounwind readnone
45
; vblendps mask 3 keeps elements 0,1 of the loaded value; commutation
; (mask 3 -> 0xC) folds the load as the memory operand.
; Fix: ';LABEL:' is not a FileCheck directive - use ';CHECK-LABEL:'.
define <4 x float> @commute_fold_vblendps_128(<4 x float> %a, <4 x float>* %b) #0 {
  %1 = load <4 x float>* %b
  %2 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %1, <4 x float> %a, i8 3)
  ret <4 x float> %2

;CHECK-LABEL: commute_fold_vblendps_128
;CHECK: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
;CHECK-NEXT: retq
}
declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwind readnone
56
; 256-bit vblendps: mask 7 keeps elements 0-2 of the loaded value;
; commutation inverts the mask so the load folds.
; Fix: ';LABEL:' is not a FileCheck directive - use ';CHECK-LABEL:'.
define <8 x float> @commute_fold_vblendps_256(<8 x float> %a, <8 x float>* %b) #0 {
  %1 = load <8 x float>* %b
  %2 = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %1, <8 x float> %a, i8 7)
  ret <8 x float> %2

;CHECK-LABEL: commute_fold_vblendps_256
;CHECK: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],mem[3,4,5,6,7]
;CHECK-NEXT: retq
}
declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
67
; vblendpd mask 1 keeps element 0 of the loaded value; commutation
; (mask 1 -> 2) folds the load as the memory operand.
; Fix: ';LABEL:' is not a FileCheck directive - use ';CHECK-LABEL:'.
define <2 x double> @commute_fold_vblendpd_128(<2 x double> %a, <2 x double>* %b) #0 {
  %1 = load <2 x double>* %b
  %2 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %1, <2 x double> %a, i8 1)
  ret <2 x double> %2

;CHECK-LABEL: commute_fold_vblendpd_128
;CHECK: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
;CHECK-NEXT: retq
}
declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nounwind readnone
78
; 256-bit vblendpd: mask 7 keeps elements 0-2 of the loaded value;
; commutation inverts the mask so the load folds.
; Fix: ';LABEL:' is not a FileCheck directive - use ';CHECK-LABEL:'.
define <4 x double> @commute_fold_vblendpd_256(<4 x double> %a, <4 x double>* %b) #0 {
  %1 = load <4 x double>* %b
  %2 = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %1, <4 x double> %a, i8 7)
  ret <4 x double> %2

;CHECK-LABEL: commute_fold_vblendpd_256
;CHECK: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],mem[3]
;CHECK-NEXT: retq
}
declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
0 ; RUN: llc -O3 -mtriple=x86_64-unknown -mcpu=corei7 < %s | FileCheck %s
1
; SSE4.1 pblendw (non-VEX): commuting inverts mask 0x11 -> 0xEE so the
; load folds and the result stays in xmm0.
; Fix: ';LABEL:' is not a FileCheck directive - use ';CHECK-LABEL:'.
define <8 x i16> @commute_fold_pblendw(<8 x i16> %a, <8 x i16>* %b) #0 {
  %1 = load <8 x i16>* %b
  %2 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %1, <8 x i16> %a, i8 17)
  ret <8 x i16> %2

;CHECK-LABEL: commute_fold_pblendw
;CHECK: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
;CHECK-NEXT: retq
}
declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone
12
; SSE4.1 blendps (non-VEX): commuting inverts mask 3 -> 0xC so the load folds.
; Fix: ';LABEL:' is not a FileCheck directive - use ';CHECK-LABEL:'.
define <4 x float> @commute_fold_blendps(<4 x float> %a, <4 x float>* %b) #0 {
  %1 = load <4 x float>* %b
  %2 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %1, <4 x float> %a, i8 3)
  ret <4 x float> %2

;CHECK-LABEL: commute_fold_blendps
;CHECK: blendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
;CHECK-NEXT: retq
}
declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwind readnone
23
; SSE4.1 blendpd (non-VEX): commuting inverts mask 1 -> 2 so the load folds.
; Fixes: ';LABEL:' is not a FileCheck directive - use ';CHECK-LABEL:' - and
; the label said 'commute_fold_vblendpd' while the function is the non-VEX
; 'commute_fold_blendpd' (copy-paste from the AVX test).
define <2 x double> @commute_fold_blendpd(<2 x double> %a, <2 x double>* %b) #0 {
  %1 = load <2 x double>* %b
  %2 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %1, <2 x double> %a, i8 1)
  ret <2 x double> %2

;CHECK-LABEL: commute_fold_blendpd
;CHECK: blendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
;CHECK-NEXT: retq
}
declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nounwind readnone
201201 ;
202202 ; SSE41-LABEL: vsel_8xi16:
203203 ; SSE41: # BB#0: # %entry
204 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
205 ; SSE41-NEXT: movdqa %xmm1, %xmm0
204 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
206205 ; SSE41-NEXT: retq
207206 ;
208207 ; AVX-LABEL: vsel_8xi16:
517516 ;
518517 ; SSE41-LABEL: constant_blendvps_avx:
519518 ; SSE41: # BB#0: # %entry
520 ; SSE41-NEXT: blendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3]
521 ; SSE41-NEXT: blendps {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3]
522 ; SSE41-NEXT: movaps %xmm2, %xmm0
523 ; SSE41-NEXT: movaps %xmm3, %xmm1
519 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
520 ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
524521 ; SSE41-NEXT: retq
525522 ;
526523 ; AVX-LABEL: constant_blendvps_avx:
636633 ;
637634 ; SSE41-LABEL: blend_shufflevector_8xfloat:
638635 ; SSE41: # BB#0: # %entry
639 ; SSE41-NEXT: blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
640 ; SSE41-NEXT: blendps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
641 ; SSE41-NEXT: movaps %xmm2, %xmm0
642 ; SSE41-NEXT: movaps %xmm3, %xmm1
636 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
637 ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3]
643638 ; SSE41-NEXT: retq
644639 ;
645640 ; AVX-LABEL: blend_shufflevector_8xfloat:
693688 ;
694689 ; SSE41-LABEL: blend_shufflevector_4xi64:
695690 ; SSE41: # BB#0: # %entry
696 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
697 ; SSE41-NEXT: movdqa %xmm2, %xmm0
691 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
698692 ; SSE41-NEXT: movaps %xmm3, %xmm1
699693 ; SSE41-NEXT: retq
700694 ;
259259 ;
260260 ; SSE41-LABEL: shuffle_v2f64_21:
261261 ; SSE41: # BB#0:
262 ; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm1[0],xmm0[1]
263 ; SSE41-NEXT: movapd %xmm1, %xmm0
262 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
264263 ; SSE41-NEXT: retq
265264 ;
266265 ; AVX-LABEL: shuffle_v2f64_21:
507506 ;
508507 ; SSE41-LABEL: shuffle_v2i64_21:
509508 ; SSE41: # BB#0:
510 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
511 ; SSE41-NEXT: movdqa %xmm1, %xmm0
509 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
512510 ; SSE41-NEXT: retq
513511 ;
514512 ; AVX1-LABEL: shuffle_v2i64_21:
544542 ;
545543 ; SSE41-LABEL: shuffle_v2i64_21_copy:
546544 ; SSE41: # BB#0:
547 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
548 ; SSE41-NEXT: movdqa %xmm2, %xmm0
545 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
546 ; SSE41-NEXT: movdqa %xmm1, %xmm0
549547 ; SSE41-NEXT: retq
550548 ;
551549 ; AVX1-LABEL: shuffle_v2i64_21_copy:
368368 ; SSE41: # BB#0:
369369 ; SSE41-NEXT: pxor %xmm1, %xmm0
370370 ; SSE41-NEXT: pxor %xmm1, %xmm1
371 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
371 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
372 ; SSE41-NEXT: movdqa %xmm1, %xmm0
372373 ; SSE41-NEXT: retq
373374 ;
374375 ; AVX1-LABEL: combine_bitwise_ops_test3b:
410411 ; SSE41-LABEL: combine_bitwise_ops_test4b:
411412 ; SSE41: # BB#0:
412413 ; SSE41-NEXT: pand %xmm1, %xmm0
413 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
414 ; SSE41-NEXT: movdqa %xmm2, %xmm0
414 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
415415 ; SSE41-NEXT: retq
416416 ;
417417 ; AVX1-LABEL: combine_bitwise_ops_test4b:
451451 ; SSE41-LABEL: combine_bitwise_ops_test5b:
452452 ; SSE41: # BB#0:
453453 ; SSE41-NEXT: por %xmm1, %xmm0
454 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
455 ; SSE41-NEXT: movdqa %xmm2, %xmm0
454 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
456455 ; SSE41-NEXT: retq
457456 ;
458457 ; AVX1-LABEL: combine_bitwise_ops_test5b:
11691168 ;
11701169 ; SSE41-LABEL: combine_test2:
11711170 ; SSE41: # BB#0:
1172 ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1173 ; SSE41-NEXT: movaps %xmm1, %xmm0
1171 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
11741172 ; SSE41-NEXT: retq
11751173 ;
11761174 ; AVX-LABEL: combine_test2:
12361234 ;
12371235 ; SSE41-LABEL: combine_test5:
12381236 ; SSE41: # BB#0:
1239 ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3]
1240 ; SSE41-NEXT: movaps %xmm1, %xmm0
1237 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
12411238 ; SSE41-NEXT: retq
12421239 ;
12431240 ; AVX-LABEL: combine_test5:
12981295 ;
12991296 ; SSE41-LABEL: combine_test7:
13001297 ; SSE41: # BB#0:
1301 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1302 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1298 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
13031299 ; SSE41-NEXT: retq
13041300 ;
13051301 ; AVX1-LABEL: combine_test7:
13701366 ;
13711367 ; SSE41-LABEL: combine_test10:
13721368 ; SSE41: # BB#0:
1373 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
1374 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1369 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
13751370 ; SSE41-NEXT: retq
13761371 ;
13771372 ; AVX1-LABEL: combine_test10:
14141409 ;
14151410 ; SSE41-LABEL: combine_test12:
14161411 ; SSE41: # BB#0:
1417 ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1418 ; SSE41-NEXT: movaps %xmm1, %xmm0
1412 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
14191413 ; SSE41-NEXT: retq
14201414 ;
14211415 ; AVX-LABEL: combine_test12:
14781472 ;
14791473 ; SSE41-LABEL: combine_test15:
14801474 ; SSE41: # BB#0:
1481 ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3]
1482 ; SSE41-NEXT: movaps %xmm1, %xmm0
1475 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
14831476 ; SSE41-NEXT: retq
14841477 ;
14851478 ; AVX-LABEL: combine_test15:
15171510 ;
15181511 ; SSE41-LABEL: combine_test17:
15191512 ; SSE41: # BB#0:
1520 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1521 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1513 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
15221514 ; SSE41-NEXT: retq
15231515 ;
15241516 ; AVX1-LABEL: combine_test17:
15861578 ;
15871579 ; SSE41-LABEL: combine_test20:
15881580 ; SSE41: # BB#0:
1589 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
1590 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1581 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
15911582 ; SSE41-NEXT: retq
15921583 ;
15931584 ; AVX1-LABEL: combine_test20:
16311622 ;
16321623 ; SSE41-LABEL: combine_test1b:
16331624 ; SSE41: # BB#0:
1634 ; SSE41-NEXT: movaps %xmm1, %xmm2
1635 ; SSE41-NEXT: blendps {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
1636 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0]
1637 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,0]
1625 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
1626 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
1627 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,0]
16381628 ; SSE41-NEXT: movaps %xmm1, %xmm0
16391629 ; SSE41-NEXT: retq
16401630 ;
16721662 ;
16731663 ; SSE41-LABEL: combine_test2b:
16741664 ; SSE41: # BB#0:
1675 ; SSE41-NEXT: movaps %xmm1, %xmm2
1676 ; SSE41-NEXT: blendps {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
1677 ; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[1,1]
1678 ; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
1679 ; SSE41-NEXT: movaps %xmm2, %xmm0
1665 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
1666 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,1]
1667 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
16801668 ; SSE41-NEXT: retq
16811669 ;
16821670 ; AVX-LABEL: combine_test2b:
17351723 ;
17361724 ; SSE41-LABEL: combine_test4b:
17371725 ; SSE41: # BB#0:
1738 ; SSE41-NEXT: movaps %xmm1, %xmm2
1739 ; SSE41-NEXT: blendps {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
1740 ; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0]
1741 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[0,2]
1726 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
1727 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0]
1728 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[0,2]
17421729 ; SSE41-NEXT: movaps %xmm1, %xmm0
17431730 ; SSE41-NEXT: retq
17441731 ;
20051992 ;
20061993 ; SSE41-LABEL: combine_blend_01:
20071994 ; SSE41: # BB#0:
2008 ; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm1[0],xmm0[1]
2009 ; SSE41-NEXT: movapd %xmm1, %xmm0
1995 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
20101996 ; SSE41-NEXT: retq
20111997 ;
20121998 ; AVX-LABEL: combine_blend_01:
20352021 ;
20362022 ; SSE41-LABEL: combine_blend_02:
20372023 ; SSE41: # BB#0:
2038 ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
2039 ; SSE41-NEXT: movaps %xmm1, %xmm0
2024 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
20402025 ; SSE41-NEXT: retq
20412026 ;
20422027 ; AVX-LABEL: combine_blend_02:
20692054 ;
20702055 ; SSE41-LABEL: combine_blend_123:
20712056 ; SSE41: # BB#0:
2072 ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2073 ; SSE41-NEXT: movaps %xmm1, %xmm0
2057 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
20742058 ; SSE41-NEXT: retq
20752059 ;
20762060 ; AVX-LABEL: combine_blend_123:
21522136 ;
21532137 ; SSE41-LABEL: combine_undef_input_test1:
21542138 ; SSE41: # BB#0:
2155 ; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm1[0],xmm0[1]
2156 ; SSE41-NEXT: movapd %xmm1, %xmm0
2139 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
21572140 ; SSE41-NEXT: retq
21582141 ;
21592142 ; AVX-LABEL: combine_undef_input_test1:
23422325 ;
23432326 ; SSE41-LABEL: combine_undef_input_test11:
23442327 ; SSE41: # BB#0:
2345 ; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm1[0],xmm0[1]
2346 ; SSE41-NEXT: movapd %xmm1, %xmm0
2328 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
23472329 ; SSE41-NEXT: retq
23482330 ;
23492331 ; AVX-LABEL: combine_undef_input_test11: