llvm.org GIT mirror llvm / 52f615d
[X86][SSE] Add custom execution domain fixing for BLENDPD/BLENDPS/PBLENDD/PBLENDW (PR34873)

Add support for custom execution domain fixing and implement support for BLENDPD/BLENDPS/PBLENDD/PBLENDW.

Differential Revision: https://reviews.llvm.org/D42042

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@322524 91177308-0d34-0410-b5e6-96231b3b80d8

Author: Simon Pilgrim
51 changed file(s) with 1081 addition(s) and 1227 deletion(s).
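Background for the patch below: a floating-point blend and an integer blend can select the same lanes, but their immediates are encoded at different granularities (1 bit per f64 lane for BLENDPD, per f32 lane for BLENDPS/PBLENDD, per 16-bit lane for PBLENDW), so switching a blend's execution domain also means rescaling its immediate. The minimal sketch below uses illustrative names that are not part of the patch; it only shows the widening direction of that rescaling:

#include <cstdio>

// Sketch only: widen a BLENDPD-style mask (1 bit per 64-bit lane) to the
// equivalent BLENDPS/PBLENDD-style mask (1 bit per 32-bit lane).
static unsigned widenPDMaskToPS(unsigned PDMask, unsigned NumF64Lanes) {
  unsigned PSMask = 0;
  for (unsigned i = 0; i != NumF64Lanes; ++i)
    if (PDMask & (1u << i))
      PSMask |= 0x3u << (2 * i); // each f64 lane covers two f32 lanes
  return PSMask;
}

int main() {
  // blendpd $0b01 (take the low double from the second source) becomes
  // blendps $0b0011 once the mask is widened.
  std::printf("0x%x\n", widenPDMaskToPS(0b01, 2)); // prints 0x3
}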
96939693 { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrr, X86::VPBROADCASTQYrr},
96949694 { X86::VBROADCASTSDYrm, X86::VBROADCASTSDYrm, X86::VPBROADCASTQYrm},
96959695 { X86::VBROADCASTF128, X86::VBROADCASTF128, X86::VBROADCASTI128 },
9696 { X86::VBLENDPSrri, X86::VBLENDPSrri, X86::VPBLENDDrri },
9697 { X86::VBLENDPSrmi, X86::VBLENDPSrmi, X86::VPBLENDDrmi },
96989696 { X86::VBLENDPSYrri, X86::VBLENDPSYrri, X86::VPBLENDDYrri },
96999697 { X86::VBLENDPSYrmi, X86::VBLENDPSYrmi, X86::VPBLENDDYrmi },
97009698 { X86::VPERMILPSYmi, X86::VPERMILPSYmi, X86::VPSHUFDYmi },
99489946 X86::VPXORQZrmbkz, X86::VPXORDZrmbkz },
99499947 };
99509948
9949 // NOTE: These should only be used by the custom domain methods.
9950 static const uint16_t ReplaceableCustomInstrs[][3] = {
9951 //PackedSingle PackedDouble PackedInt
9952 { X86::BLENDPSrmi, X86::BLENDPDrmi, X86::PBLENDWrmi },
9953 { X86::BLENDPSrri, X86::BLENDPDrri, X86::PBLENDWrri },
9954 { X86::VBLENDPSrmi, X86::VBLENDPDrmi, X86::VPBLENDWrmi },
9955 { X86::VBLENDPSrri, X86::VBLENDPDrri, X86::VPBLENDWrri },
9956 { X86::VBLENDPSYrmi, X86::VBLENDPDYrmi, X86::VPBLENDWYrmi },
9957 { X86::VBLENDPSYrri, X86::VBLENDPDYrri, X86::VPBLENDWYrri },
9958 };
9959 static const uint16_t ReplaceableCustomAVX2Instrs[][3] = {
9960 //PackedSingle PackedDouble PackedInt
9961 { X86::VBLENDPSrmi, X86::VBLENDPDrmi, X86::VPBLENDDrmi },
9962 { X86::VBLENDPSrri, X86::VBLENDPDrri, X86::VPBLENDDrri },
9963 { X86::VBLENDPSYrmi, X86::VBLENDPDYrmi, X86::VPBLENDDYrmi },
9964 { X86::VBLENDPSYrri, X86::VBLENDPDYrri, X86::VPBLENDDYrri },
9965 };
9966
99519967 // FIXME: Some shuffle and unpack instructions have equivalents in different
99529968 // domains, but they require a bit more work than just switching opcodes.
99539969
99689984 return nullptr;
99699985 }
99709986
9987 // Helper to attempt to widen/narrow blend masks.
9988 static bool AdjustBlendMask(unsigned OldMask, unsigned OldWidth,
9989 unsigned NewWidth, unsigned *pNewMask = nullptr) {
9990 assert(((OldWidth % NewWidth) == 0 || (NewWidth % OldWidth) == 0) &&
9991 "Illegal blend mask scale");
9992 unsigned NewMask = 0;
9993
9994 if ((OldWidth % NewWidth) == 0) {
9995 unsigned Scale = OldWidth / NewWidth;
9996 unsigned SubMask = (1u << Scale) - 1;
9997 for (unsigned i = 0; i != NewWidth; ++i) {
9998 unsigned Sub = (OldMask >> (i * Scale)) & SubMask;
9999 if (Sub == SubMask)
10000 NewMask |= (1u << i);
10001 else if (Sub != 0x0)
10002 return false;
10003 }
10004 } else {
10005 unsigned Scale = NewWidth / OldWidth;
10006 unsigned SubMask = (1u << Scale) - 1;
10007 for (unsigned i = 0; i != OldWidth; ++i) {
10008 if (OldMask & (1 << i)) {
10009 NewMask |= (SubMask << (i * Scale));
10010 }
10011 }
10012 }
10013
10014 if (pNewMask)
10015 *pNewMask = NewMask;
10016 return true;
10017 }
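// Illustration (not part of the patch): how AdjustBlendMask behaves on a
// few hypothetical inputs.
//   Widening:  AdjustBlendMask(0b01, /*OldWidth=*/2, /*NewWidth=*/4) succeeds
//              with NewMask = 0b0011 - each selected 64-bit lane expands to
//              two selected 32-bit lanes.
//   Narrowing: AdjustBlendMask(0b1100, 4, 2) succeeds with NewMask = 0b10,
//              but AdjustBlendMask(0b0100, 4, 2) returns false because the
//              upper 64-bit lane would only be half-selected, so no coarser
//              blend immediate can express that mask.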
10018
10019 uint16_t X86InstrInfo::getExecutionDomainCustom(const MachineInstr &MI) const {
10020 unsigned Opcode = MI.getOpcode();
10021 unsigned NumOperands = MI.getNumOperands();
10022
10023 auto GetBlendDomains = [&](unsigned ImmWidth, bool Is256) {
10024 uint16_t validDomains = 0;
10025 if (MI.getOperand(NumOperands - 1).isImm()) {
10026 unsigned Imm = MI.getOperand(NumOperands - 1).getImm();
10027 if (AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4))
10028 validDomains |= 0x2; // PackedSingle
10029 if (AdjustBlendMask(Imm, ImmWidth, Is256 ? 4 : 2))
10030 validDomains |= 0x4; // PackedDouble
10031 if (!Is256 || Subtarget.hasAVX2())
10032 validDomains |= 0x8; // PackedInt
10033 }
10034 return validDomains;
10035 };
10036
10037 switch (Opcode) {
10038 case X86::BLENDPDrmi:
10039 case X86::BLENDPDrri:
10040 case X86::VBLENDPDrmi:
10041 case X86::VBLENDPDrri:
10042 return GetBlendDomains(2, false);
10043 case X86::VBLENDPDYrmi:
10044 case X86::VBLENDPDYrri:
10045 return GetBlendDomains(4, true);
10046 case X86::BLENDPSrmi:
10047 case X86::BLENDPSrri:
10048 case X86::VBLENDPSrmi:
10049 case X86::VBLENDPSrri:
10050 case X86::VPBLENDDrmi:
10051 case X86::VPBLENDDrri:
10052 return GetBlendDomains(4, false);
10053 case X86::VBLENDPSYrmi:
10054 case X86::VBLENDPSYrri:
10055 case X86::VPBLENDDYrmi:
10056 case X86::VPBLENDDYrri:
10057 return GetBlendDomains(8, true);
10058 case X86::PBLENDWrmi:
10059 case X86::PBLENDWrri:
10060 case X86::VPBLENDWrmi:
10061 case X86::VPBLENDWrri:
10062 // Treat VPBLENDWY as a 128-bit vector as it repeats the lo/hi masks.
10063 case X86::VPBLENDWYrmi:
10064 case X86::VPBLENDWYrri:
10065 return GetBlendDomains(8, false);
10066 }
10067 return 0;
10068 }
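// Illustration (not part of the patch): for BLENDPSrri with immediate 0b0010
// (a single 32-bit lane), GetBlendDomains(4, false) returns PackedSingle (0x2)
// and PackedInt (0x8) but not PackedDouble (0x4), because the mask cannot be
// narrowed to whole 64-bit lanes. Any BLENDPDrri immediate widens cleanly, so
// it reports all three domains (0xe).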
10069
10070 bool X86InstrInfo::setExecutionDomainCustom(MachineInstr &MI,
10071 unsigned Domain) const {
10072 assert(Domain > 0 && Domain < 4 && "Invalid execution domain");
10073 uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
10074 assert(dom && "Not an SSE instruction");
10075
10076 unsigned Opcode = MI.getOpcode();
10077 unsigned NumOperands = MI.getNumOperands();
10078
10079 auto SetBlendDomain = [&](unsigned ImmWidth, bool Is256) {
10080 if (MI.getOperand(NumOperands - 1).isImm()) {
10081 unsigned Imm = MI.getOperand(NumOperands - 1).getImm() & 255;
10082 Imm = (ImmWidth == 16 ? ((Imm << 8) | Imm) : Imm);
10083 unsigned NewImm = Imm;
10084
10085 const uint16_t *table = lookup(Opcode, dom, ReplaceableCustomInstrs);
10086 if (!table)
10087 table = lookup(Opcode, dom, ReplaceableCustomAVX2Instrs);
10088
10089 if (Domain == 1) { // PackedSingle
10090 AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm);
10091 } else if (Domain == 2) { // PackedDouble
10092 AdjustBlendMask(Imm, ImmWidth, Is256 ? 4 : 2, &NewImm);
10093 } else if (Domain == 3) { // PackedInt
10094 if (Subtarget.hasAVX2()) {
10095 // If we are already VPBLENDW use that, else use VPBLENDD.
10096 if ((ImmWidth / (Is256 ? 2 : 1)) != 8) {
10097 table = lookup(Opcode, dom, ReplaceableCustomAVX2Instrs);
10098 AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm);
10099 }
10100 } else {
10101 assert(!Is256 && "128-bit vector expected");
10102 AdjustBlendMask(Imm, ImmWidth, 8, &NewImm);
10103 }
10104 }
10105
10106 assert(table && table[Domain - 1] && "Unknown domain op");
10107 MI.setDesc(get(table[Domain - 1]));
10108 MI.getOperand(NumOperands - 1).setImm(NewImm & 255);
10109 }
10110 return true;
10111 };
10112
10113 switch (Opcode) {
10114 case X86::BLENDPDrmi:
10115 case X86::BLENDPDrri:
10116 case X86::VBLENDPDrmi:
10117 case X86::VBLENDPDrri:
10118 return SetBlendDomain(2, false);
10119 case X86::VBLENDPDYrmi:
10120 case X86::VBLENDPDYrri:
10121 return SetBlendDomain(4, true);
10122 case X86::BLENDPSrmi:
10123 case X86::BLENDPSrri:
10124 case X86::VBLENDPSrmi:
10125 case X86::VBLENDPSrri:
10126 case X86::VPBLENDDrmi:
10127 case X86::VPBLENDDrri:
10128 return SetBlendDomain(4, false);
10129 case X86::VBLENDPSYrmi:
10130 case X86::VBLENDPSYrri:
10131 case X86::VPBLENDDYrmi:
10132 case X86::VPBLENDDYrri:
10133 return SetBlendDomain(8, true);
10134 case X86::PBLENDWrmi:
10135 case X86::PBLENDWrri:
10136 case X86::VPBLENDWrmi:
10137 case X86::VPBLENDWrri:
10138 return SetBlendDomain(8, false);
10139 case X86::VPBLENDWYrmi:
10140 case X86::VPBLENDWYrri:
10141 return SetBlendDomain(16, true);
10142 }
10143 return false;
10144 }
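// Illustration (not part of the patch): switching BLENDPDrri with imm 0b01
// into the PackedInt domain on an SSE4.1-only target rewrites it to PBLENDWrri
// with the immediate widened to 0b00001111 (one 64-bit lane becomes four
// 16-bit lanes). With AVX2 available, the VBLENDPS/VBLENDPD forms are instead
// retargeted to VPBLENDD, which is generally the cheaper integer-domain blend.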
10145
997110146 std::pair<uint16_t, uint16_t>
997210147 X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const {
997310148 uint16_t domain = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
997410149 unsigned opcode = MI.getOpcode();
997510150 uint16_t validDomains = 0;
997610151 if (domain) {
9977 if (lookup(MI.getOpcode(), domain, ReplaceableInstrs)) {
10152 // Attempt to match for custom instructions.
10153 if (validDomains = getExecutionDomainCustom(MI)) {
10154 return std::make_pair(domain, validDomains);
10155 }
10156
10157 if (lookup(opcode, domain, ReplaceableInstrs)) {
997810158 validDomains = 0xe;
997910159 } else if (lookup(opcode, domain, ReplaceableInstrsAVX2)) {
998010160 validDomains = Subtarget.hasAVX2() ? 0xe : 0x6;
1000610186 assert(Domain>0 && Domain<4 && "Invalid execution domain");
1000710187 uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
1000810188 assert(dom && "Not an SSE instruction");
10189
10190 // Attempt to match for custom instructions.
10191 if (setExecutionDomainCustom(MI, Domain))
10192 return;
10193
1000910194 const uint16_t *table = lookup(MI.getOpcode(), dom, ReplaceableInstrs);
1001010195 if (!table) { // try the other table
1001110196 assert((Subtarget.hasAVX2() || Domain < 3) &&
489489 std::pair<uint16_t, uint16_t>
490490 getExecutionDomain(const MachineInstr &MI) const override;
491491
492 uint16_t getExecutionDomainCustom(const MachineInstr &MI) const;
493
492494 void setExecutionDomain(MachineInstr &MI, unsigned Domain) const override;
495
496 bool setExecutionDomainCustom(MachineInstr &MI, unsigned Domain) const;
493497
494498 unsigned
495499 getPartialRegUpdateClearance(const MachineInstr &MI, unsigned OpNum,
2020 ; AVX-LABEL: castB:
2121 ; AVX: ## %bb.0:
2222 ; AVX-NEXT: ## kill: def %xmm0 killed %xmm0 def %ymm0
23 ; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
24 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
23 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
24 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2525 ; AVX-NEXT: retq
2626 %shuffle.i = shufflevector <2 x double> %m, <2 x double> zeroinitializer, <4 x i32>
2727 ret <4 x double> %shuffle.i
3030 ; AVX2 is needed for integer types.
3131
3232 define <4 x i64> @castC(<2 x i64> %m) nounwind uwtable readnone ssp {
33 ; AVX1-LABEL: castC:
34 ; AVX1: ## %bb.0:
35 ; AVX1-NEXT: ## kill: def %xmm0 killed %xmm0 def %ymm0
36 ; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
37 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
38 ; AVX1-NEXT: retq
39 ;
40 ; AVX2-LABEL: castC:
41 ; AVX2: ## %bb.0:
42 ; AVX2-NEXT: ## kill: def %xmm0 killed %xmm0 def %ymm0
43 ; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
44 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
45 ; AVX2-NEXT: retq
33 ; AVX-LABEL: castC:
34 ; AVX: ## %bb.0:
35 ; AVX-NEXT: ## kill: def %xmm0 killed %xmm0 def %ymm0
36 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
37 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
38 ; AVX-NEXT: retq
4639 %shuffle.i = shufflevector <2 x i64> %m, <2 x i64> zeroinitializer, <4 x i32>
4740 ret <4 x i64> %shuffle.i
4841 }
1515 ; ALL-LABEL: insert_f64:
1616 ; ALL: # %bb.0:
1717 ; ALL-NEXT: # kill: def %xmm1 killed %xmm1 def %ymm1
18 ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
18 ; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
1919 ; ALL-NEXT: retq
2020 %i0 = insertelement <4 x double> %y, double %f, i32 0
2121 ret <4 x double> %i0
140140 define <4 x double> @test_mm256_blend_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
141141 ; X32-LABEL: test_mm256_blend_pd:
142142 ; X32: # %bb.0:
143 ; X32-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
143 ; X32-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
144144 ; X32-NEXT: retl
145145 ;
146146 ; X64-LABEL: test_mm256_blend_pd:
147147 ; X64: # %bb.0:
148 ; X64-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
148 ; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
149149 ; X64-NEXT: retq
150150 %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32>
151151 ret <4 x double> %res
10431043 ; X32-LABEL: test_mm256_insertf128_pd:
10441044 ; X32: # %bb.0:
10451045 ; X32-NEXT: # kill: def %xmm1 killed %xmm1 def %ymm1
1046 ; X32-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
1046 ; X32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10471047 ; X32-NEXT: retl
10481048 ;
10491049 ; X64-LABEL: test_mm256_insertf128_pd:
10501050 ; X64: # %bb.0:
10511051 ; X64-NEXT: # kill: def %xmm1 killed %xmm1 def %ymm1
1052 ; X64-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
1052 ; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10531053 ; X64-NEXT: retq
10541054 %ext = shufflevector <2 x double> %a1, <2 x double> %a1, <4 x i32>
10551055 %res = shufflevector <4 x double> %a0, <4 x double> %ext, <4 x i32>
10751075 ; X32-LABEL: test_mm256_insertf128_si256:
10761076 ; X32: # %bb.0:
10771077 ; X32-NEXT: # kill: def %xmm1 killed %xmm1 def %ymm1
1078 ; X32-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
1078 ; X32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10791079 ; X32-NEXT: retl
10801080 ;
10811081 ; X64-LABEL: test_mm256_insertf128_si256:
10821082 ; X64: # %bb.0:
10831083 ; X64-NEXT: # kill: def %xmm1 killed %xmm1 def %ymm1
1084 ; X64-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
1084 ; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10851085 ; X64-NEXT: retq
10861086 %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32>
10871087 %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32>
3939 ; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_2:
4040 ; CHECK: # %bb.0:
4141 ; CHECK-NEXT: # kill: def %xmm1 killed %xmm1 def %ymm1
42 ; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
42 ; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4343 ; CHECK-NEXT: ret{{[l|q]}}
4444 %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 2)
4545 ret <8 x i32> %res
132132 define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) {
133133 ; CHECK-LABEL: test_x86_avx_blend_pd_256:
134134 ; CHECK: # %bb.0:
135 ; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
135 ; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
136136 ; CHECK-NEXT: ret{{[l|q]}}
137137 %res = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 7) ; <<4 x double>> [#uses=1]
138138 ret <4 x double> %res
187187 define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) {
188188 ; CHECK-LABEL: test_x86_sse41_blendpd:
189189 ; CHECK: # %bb.0:
190 ; CHECK-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
190 ; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
191191 ; CHECK-NEXT: ret{{[l|q]}}
192192 %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i8 2) ; <<2 x double>> [#uses=1]
193193 ret <2 x double> %res
3636 define <8 x float> @shuffle_v8f32_0123cdef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
3737 ; ALL-LABEL: shuffle_v8f32_0123cdef:
3838 ; ALL: # %bb.0: # %entry
39 ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
39 ; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4040 ; ALL-NEXT: retq
4141 entry:
4242 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32>
379379 define <4 x double> @shuffle_v4f64_zz23(<4 x double> %a) {
380380 ; ALL-LABEL: shuffle_v4f64_zz23:
381381 ; ALL: # %bb.0:
382 ; ALL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
383 ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
382 ; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
383 ; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
384384 ; ALL-NEXT: retq
385385 %s = shufflevector <4 x double> %a, <4 x double> , <4 x i32>
386386 ret <4 x double> %s
388388 define <4 x double> @shuffle_v4f64_zz23_optsize(<4 x double> %a) optsize {
389389 ; ALL-LABEL: shuffle_v4f64_zz23_optsize:
390390 ; ALL: # %bb.0:
391 ; ALL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
392 ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
391 ; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
392 ; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
393393 ; ALL-NEXT: retq
394394 %s = shufflevector <4 x double> %a, <4 x double> , <4 x i32>
395395 ret <4 x double> %s
415415 define <4 x double> @shuffle_v4f64_zz67(<4 x double> %a) {
416416 ; ALL-LABEL: shuffle_v4f64_zz67:
417417 ; ALL: # %bb.0:
418 ; ALL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
419 ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
418 ; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
419 ; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
420420 ; ALL-NEXT: retq
421421 %s = shufflevector <4 x double> , <4 x double> %a, <4 x i32>
422422 ret <4 x double> %s
424424 define <4 x double> @shuffle_v4f64_zz67_optsize(<4 x double> %a) optsize {
425425 ; ALL-LABEL: shuffle_v4f64_zz67_optsize:
426426 ; ALL: # %bb.0:
427 ; ALL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
428 ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
427 ; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
428 ; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
429429 ; ALL-NEXT: retq
430430 %s = shufflevector <4 x double> , <4 x double> %a, <4 x i32>
431431 ret <4 x double> %s
434434 define <4 x double> @shuffle_v4f64_01zz(<4 x double> %a) {
435435 ; ALL-LABEL: shuffle_v4f64_01zz:
436436 ; ALL: # %bb.0:
437 ; ALL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
438 ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
437 ; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
438 ; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
439439 ; ALL-NEXT: retq
440440 %s = shufflevector <4 x double> %a, <4 x double> , <4 x i32>
441441 ret <4 x double> %s
443443 define <4 x double> @shuffle_v4f64_01zz_optsize(<4 x double> %a) optsize {
444444 ; ALL-LABEL: shuffle_v4f64_01zz_optsize:
445445 ; ALL: # %bb.0:
446 ; ALL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
447 ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
446 ; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
447 ; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
448448 ; ALL-NEXT: retq
449449 %s = shufflevector <4 x double> %a, <4 x double> , <4 x i32>
450450 ret <4 x double> %s
470470 define <4 x double> @shuffle_v4f64_45zz(<4 x double> %a) {
471471 ; ALL-LABEL: shuffle_v4f64_45zz:
472472 ; ALL: # %bb.0:
473 ; ALL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
474 ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
473 ; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
474 ; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
475475 ; ALL-NEXT: retq
476476 %s = shufflevector <4 x double> , <4 x double> %a, <4 x i32>
477477 ret <4 x double> %s
479479 define <4 x double> @shuffle_v4f64_45zz_optsize(<4 x double> %a) optsize {
480480 ; ALL-LABEL: shuffle_v4f64_45zz_optsize:
481481 ; ALL: # %bb.0:
482 ; ALL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
483 ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
482 ; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
483 ; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
484484 ; ALL-NEXT: retq
485485 %s = shufflevector <4 x double> , <4 x double> %a, <4 x i32>
486486 ret <4 x double> %s
510510 ; AVX1: # %bb.0:
511511 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
512512 ; AVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0
513 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
513 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
514514 ; AVX1-NEXT: retq
515515 ;
516516 ; AVX2-LABEL: shuffle_v4i64_67zz:
11111111 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
11121112 ret <4 x i32> %res
11131113 }
1114 define <4 x i32> @test_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec) {
1115 ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask3:
1116 ; CHECK: # %bb.0:
1117 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
1118 ; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
1119 ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,1]
1120 ; CHECK-NEXT: vzeroupper
1121 ; CHECK-NEXT: retq
1122 %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32>
1114 define <4 x i32> @test_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec) {
1115 ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask3:
1116 ; CHECK: # %bb.0:
1117 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
1118 ; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
1119 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,1]
1120 ; CHECK-NEXT: vzeroupper
1121 ; CHECK-NEXT: retq
1122 %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32>
11231123 ret <4 x i32> %res
11241124 }
11251125 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
30833083 ret <4 x float> %res
30843084 }
30853085
3086 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask1(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
3087 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask1:
3088 ; CHECK: # %bb.0:
3089 ; CHECK-NEXT: vmovdqa (%rdi), %ymm2
3090 ; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3
3091 ; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
3092 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
3093 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
3094 ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} = xmm2[2,3,3,2]
3086 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask1(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
3087 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask1:
3088 ; CHECK: # %bb.0:
3089 ; CHECK-NEXT: vmovaps (%rdi), %ymm2
3090 ; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm3
3091 ; CHECK-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
3092 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
3093 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
3094 ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} = xmm2[2,3,3,2]
30953095 ; CHECK-NEXT: vzeroupper
30963096 ; CHECK-NEXT: retq
30973097 %vec = load <8 x float>, <8 x float>* %vp
31013101 ret <4 x float> %res
31023102 }
31033103
3104 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1(<8 x float>* %vp, <4 x float> %mask) {
3105 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1:
3106 ; CHECK: # %bb.0:
3107 ; CHECK-NEXT: vmovdqa (%rdi), %ymm1
3108 ; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2
3109 ; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
3110 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
3111 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1
3112 ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} {z} = xmm1[2,3,3,2]
3104 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1(<8 x float>* %vp, <4 x float> %mask) {
3105 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1:
3106 ; CHECK: # %bb.0:
3107 ; CHECK-NEXT: vmovaps (%rdi), %ymm1
3108 ; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
3109 ; CHECK-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
3110 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
3111 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1
3112 ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} {z} = xmm1[2,3,3,2]
31133113 ; CHECK-NEXT: vzeroupper
31143114 ; CHECK-NEXT: retq
31153115 %vec = load <8 x float>, <8 x float>* %vp
33973397 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
33983398 ret <4 x float> %res
33993399 }
3400 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
3401 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask1:
3402 ; CHECK: # %bb.0:
3403 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
3404 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
3405 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,2]
3406 ; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2],xmm0[3]
3407 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
3408 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
3409 ; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
3400 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
3401 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask1:
3402 ; CHECK: # %bb.0:
3403 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
3404 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
3405 ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,2]
3406 ; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2],xmm0[3]
3407 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
3408 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
3409 ; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
34103410 ; CHECK-NEXT: vzeroupper
34113411 ; CHECK-NEXT: retq
34123412 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32>
34153415 ret <4 x float> %res
34163416 }
34173417
3418 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %mask) {
3419 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1:
3420 ; CHECK: # %bb.0:
3421 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
3422 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
3423 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,2]
3424 ; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
3425 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
3426 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
3427 ; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z}
3418 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %mask) {
3419 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1:
3420 ; CHECK: # %bb.0:
3421 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
3422 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
3423 ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,2]
3424 ; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
3425 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
3426 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
3427 ; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z}
34283428 ; CHECK-NEXT: vzeroupper
34293429 ; CHECK-NEXT: retq
34303430 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32>
34773477 %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32>
34783478 ret <4 x float> %res
34793479 }
3480 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
3481 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask3:
3482 ; CHECK: # %bb.0:
3483 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,6,4,6,6,7]
3484 ; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm3
3485 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
3486 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
3487 ; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3]
3488 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
3489 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
3490 ; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
3480 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
3481 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask3:
3482 ; CHECK: # %bb.0:
3483 ; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,2,4,6,4,6,6,7]
3484 ; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm3
3485 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0
3486 ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
3487 ; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3]
3488 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
3489 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
3490 ; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
34913491 ; CHECK-NEXT: vzeroupper
34923492 ; CHECK-NEXT: retq
34933493 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32>
34963496 ret <4 x float> %res
34973497 }
34983498
3499 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %mask) {
3500 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask3:
3501 ; CHECK: # %bb.0:
3502 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
3503 ; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm2
3504 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
3505 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
3506 ; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
3507 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
3508 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
3509 ; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z}
3499 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %mask) {
3500 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask3:
3501 ; CHECK: # %bb.0:
3502 ; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
3503 ; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm2
3504 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0
3505 ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
3506 ; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
3507 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
3508 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
3509 ; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z}
35103510 ; CHECK-NEXT: vzeroupper
35113511 ; CHECK-NEXT: retq
35123512 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32>
36993699 %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32>
37003700 ret <4 x float> %res
37013701 }
3702 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
3703 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask0:
3704 ; CHECK: # %bb.0:
3705 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
3706 ; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3
3707 ; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,3,3]
3708 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm2
3709 ; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,1,2,3]
3710 ; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
3711 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
3712 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
3713 ; CHECK-NEXT: vmovaps %xmm2, %xmm0 {%k1}
3702 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
3703 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask0:
3704 ; CHECK: # %bb.0:
3705 ; CHECK-NEXT: vmovaps (%rdi), %zmm2
3706 ; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm3
3707 ; CHECK-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,2,3,3]
3708 ; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm2
3709 ; CHECK-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[3,1,2,3]
3710 ; CHECK-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
3711 ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
3712 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
3713 ; CHECK-NEXT: vmovaps %xmm2, %xmm0 {%k1}
37143714 ; CHECK-NEXT: vzeroupper
37153715 ; CHECK-NEXT: retq
37163716 %vec = load <16 x float>, <16 x float>* %vp
37203720 ret <4 x float> %res
37213721 }
37223722
3723 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp, <4 x float> %mask) {
3724 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0:
3725 ; CHECK: # %bb.0:
3726 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
3727 ; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2
3728 ; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,3,3]
3729 ; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm1
3730 ; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,1,2,3]
3731 ; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3]
3732 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
3733 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1
3734 ; CHECK-NEXT: vmovaps %xmm1, %xmm0 {%k1} {z}
3723 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp, <4 x float> %mask) {
3724 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0:
3725 ; CHECK: # %bb.0:
3726 ; CHECK-NEXT: vmovaps (%rdi), %zmm1
3727 ; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
3728 ; CHECK-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,2,3,3]
3729 ; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm1
3730 ; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,1,2,3]
3731 ; CHECK-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3]
3732 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
3733 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1
3734 ; CHECK-NEXT: vmovaps %xmm1, %xmm0 {%k1} {z}
37353735 ; CHECK-NEXT: vzeroupper
37363736 ; CHECK-NEXT: retq
37373737 %vec = load <16 x float>, <16 x float>* %vp
32553255 define <8 x float>@test_int_x86_avx512_mask_shuf_f32x4_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x3, i8 %x4) {
32563256 ; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f32x4_256:
32573257 ; CHECK: ## %bb.0:
3258 ; CHECK-NEXT: vblendpd $12, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x0d,0xc1,0x0c]
3259 ; CHECK-NEXT: ## ymm0 = ymm0[0,1],ymm1[2,3]
3258 ; CHECK-NEXT: vblendps $240, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x0c,0xc1,0xf0]
3259 ; CHECK-NEXT: ## ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
32603260 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
32613261 ; CHECK-NEXT: vmovaps %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x28,0xd0]
32623262 ; CHECK-NEXT: vmovaps %ymm0, %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x28,0xc8]
7171 ;
7272 ; SSE41-LABEL: test_negative_zero_2:
7373 ; SSE41: # %bb.0: # %entry
74 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
74 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
7575 ; SSE41-NEXT: retq
7676 entry:
7777 %0 = extractelement <2 x double> %A, i32 0
1515 ;
1616 ; SSE42-LABEL: _clearupper2xi64a:
1717 ; SSE42: # %bb.0:
18 ; SSE42-NEXT: pxor %xmm1, %xmm1
19 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
18 ; SSE42-NEXT: xorps %xmm1, %xmm1
19 ; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
2020 ; SSE42-NEXT: retq
2121 ;
22 ; AVX1-LABEL: _clearupper2xi64a:
23 ; AVX1: # %bb.0:
24 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
25 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
26 ; AVX1-NEXT: retq
27 ;
28 ; AVX2-LABEL: _clearupper2xi64a:
29 ; AVX2: # %bb.0:
30 ; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
31 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
32 ; AVX2-NEXT: retq
22 ; AVX-LABEL: _clearupper2xi64a:
23 ; AVX: # %bb.0:
24 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
25 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
26 ; AVX-NEXT: retq
3327 %x0 = extractelement <2 x i64> %0, i32 0
3428 %x1 = extractelement <2 x i64> %0, i32 1
3529 %trunc0 = trunc i64 %x0 to i32
5145 ;
5246 ; SSE42-LABEL: _clearupper4xi64a:
5347 ; SSE42: # %bb.0:
54 ; SSE42-NEXT: pxor %xmm2, %xmm2
55 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
56 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
48 ; SSE42-NEXT: xorps %xmm2, %xmm2
49 ; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
50 ; SSE42-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
5751 ; SSE42-NEXT: retq
5852 ;
5953 ; AVX-LABEL: _clearupper4xi64a:
672666 ;
673667 ; SSE42-LABEL: _clearupper2xi64b:
674668 ; SSE42: # %bb.0:
675 ; SSE42-NEXT: pxor %xmm1, %xmm1
676 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
669 ; SSE42-NEXT: xorps %xmm1, %xmm1
670 ; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
677671 ; SSE42-NEXT: retq
678672 ;
679 ; AVX1-LABEL: _clearupper2xi64b:
680 ; AVX1: # %bb.0:
681 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
682 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
683 ; AVX1-NEXT: retq
684 ;
685 ; AVX2-LABEL: _clearupper2xi64b:
686 ; AVX2: # %bb.0:
687 ; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
688 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
689 ; AVX2-NEXT: retq
673 ; AVX-LABEL: _clearupper2xi64b:
674 ; AVX: # %bb.0:
675 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
676 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
677 ; AVX-NEXT: retq
690678 %x32 = bitcast <2 x i64> %0 to <4 x i32>
691679 %r0 = insertelement <4 x i32> %x32, i32 zeroinitializer, i32 1
692680 %r1 = insertelement <4 x i32> %r0, i32 zeroinitializer, i32 3
704692 ;
705693 ; SSE42-LABEL: _clearupper4xi64b:
706694 ; SSE42: # %bb.0:
707 ; SSE42-NEXT: pxor %xmm2, %xmm2
708 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
709 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
695 ; SSE42-NEXT: xorps %xmm2, %xmm2
696 ; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
697 ; SSE42-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
710698 ; SSE42-NEXT: retq
711699 ;
712700 ; AVX-LABEL: _clearupper4xi64b:
16381626 ;
16391627 ; SSE42-LABEL: _clearupper2xi64c:
16401628 ; SSE42: # %bb.0:
1641 ; SSE42-NEXT: pxor %xmm1, %xmm1
1642 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
1629 ; SSE42-NEXT: xorps %xmm1, %xmm1
1630 ; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
16431631 ; SSE42-NEXT: retq
16441632 ;
1645 ; AVX1-LABEL: _clearupper2xi64c:
1646 ; AVX1: # %bb.0:
1647 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
1648 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
1649 ; AVX1-NEXT: retq
1650 ;
1651 ; AVX2-LABEL: _clearupper2xi64c:
1652 ; AVX2: # %bb.0:
1653 ; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
1654 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
1655 ; AVX2-NEXT: retq
1633 ; AVX-LABEL: _clearupper2xi64c:
1634 ; AVX: # %bb.0:
1635 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
1636 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
1637 ; AVX-NEXT: retq
16561638 %r = and <2 x i64> , %0
16571639 ret <2 x i64> %r
16581640 }
16671649 ;
16681650 ; SSE42-LABEL: _clearupper4xi64c:
16691651 ; SSE42: # %bb.0:
1670 ; SSE42-NEXT: pxor %xmm2, %xmm2
1671 ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
1672 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1652 ; SSE42-NEXT: xorps %xmm2, %xmm2
1653 ; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
1654 ; SSE42-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
16731655 ; SSE42-NEXT: retq
16741656 ;
16751657 ; AVX-LABEL: _clearupper4xi64c:
1414 ;
1515 ; SSE41-LABEL: insert_f64:
1616 ; SSE41: # %bb.0:
17 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
17 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1818 ; SSE41-NEXT: retq
1919 ;
2020 ; AVX-LABEL: insert_f64:
2121 ; AVX: # %bb.0:
22 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
22 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2323 ; AVX-NEXT: retq
2424 ;
2525 ; AVX512-LABEL: insert_f64:
2626 define <4 x i32> @test1(<4 x i32> %A) {
2727 ; CHECK-LABEL: test1:
2828 ; CHECK: # %bb.0:
29 ; CHECK-NEXT: pxor %xmm1, %xmm1
30 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
29 ; CHECK-NEXT: xorps %xmm1, %xmm1
30 ; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
3131 ; CHECK-NEXT: retq
3232 %1 = and <4 x i32> %A,
3333 ret <4 x i32> %1
3636 define <4 x i32> @test2(<4 x i32> %A) {
3737 ; CHECK-LABEL: test2:
3838 ; CHECK: # %bb.0:
39 ; CHECK-NEXT: pxor %xmm1, %xmm1
40 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
39 ; CHECK-NEXT: xorps %xmm1, %xmm1
40 ; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
4141 ; CHECK-NEXT: retq
4242 %1 = and <4 x i32> %A,
4343 ret <4 x i32> %1
4646 define <4 x i32> @test3(<4 x i32> %A) {
4747 ; CHECK-LABEL: test3:
4848 ; CHECK: # %bb.0:
49 ; CHECK-NEXT: pxor %xmm1, %xmm1
50 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
49 ; CHECK-NEXT: xorps %xmm1, %xmm1
50 ; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
5151 ; CHECK-NEXT: retq
5252 %1 = and <4 x i32> %A,
5353 ret <4 x i32> %1
5656 define <4 x i32> @test4(<4 x i32> %A) {
5757 ; CHECK-LABEL: test4:
5858 ; CHECK: # %bb.0:
59 ; CHECK-NEXT: pxor %xmm1, %xmm1
60 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
59 ; CHECK-NEXT: xorps %xmm1, %xmm1
60 ; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
6161 ; CHECK-NEXT: retq
6262 %1 = and <4 x i32> %A,
6363 ret <4 x i32> %1
6666 define <4 x i32> @test5(<4 x i32> %A) {
6767 ; CHECK-LABEL: test5:
6868 ; CHECK: # %bb.0:
69 ; CHECK-NEXT: pxor %xmm1, %xmm1
70 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
69 ; CHECK-NEXT: xorps %xmm1, %xmm1
70 ; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
7171 ; CHECK-NEXT: retq
7272 %1 = and <4 x i32> %A,
7373 ret <4 x i32> %1
7676 define <4 x i32> @test6(<4 x i32> %A) {
7777 ; CHECK-LABEL: test6:
7878 ; CHECK: # %bb.0:
79 ; CHECK-NEXT: pxor %xmm1, %xmm1
80 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
79 ; CHECK-NEXT: xorps %xmm1, %xmm1
80 ; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
8181 ; CHECK-NEXT: retq
8282 %1 = and <4 x i32> %A,
8383 ret <4 x i32> %1
8686 define <4 x i32> @test7(<4 x i32> %A) {
8787 ; CHECK-LABEL: test7:
8888 ; CHECK: # %bb.0:
89 ; CHECK-NEXT: pxor %xmm1, %xmm1
90 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
89 ; CHECK-NEXT: xorps %xmm1, %xmm1
90 ; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
9191 ; CHECK-NEXT: retq
9292 %1 = and <4 x i32> %A,
9393 ret <4 x i32> %1
9696 define <4 x i32> @test8(<4 x i32> %A) {
9797 ; CHECK-LABEL: test8:
9898 ; CHECK: # %bb.0:
99 ; CHECK-NEXT: pxor %xmm1, %xmm1
100 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
99 ; CHECK-NEXT: xorps %xmm1, %xmm1
100 ; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
101101 ; CHECK-NEXT: retq
102102 %1 = and <4 x i32> %A,
103103 ret <4 x i32> %1
115115 define <4 x i32> @test10(<4 x i32> %A) {
116116 ; CHECK-LABEL: test10:
117117 ; CHECK: # %bb.0:
118 ; CHECK-NEXT: pxor %xmm1, %xmm1
119 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
118 ; CHECK-NEXT: xorps %xmm1, %xmm1
119 ; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
120120 ; CHECK-NEXT: retq
121121 %1 = and <4 x i32> %A,
122122 ret <4 x i32> %1
125125 define <4 x i32> @test11(<4 x i32> %A) {
126126 ; CHECK-LABEL: test11:
127127 ; CHECK: # %bb.0:
128 ; CHECK-NEXT: pxor %xmm1, %xmm1
129 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
128 ; CHECK-NEXT: xorps %xmm1, %xmm1
129 ; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
130130 ; CHECK-NEXT: retq
131131 %1 = and <4 x i32> %A,
132132 ret <4 x i32> %1
135135 define <4 x i32> @test12(<4 x i32> %A) {
136136 ; CHECK-LABEL: test12:
137137 ; CHECK: # %bb.0:
138 ; CHECK-NEXT: pxor %xmm1, %xmm1
139 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
138 ; CHECK-NEXT: xorps %xmm1, %xmm1
139 ; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
140140 ; CHECK-NEXT: retq
141141 %1 = and <4 x i32> %A,
142142 ret <4 x i32> %1
145145 define <4 x i32> @test13(<4 x i32> %A) {
146146 ; CHECK-LABEL: test13:
147147 ; CHECK: # %bb.0:
148 ; CHECK-NEXT: pxor %xmm1, %xmm1
149 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
148 ; CHECK-NEXT: xorps %xmm1, %xmm1
149 ; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
150150 ; CHECK-NEXT: retq
151151 %1 = and <4 x i32> %A,
152152 ret <4 x i32> %1
155155 define <4 x i32> @test14(<4 x i32> %A) {
156156 ; CHECK-LABEL: test14:
157157 ; CHECK: # %bb.0:
158 ; CHECK-NEXT: pxor %xmm1, %xmm1
159 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
158 ; CHECK-NEXT: xorps %xmm1, %xmm1
159 ; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
160160 ; CHECK-NEXT: retq
161161 %1 = and <4 x i32> %A,
162162 ret <4 x i32> %1
165165 define <4 x i32> @test15(<4 x i32> %A, <4 x i32> %B) {
166166 ; CHECK-LABEL: test15:
167167 ; CHECK: # %bb.0:
168 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
168 ; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
169169 ; CHECK-NEXT: retq
170170 %1 = and <4 x i32> %A,
171171 %2 = and <4 x i32> %B,
176176 define <4 x i32> @test16(<4 x i32> %A, <4 x i32> %B) {
177177 ; CHECK-LABEL: test16:
178178 ; CHECK: # %bb.0:
179 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
179 ; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
180180 ; CHECK-NEXT: retq
181181 %1 = and <4 x i32> %A,
182182 %2 = and <4 x i32> %B,
187187 define <4 x i32> @test17(<4 x i32> %A, <4 x i32> %B) {
188188 ; CHECK-LABEL: test17:
189189 ; CHECK: # %bb.0:
190 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
190 ; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
191191 ; CHECK-NEXT: retq
192192 %1 = and <4 x i32> %A,
193193 %2 = and <4 x i32> %B,
2323 define <2 x i64> @test1(<2 x i64> %a, <2 x i64> %b) {
2424 ; CHECK-LABEL: test1:
2525 ; CHECK: # %bb.0:
26 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
26 ; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2727 ; CHECK-NEXT: retq
2828 %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32>
2929 %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32>
3535 define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b) {
3636 ; CHECK-LABEL: test2:
3737 ; CHECK: # %bb.0:
38 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
38 ; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
3939 ; CHECK-NEXT: retq
4040 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32>
4141 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32>
4747 define <2 x i64> @test3(<2 x i64> %a, <2 x i64> %b) {
4848 ; CHECK-LABEL: test3:
4949 ; CHECK: # %bb.0:
50 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
50 ; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
5151 ; CHECK-NEXT: retq
5252 %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32>
5353 %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32>
5959 define <4 x i32> @test4(<4 x i32> %a, <4 x i32> %b) {
6060 ; CHECK-LABEL: test4:
6161 ; CHECK: # %bb.0:
62 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
62 ; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
6363 ; CHECK-NEXT: retq
6464 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32>
6565 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32>
7171 define <4 x i32> @test5(<4 x i32> %a, <4 x i32> %b) {
7272 ; CHECK-LABEL: test5:
7373 ; CHECK: # %bb.0:
74 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
74 ; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
7575 ; CHECK-NEXT: retq
7676 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32>
7777 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32>
8383 define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b) {
8484 ; CHECK-LABEL: test6:
8585 ; CHECK: # %bb.0:
86 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
86 ; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
8787 ; CHECK-NEXT: retq
8888 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32>
8989 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32>
9595 define <4 x i32> @test7(<4 x i32> %a, <4 x i32> %b) {
9696 ; CHECK-LABEL: test7:
9797 ; CHECK: # %bb.0:
98 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
98 ; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
9999 ; CHECK-NEXT: retq
100100 %and1 = and <4 x i32> %a,
101101 %and2 = and <4 x i32> %b,
107107 define <2 x i64> @test8(<2 x i64> %a, <2 x i64> %b) {
108108 ; CHECK-LABEL: test8:
109109 ; CHECK: # %bb.0:
110 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
110 ; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
111111 ; CHECK-NEXT: retq
112112 %and1 = and <2 x i64> %a,
113113 %and2 = and <2 x i64> %b,
119119 define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) {
120120 ; CHECK-LABEL: test9:
121121 ; CHECK: # %bb.0:
122 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
122 ; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
123123 ; CHECK-NEXT: retq
124124 %and1 = and <4 x i32> %a,
125125 %and2 = and <4 x i32> %b,
131131 define <2 x i64> @test10(<2 x i64> %a, <2 x i64> %b) {
132132 ; CHECK-LABEL: test10:
133133 ; CHECK: # %bb.0:
134 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
134 ; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
135135 ; CHECK-NEXT: retq
136136 %and1 = and <2 x i64> %a,
137137 %and2 = and <2 x i64> %b,
143143 define <4 x i32> @test11(<4 x i32> %a, <4 x i32> %b) {
144144 ; CHECK-LABEL: test11:
145145 ; CHECK: # %bb.0:
146 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
146 ; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
147147 ; CHECK-NEXT: retq
148148 %and1 = and <4 x i32> %a,
149149 %and2 = and <4 x i32> %b,
155155 define <4 x i32> @test12(<4 x i32> %a, <4 x i32> %b) {
156156 ; CHECK-LABEL: test12:
157157 ; CHECK: # %bb.0:
158 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
158 ; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
159159 ; CHECK-NEXT: retq
160160 %and1 = and <4 x i32> %a,
161161 %and2 = and <4 x i32> %b,
298298 define <2 x double> @test22(<2 x double> %a0, <2 x double> %a1) {
299299 ; CHECK-LABEL: test22:
300300 ; CHECK: # %bb.0:
301 ; CHECK-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
301 ; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
302302 ; CHECK-NEXT: retq
303303 %bc1 = bitcast <2 x double> %a0 to <2 x i64>
304304 %bc2 = bitcast <2 x double> %a1 to <2 x i64>
328328 define <4 x float> @test24(<4 x float> %a0, <4 x float> %a1) {
329329 ; CHECK-LABEL: test24:
330330 ; CHECK: # %bb.0:
331 ; CHECK-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
331 ; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
332332 ; CHECK-NEXT: retq
333333 %bc1 = bitcast <4 x float> %a0 to <2 x i64>
334334 %bc2 = bitcast <4 x float> %a1 to <2 x i64>
361361 define <4 x i8> @test_crash(<4 x i8> %a, <4 x i8> %b) {
362362 ; CHECK-LABEL: test_crash:
363363 ; CHECK: # %bb.0:
364 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
364 ; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
365365 ; CHECK-NEXT: retq
366366 %shuf1 = shufflevector <4 x i8> %a, <4 x i8> zeroinitializer, <4 x i32>
367367 %shuf2 = shufflevector <4 x i8> %b, <4 x i8> zeroinitializer, <4 x i32>
374374 define <4 x i32> @test2b(<4 x i32> %a, <4 x i32> %b) {
375375 ; CHECK-LABEL: test2b:
376376 ; CHECK: # %bb.0:
377 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
377 ; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
378378 ; CHECK-NEXT: retq
379379 %shuf1 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32>
380380 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32>
385385 define <4 x i32> @test2c(<4 x i32> %a, <4 x i32> %b) {
386386 ; CHECK-LABEL: test2c:
387387 ; CHECK: # %bb.0:
388 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
388 ; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
389389 ; CHECK-NEXT: retq
390390 %shuf1 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32>
391391 %shuf2 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %b, <4 x i32>
397397 define <4 x i32> @test2d(<4 x i32> %a, <4 x i32> %b) {
398398 ; CHECK-LABEL: test2d:
399399 ; CHECK: # %bb.0:
400 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
400 ; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
401401 ; CHECK-NEXT: retq
402402 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32>
403403 %shuf2 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %b, <4 x i32>
410410 define <4 x i32> @test2e(<4 x i32> %a, <4 x i32> %b) {
411411 ; CHECK-LABEL: test2e:
412412 ; CHECK: # %bb.0:
413 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
413 ; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
414414 ; CHECK-NEXT: retq
415415 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> , <4 x i32>
416416 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> , <4 x i32>
421421 define <4 x i32> @test2f(<4 x i32> %a, <4 x i32> %b) {
422422 ; CHECK-LABEL: test2f:
423423 ; CHECK: # %bb.0:
424 ; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
424 ; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
425425 ; CHECK-NEXT: retq
426426 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> , <4 x i32>
427427 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> , <4 x i32>
214214 ; SSE-NEXT: movaps %xmm0, %xmm2
215215 ; SSE-NEXT: movaps %xmm0, %xmm1
216216 ; SSE-NEXT: psrad $2, %xmm1
217 ; SSE-NEXT: blendpd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
217 ; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
218218 ; SSE-NEXT: psrad $3, %xmm0
219219 ; SSE-NEXT: psrad $1, %xmm2
220220 ; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
257257 ; SSE-NEXT: movaps %xmm0, %xmm2
258258 ; SSE-NEXT: movaps %xmm0, %xmm1
259259 ; SSE-NEXT: psrad $2, %xmm1
260 ; SSE-NEXT: blendpd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
260 ; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
261261 ; SSE-NEXT: psrad $3, %xmm0
262262 ; SSE-NEXT: psrad $1, %xmm2
263263 ; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
6969 define <2 x double> @commute_fold_vblendpd_128(<2 x double> %a, <2 x double>* %b) #0 {
7070 ; CHECK-LABEL: commute_fold_vblendpd_128:
7171 ; CHECK: # %bb.0:
72 ; CHECK-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
72 ; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
7373 ; CHECK-NEXT: retq
7474 %1 = load <2 x double>, <2 x double>* %b
7575 %2 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %1, <2 x double> %a, i8 1)
8080 define <4 x double> @commute_fold_vblendpd_256(<4 x double> %a, <4 x double>* %b) #0 {
8181 ; CHECK-LABEL: commute_fold_vblendpd_256:
8282 ; CHECK: # %bb.0:
83 ; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],mem[3]
83 ; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
8484 ; CHECK-NEXT: retq
8585 %1 = load <4 x double>, <4 x double>* %b
8686 %2 = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %1, <4 x double> %a, i8 7)
2525 define <2 x double> @commute_fold_blendpd(<2 x double> %a, <2 x double>* %b) #0 {
2626 ; CHECK-LABEL: commute_fold_blendpd:
2727 ; CHECK: # %bb.0:
28 ; CHECK-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
28 ; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
2929 ; CHECK-NEXT: retq
3030 %1 = load <2 x double>, <2 x double>* %b
3131 %2 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %1, <2 x double> %a, i8 1)
99 define <4 x i32> @test(<4 x i32> %a, <4 x i32> %b) {
1010 ; CHECK: pblendw $63, %xmm1, %xmm0
1111 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32>
12 ret <4 x i32> %shuffle
12 ; add forces execution domain
13 %sum = add <4 x i32> %shuffle, %shuffle
14 ret <4 x i32> %sum
1315 }
7171 define <2 x float> @uitofp_2i32_legalized(<2 x i32> %in, <2 x float> %v) {
7272 ; X32-LABEL: uitofp_2i32_legalized:
7373 ; X32: # %bb.0:
74 ; X32-NEXT: pxor %xmm2, %xmm2
75 ; X32-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
76 ; X32-NEXT: movdqa {{.*#+}} xmm0 = [4.503600e+15,4.503600e+15]
77 ; X32-NEXT: por %xmm0, %xmm2
74 ; X32-NEXT: xorps %xmm2, %xmm2
75 ; X32-NEXT: blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
76 ; X32-NEXT: movaps {{.*#+}} xmm0 = [4.503600e+15,4.503600e+15]
77 ; X32-NEXT: orps %xmm0, %xmm2
7878 ; X32-NEXT: subpd %xmm0, %xmm2
7979 ; X32-NEXT: cvtpd2ps %xmm2, %xmm0
8080 ; X32-NEXT: mulps %xmm1, %xmm0
8282 ;
8383 ; X64-LABEL: uitofp_2i32_legalized:
8484 ; X64: # %bb.0:
85 ; X64-NEXT: pxor %xmm2, %xmm2
86 ; X64-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
87 ; X64-NEXT: movdqa {{.*#+}} xmm0 = [4.503600e+15,4.503600e+15]
88 ; X64-NEXT: por %xmm0, %xmm2
85 ; X64-NEXT: xorps %xmm2, %xmm2
86 ; X64-NEXT: blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
87 ; X64-NEXT: movaps {{.*#+}} xmm0 = [4.503600e+15,4.503600e+15]
88 ; X64-NEXT: orps %xmm0, %xmm2
8989 ; X64-NEXT: subpd %xmm0, %xmm2
9090 ; X64-NEXT: cvtpd2ps %xmm2, %xmm0
9191 ; X64-NEXT: mulps %xmm1, %xmm0
439439 ;
440440 ; X64AVX2-LABEL: elt1_v8f64:
441441 ; X64AVX2: # %bb.0:
442 ; X64AVX2-NEXT: vmovapd {{.*#+}} ymm1 = <42,u,2,3>
443 ; X64AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
444 ; X64AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
442 ; X64AVX2-NEXT: vmovaps {{.*#+}} ymm1 = <42,u,2,3>
443 ; X64AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
444 ; X64AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
445445 ; X64AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00]
446446 ; X64AVX2-NEXT: retq
447447 ;
7676 ; AVX1: # %bb.0:
7777 ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
7878 ; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
79 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3]
79 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
8080 ; AVX1-NEXT: retq
8181 ;
8282 ; AVX2-LABEL: insert_v4i64_01x3:
2727 ;
2828 ; SSE41-LABEL: insert_v2f64_z1:
2929 ; SSE41: # %bb.0:
30 ; SSE41-NEXT: xorpd %xmm1, %xmm1
31 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
30 ; SSE41-NEXT: xorps %xmm1, %xmm1
31 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
3232 ; SSE41-NEXT: retq
3333 ;
3434 ; AVX-LABEL: insert_v2f64_z1:
3535 ; AVX: # %bb.0:
36 ; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
37 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
36 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
37 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
3838 ; AVX-NEXT: retq
3939 %1 = insertelement <2 x double> %a, double 0.0, i32 0
4040 ret <2 x double> %1
6565 ; SSE41-LABEL: insert_v4f64_0zz3:
6666 ; SSE41: # %bb.0:
6767 ; SSE41-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
68 ; SSE41-NEXT: xorpd %xmm2, %xmm2
69 ; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
68 ; SSE41-NEXT: xorps %xmm2, %xmm2
69 ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
7070 ; SSE41-NEXT: retq
7171 ;
7272 ; AVX-LABEL: insert_v4f64_0zz3:
7373 ; AVX: # %bb.0:
74 ; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
75 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
74 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
75 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
7676 ; AVX-NEXT: retq
7777 %1 = insertelement <4 x double> %a, double 0.0, i32 1
7878 %2 = insertelement <4 x double> %1, double 0.0, i32 2
100100 ;
101101 ; SSE41-LABEL: insert_v2i64_z1:
102102 ; SSE41: # %bb.0:
103 ; SSE41-NEXT: pxor %xmm1, %xmm1
104 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
105 ; SSE41-NEXT: retq
106 ;
107 ; AVX1-LABEL: insert_v2i64_z1:
108 ; AVX1: # %bb.0:
109 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
110 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
111 ; AVX1-NEXT: retq
112 ;
113 ; AVX2-LABEL: insert_v2i64_z1:
114 ; AVX2: # %bb.0:
115 ; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
116 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
117 ; AVX2-NEXT: retq
103 ; SSE41-NEXT: xorps %xmm1, %xmm1
104 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
105 ; SSE41-NEXT: retq
106 ;
107 ; AVX-LABEL: insert_v2i64_z1:
108 ; AVX: # %bb.0:
109 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
110 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
111 ; AVX-NEXT: retq
118112 %1 = insertelement <2 x i64> %a, i64 0, i32 0
119113 ret <2 x i64> %1
120114 }
140134 ;
141135 ; SSE41-LABEL: insert_v4i64_01z3:
142136 ; SSE41: # %bb.0:
143 ; SSE41-NEXT: pxor %xmm2, %xmm2
144 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
145 ; SSE41-NEXT: retq
146 ;
147 ; AVX1-LABEL: insert_v4i64_01z3:
148 ; AVX1: # %bb.0:
149 ; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
150 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3]
151 ; AVX1-NEXT: retq
152 ;
153 ; AVX2-LABEL: insert_v4i64_01z3:
154 ; AVX2: # %bb.0:
155 ; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
156 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
157 ; AVX2-NEXT: retq
137 ; SSE41-NEXT: xorps %xmm2, %xmm2
138 ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
139 ; SSE41-NEXT: retq
140 ;
141 ; AVX-LABEL: insert_v4i64_01z3:
142 ; AVX: # %bb.0:
143 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
144 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
145 ; AVX-NEXT: retq
158146 %1 = insertelement <4 x i64> %a, i64 0, i32 2
159147 ret <4 x i64> %1
160148 }
262250 ;
263251 ; SSE41-LABEL: insert_v4i32_01z3:
264252 ; SSE41: # %bb.0:
265 ; SSE41-NEXT: pxor %xmm1, %xmm1
266 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
267 ; SSE41-NEXT: retq
268 ;
269 ; AVX1-LABEL: insert_v4i32_01z3:
270 ; AVX1: # %bb.0:
271 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
272 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
273 ; AVX1-NEXT: retq
274 ;
275 ; AVX2-LABEL: insert_v4i32_01z3:
276 ; AVX2: # %bb.0:
277 ; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
278 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
279 ; AVX2-NEXT: retq
253 ; SSE41-NEXT: xorps %xmm1, %xmm1
254 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
255 ; SSE41-NEXT: retq
256 ;
257 ; AVX-LABEL: insert_v4i32_01z3:
258 ; AVX: # %bb.0:
259 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
260 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
261 ; AVX-NEXT: retq
280262 %1 = insertelement <4 x i32> %a, i32 0, i32 2
281263 ret <4 x i32> %1
282264 }
311293 ;
312294 ; SSE41-LABEL: insert_v8i32_z12345z7:
313295 ; SSE41: # %bb.0:
314 ; SSE41-NEXT: pxor %xmm2, %xmm2
315 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5,6,7]
316 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7]
296 ; SSE41-NEXT: xorps %xmm2, %xmm2
297 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
298 ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3]
317299 ; SSE41-NEXT: retq
318300 ;
319301 ; AVX-LABEL: insert_v8i32_z12345z7:
834834 ; AVX1: ## %bb.0:
835835 ; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295]
836836 ; AVX1-NEXT: vmaskmovps (%rdi), %xmm1, %xmm1
837 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
837 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
838838 ; AVX1-NEXT: retq
839839 ;
840840 ; AVX2-LABEL: mload_constmask_v4i32:
962962 }
963963
964964 define <4 x i64> @mload_constmask_v4i64(<4 x i64>* %addr, <4 x i64> %dst) {
965 ; AVX1-LABEL: mload_constmask_v4i64:
966 ; AVX1: ## %bb.0:
967 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = mem[0],ymm0[1,2],mem[3]
968 ; AVX1-NEXT: retq
969 ;
970 ; AVX2-LABEL: mload_constmask_v4i64:
971 ; AVX2: ## %bb.0:
972 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2,3,4,5],mem[6,7]
973 ; AVX2-NEXT: retq
965 ; AVX-LABEL: mload_constmask_v4i64:
966 ; AVX: ## %bb.0:
967 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2,3,4,5],mem[6,7]
968 ; AVX-NEXT: retq
974969 ;
975970 ; AVX512F-LABEL: mload_constmask_v4i64:
976971 ; AVX512F: ## %bb.0:
996991 define <8 x double> @mload_constmask_v8f64(<8 x double>* %addr, <8 x double> %dst) {
997992 ; AVX-LABEL: mload_constmask_v8f64:
998993 ; AVX: ## %bb.0:
999 ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],mem[3]
1000 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1,2],ymm0[3]
994 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
995 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6,7]
1001996 ; AVX-NEXT: retq
1002997 ;
1003998 ; AVX512F-LABEL: mload_constmask_v8f64:
128128 define <4 x double> @merge_4f64_f64_34z6(double* %ptr) nounwind uwtable noinline ssp {
129129 ; AVX-LABEL: merge_4f64_f64_34z6:
130130 ; AVX: # %bb.0:
131 ; AVX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
132 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3]
131 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
132 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5],mem[6,7]
133133 ; AVX-NEXT: retq
134134 ;
135135 ; X32-AVX-LABEL: merge_4f64_f64_34z6:
136136 ; X32-AVX: # %bb.0:
137137 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
138 ; X32-AVX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
139 ; X32-AVX-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3]
138 ; X32-AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
139 ; X32-AVX-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5],mem[6,7]
140140 ; X32-AVX-NEXT: retl
141141 %ptr0 = getelementptr inbounds double, double* %ptr, i64 3
142142 %ptr1 = getelementptr inbounds double, double* %ptr, i64 4
261261 ; X32-AVX-LABEL: merge_8f32_2f32_23z5:
262262 ; X32-AVX: # %bb.0:
263263 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
264 ; X32-AVX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
265 ; X32-AVX-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3]
264 ; X32-AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
265 ; X32-AVX-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5],mem[6,7]
266266 ; X32-AVX-NEXT: retl
267267 %ptr0 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 2
268268 %ptr1 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 3
104104 ;
105105 ; AVX1-LABEL: v3i32:
106106 ; AVX1: # %bb.0:
107 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
108 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
109 ; AVX1-NEXT: vpextrd $2, %xmm0, 8(%rdi)
110 ; AVX1-NEXT: vmovq %xmm1, (%rdi)
107 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
108 ; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3]
109 ; AVX1-NEXT: vextractps $2, %xmm0, 8(%rdi)
110 ; AVX1-NEXT: vmovlps %xmm1, (%rdi)
111111 ; AVX1-NEXT: retq
112112 ;
113113 ; AVX2-LABEL: v3i32:
120120 ;
121121 ; XOP-LABEL: v3i32:
122122 ; XOP: # %bb.0:
123 ; XOP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
124 ; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
125 ; XOP-NEXT: vpextrd $2, %xmm0, 8(%rdi)
126 ; XOP-NEXT: vmovq %xmm1, (%rdi)
123 ; XOP-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
124 ; XOP-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3]
125 ; XOP-NEXT: vextractps $2, %xmm0, 8(%rdi)
126 ; XOP-NEXT: vmovlps %xmm1, (%rdi)
127127 ; XOP-NEXT: retq
128128 %r = shufflevector <2 x i32> %a, <2 x i32> %b, <3 x i32>
129129 store <3 x i32> %r, <3 x i32>* %p
664664 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm3[3,3]
665665 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,1]
666666 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
667 ; AVX1-NEXT: vmovapd %xmm0, 32(%rdi)
667 ; AVX1-NEXT: vmovaps %xmm0, 32(%rdi)
668668 ; AVX1-NEXT: vmovaps %ymm2, (%rdi)
669669 ; AVX1-NEXT: vzeroupper
670670 ; AVX1-NEXT: retq
671671 ;
672672 ; AVX2-SLOW-LABEL: v12i32:
673673 ; AVX2-SLOW: # %bb.0:
674 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3]
675 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[3,3,2,3,7,7,6,7]
676 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,3]
677 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
678 ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm3 = <0,4,u,1,5,u,2,6>
679 ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm3, %ymm0
680 ; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1
681 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
682 ; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rdi)
683 ; AVX2-SLOW-NEXT: vmovaps %xmm2, 32(%rdi)
674 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
675 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[3,3,2,3,7,7,6,7]
676 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
677 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
678 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <0,4,u,1,5,u,2,6>
679 ; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm3, %ymm0
680 ; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1
681 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
682 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rdi)
683 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, 32(%rdi)
684684 ; AVX2-SLOW-NEXT: vzeroupper
685685 ; AVX2-SLOW-NEXT: retq
686686 ;
687687 ; AVX2-FAST-LABEL: v12i32:
688688 ; AVX2-FAST: # %bb.0:
689 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <0,4,u,1,5,u,2,6>
690 ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm2
691 ; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm3
692 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
693 ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [3,3,7,7,7,7,6,7]
694 ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm3, %ymm0
695 ; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
696 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
697 ; AVX2-FAST-NEXT: vmovaps %xmm0, 32(%rdi)
698 ; AVX2-FAST-NEXT: vmovaps %ymm2, (%rdi)
689 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,4,u,1,5,u,2,6>
690 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm2
691 ; AVX2-FAST-NEXT: vpbroadcastq %xmm1, %ymm3
692 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
693 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [3,3,7,7,7,7,6,7]
694 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0
695 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
696 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
697 ; AVX2-FAST-NEXT: vmovdqa %xmm0, 32(%rdi)
698 ; AVX2-FAST-NEXT: vmovdqa %ymm2, (%rdi)
699699 ; AVX2-FAST-NEXT: vzeroupper
700700 ; AVX2-FAST-NEXT: retq
701701 ;
710710 ; XOP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm3[3,3]
711711 ; XOP-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,1]
712712 ; XOP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
713 ; XOP-NEXT: vmovapd %xmm0, 32(%rdi)
713 ; XOP-NEXT: vmovaps %xmm0, 32(%rdi)
714714 ; XOP-NEXT: vmovaps %ymm2, (%rdi)
715715 ; XOP-NEXT: vzeroupper
716716 ; XOP-NEXT: retq
13801380 ; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm4[2,3],xmm7[4,5,6,7]
13811381 ; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,3,0,1]
13821382 ; SSE42-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm2[2,3]
1383 ; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm8[6,7]
1383 ; SSE42-NEXT: blendps {{.*#+}} xmm4 = xmm4[0,1,2],xmm8[3]
13841384 ; SSE42-NEXT: movdqa %xmm10, %xmm1
13851385 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5,6,7]
13861386 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,0,1]
14001400 ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,1,0,3]
14011401 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
14021402 ; SSE42-NEXT: movdqu %xmm3, 16(%rsi)
1403 ; SSE42-NEXT: movdqu %xmm4, (%rsi)
1403 ; SSE42-NEXT: movups %xmm4, (%rsi)
14041404 ; SSE42-NEXT: movdqu %xmm5, 16(%rdx)
14051405 ; SSE42-NEXT: movdqu %xmm7, (%rdx)
14061406 ; SSE42-NEXT: movdqu %xmm2, 16(%rcx)
14211421 ; AVX1-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,3,2,1]
14221422 ; AVX1-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,3,2,3]
14231423 ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
1424 ; AVX1-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3]
1424 ; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
14251425 ; AVX1-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm3[2],xmm2[3]
14261426 ; AVX1-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,0,3,2]
14271427 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
14411441 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
14421442 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
14431443 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
1444 ; AVX1-NEXT: vmovupd %ymm4, (%rsi)
1444 ; AVX1-NEXT: vmovups %ymm4, (%rsi)
14451445 ; AVX1-NEXT: vmovups %ymm5, (%rdx)
14461446 ; AVX1-NEXT: vmovups %ymm0, (%rcx)
14471447 ; AVX1-NEXT: vzeroupper
15191519 ; XOP-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,3,2,1]
15201520 ; XOP-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,3,2,3]
15211521 ; XOP-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
1522 ; XOP-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3]
1522 ; XOP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
15231523 ; XOP-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm3[2],xmm2[3]
15241524 ; XOP-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,0,3,2]
15251525 ; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
15391539 ; XOP-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
15401540 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
15411541 ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
1542 ; XOP-NEXT: vmovupd %ymm4, (%rsi)
1542 ; XOP-NEXT: vmovups %ymm4, (%rsi)
15431543 ; XOP-NEXT: vmovups %ymm5, (%rdx)
15441544 ; XOP-NEXT: vmovups %ymm0, (%rcx)
15451545 ; XOP-NEXT: vzeroupper
16731673 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7]
16741674 ; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
16751675 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
1676 ; AVX1-NEXT: vmovupd %ymm0, 32(%rdi)
1677 ; AVX1-NEXT: vmovupd %ymm4, 64(%rdi)
1676 ; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
1677 ; AVX1-NEXT: vmovups %ymm4, 64(%rdi)
16781678 ; AVX1-NEXT: vmovups %ymm3, (%rdi)
16791679 ; AVX1-NEXT: vzeroupper
16801680 ; AVX1-NEXT: retq
17621762 ; XOP-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
17631763 ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
17641764 ; XOP-NEXT: vmovups %ymm0, 32(%rdi)
1765 ; XOP-NEXT: vmovupd %ymm4, 64(%rdi)
1765 ; XOP-NEXT: vmovups %ymm4, 64(%rdi)
17661766 ; XOP-NEXT: vmovups %ymm3, (%rdi)
17671767 ; XOP-NEXT: vzeroupper
17681768 ; XOP-NEXT: retq
99 ; CHECK-LABEL: foo:
1010 ; CHECK: # %bb.0: # %entry
1111 ; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
12 ; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3]
12 ; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3,4,5,6,7]
1313 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
1414 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[2,0]
1515 ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
66 ; X32: # %bb.0: # %BB
77 ; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
88 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
9 ; X32-NEXT: vxorpd %xmm1, %xmm1, %xmm1
10 ; X32-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3]
9 ; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1
10 ; X32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
1111 ; X32-NEXT: movb $1, %al
1212 ; X32-NEXT: .p2align 4, 0x90
1313 ; X32-NEXT: .LBB0_1: # %CF
2121 ; X64: # %bb.0: # %BB
2222 ; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
2323 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
24 ; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1
25 ; X64-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3]
24 ; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
25 ; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
2626 ; X64-NEXT: movb $1, %al
2727 ; X64-NEXT: .p2align 4, 0x90
2828 ; X64-NEXT: .LBB0_1: # %CF
11511151 ; SSE41-NEXT: testb $1, %dil
11521152 ; SSE41-NEXT: jne .LBB63_1
11531153 ; SSE41-NEXT: # %bb.2:
1154 ; SSE41-NEXT: movapd %xmm2, %xmm1
1155 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1154 ; SSE41-NEXT: movaps %xmm2, %xmm1
1155 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
11561156 ; SSE41-NEXT: retq
11571157 ; SSE41-NEXT: .LBB63_1:
11581158 ; SSE41-NEXT: addsd %xmm0, %xmm1
2323 define <2 x double> @test_mm_blend_pd(<2 x double> %a0, <2 x double> %a1) {
2424 ; X32-LABEL: test_mm_blend_pd:
2525 ; X32: # %bb.0:
26 ; X32-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
26 ; X32-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2727 ; X32-NEXT: retl
2828 ;
2929 ; X64-LABEL: test_mm_blend_pd:
3030 ; X64: # %bb.0:
31 ; X64-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
31 ; X64-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
3232 ; X64-NEXT: retq
3333 %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 3>
3434 ret <2 x double> %res
66 define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) {
77 ; CHECK-LABEL: test_x86_sse41_blendpd:
88 ; CHECK: ## %bb.0:
9 ; CHECK-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
9 ; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1010 ; CHECK-NEXT: retl
1111 %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 6) ; <<2 x double>> [#uses=1]
1212 ret <2 x double> %res
563563 define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) {
564564 ; X32-LABEL: i32_shuf_XYZ0:
565565 ; X32: ## %bb.0:
566 ; X32-NEXT: pxor %xmm1, %xmm1
567 ; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
566 ; X32-NEXT: xorps %xmm1, %xmm1
567 ; X32-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
568568 ; X32-NEXT: retl
569569 ;
570570 ; X64-LABEL: i32_shuf_XYZ0:
571571 ; X64: ## %bb.0:
572 ; X64-NEXT: pxor %xmm1, %xmm1
573 ; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
572 ; X64-NEXT: xorps %xmm1, %xmm1
573 ; X64-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
574574 ; X64-NEXT: retq
575575 %vecext = extractelement <4 x i32> %x, i32 0
576576 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
143143 ; X32: # %bb.0:
144144 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
145145 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
146 ; X32-NEXT: vmovupd (%ecx), %xmm0
147 ; X32-NEXT: vxorpd %xmm1, %xmm1, %xmm1
148 ; X32-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
149 ; X32-NEXT: vmovapd %ymm0, (%eax)
146 ; X32-NEXT: vmovups (%ecx), %xmm0
147 ; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1
148 ; X32-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
149 ; X32-NEXT: vmovaps %ymm0, (%eax)
150150 ; X32-NEXT: vzeroupper
151151 ; X32-NEXT: retl
152152 ;
153153 ; X64-LABEL: legal_vzmovl_2i64_4i64:
154154 ; X64: # %bb.0:
155 ; X64-NEXT: vmovupd (%rdi), %xmm0
156 ; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1
157 ; X64-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
158 ; X64-NEXT: vmovapd %ymm0, (%rsi)
155 ; X64-NEXT: vmovups (%rdi), %xmm0
156 ; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
157 ; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
158 ; X64-NEXT: vmovaps %ymm0, (%rsi)
159159 ; X64-NEXT: vzeroupper
160160 ; X64-NEXT: retq
161161 %ld = load <2 x i64>, <2 x i64>* %in, align 8
195195 ; X32: # %bb.0:
196196 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
197197 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
198 ; X32-NEXT: vmovupd (%ecx), %xmm0
199 ; X32-NEXT: vxorpd %xmm1, %xmm1, %xmm1
200 ; X32-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
201 ; X32-NEXT: vmovapd %ymm0, (%eax)
198 ; X32-NEXT: vmovups (%ecx), %xmm0
199 ; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1
200 ; X32-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
201 ; X32-NEXT: vmovaps %ymm0, (%eax)
202202 ; X32-NEXT: vzeroupper
203203 ; X32-NEXT: retl
204204 ;
205205 ; X64-LABEL: legal_vzmovl_2f64_4f64:
206206 ; X64: # %bb.0:
207 ; X64-NEXT: vmovupd (%rdi), %xmm0
208 ; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1
209 ; X64-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
210 ; X64-NEXT: vmovapd %ymm0, (%rsi)
207 ; X64-NEXT: vmovups (%rdi), %xmm0
208 ; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
209 ; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
210 ; X64-NEXT: vmovaps %ymm0, (%rsi)
211211 ; X64-NEXT: vzeroupper
212212 ; X64-NEXT: retq
213213 %ld = load <2 x double>, <2 x double>* %in, align 8
7575 ;
7676 ; SSE41-LABEL: vsel_4xi8:
7777 ; SSE41: # %bb.0: # %entry
78 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
79 ; SSE41-NEXT: retq
80 ;
81 ; AVX1-LABEL: vsel_4xi8:
82 ; AVX1: # %bb.0: # %entry
83 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
84 ; AVX1-NEXT: retq
85 ;
86 ; AVX2-LABEL: vsel_4xi8:
87 ; AVX2: # %bb.0: # %entry
88 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
89 ; AVX2-NEXT: retq
78 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
79 ; SSE41-NEXT: retq
80 ;
81 ; AVX-LABEL: vsel_4xi8:
82 ; AVX: # %bb.0: # %entry
83 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
84 ; AVX-NEXT: retq
9085 entry:
9186 %vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i8> %v1, <4 x i8> %v2
9287 ret <4 x i8> %vsel
109104 ;
110105 ; SSE41-LABEL: vsel_4xi16:
111106 ; SSE41: # %bb.0: # %entry
112 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
113 ; SSE41-NEXT: retq
114 ;
115 ; AVX1-LABEL: vsel_4xi16:
116 ; AVX1: # %bb.0: # %entry
117 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
118 ; AVX1-NEXT: retq
119 ;
120 ; AVX2-LABEL: vsel_4xi16:
121 ; AVX2: # %bb.0: # %entry
122 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
123 ; AVX2-NEXT: retq
107 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
108 ; SSE41-NEXT: retq
109 ;
110 ; AVX-LABEL: vsel_4xi16:
111 ; AVX: # %bb.0: # %entry
112 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
113 ; AVX-NEXT: retq
124114 entry:
125115 %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i16> %v1, <4 x i16> %v2
126116 ret <4 x i16> %vsel
143133 ;
144134 ; SSE41-LABEL: vsel_i32:
145135 ; SSE41: # %bb.0: # %entry
146 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
147 ; SSE41-NEXT: retq
148 ;
149 ; AVX1-LABEL: vsel_i32:
150 ; AVX1: # %bb.0: # %entry
151 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
152 ; AVX1-NEXT: retq
153 ;
154 ; AVX2-LABEL: vsel_i32:
155 ; AVX2: # %bb.0: # %entry
156 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
157 ; AVX2-NEXT: retq
136 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
137 ; SSE41-NEXT: retq
138 ;
139 ; AVX-LABEL: vsel_i32:
140 ; AVX: # %bb.0: # %entry
141 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
142 ; AVX-NEXT: retq
158143 entry:
159144 %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> %v1, <4 x i32> %v2
160145 ret <4 x i32> %vsel
175160 ;
176161 ; SSE41-LABEL: vsel_double:
177162 ; SSE41: # %bb.0: # %entry
178 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
163 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
179164 ; SSE41-NEXT: retq
180165 ;
181166 ; AVX-LABEL: vsel_double:
182167 ; AVX: # %bb.0: # %entry
183 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
168 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
184169 ; AVX-NEXT: retq
185170 entry:
186171 %vsel = select <2 x i1> <i1 true, i1 false>, <2 x double> %v1, <2 x double> %v2
202187 ;
203188 ; SSE41-LABEL: vsel_i64:
204189 ; SSE41: # %bb.0: # %entry
205 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
206 ; SSE41-NEXT: retq
207 ;
208 ; AVX1-LABEL: vsel_i64:
209 ; AVX1: # %bb.0: # %entry
210 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
211 ; AVX1-NEXT: retq
212 ;
213 ; AVX2-LABEL: vsel_i64:
214 ; AVX2: # %bb.0: # %entry
215 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
216 ; AVX2-NEXT: retq
190 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
191 ; SSE41-NEXT: retq
192 ;
193 ; AVX-LABEL: vsel_i64:
194 ; AVX: # %bb.0: # %entry
195 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
196 ; AVX-NEXT: retq
217197 entry:
218198 %vsel = select <2 x i1> <i1 true, i1 false>, <2 x i64> %v1, <2 x i64> %v2
219199 ret <2 x i64> %vsel
341321 ;
342322 ; SSE41-LABEL: vsel_i328:
343323 ; SSE41: # %bb.0: # %entry
344 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
345 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3,4,5,6,7]
324 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
325 ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3]
346326 ; SSE41-NEXT: retq
347327 ;
348328 ; AVX-LABEL: vsel_i328:
377357 ;
378358 ; SSE41-LABEL: vsel_double8:
379359 ; SSE41: # %bb.0: # %entry
380 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm4[1]
381 ; SSE41-NEXT: blendpd {{.*#+}} xmm2 = xmm2[0],xmm6[1]
360 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
361 ; SSE41-NEXT: blendps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3]
382362 ; SSE41-NEXT: movaps %xmm5, %xmm1
383363 ; SSE41-NEXT: movaps %xmm7, %xmm3
384364 ; SSE41-NEXT: retq
385365 ;
386366 ; AVX-LABEL: vsel_double8:
387367 ; AVX: # %bb.0: # %entry
388 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3]
389 ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3]
368 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
369 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7]
390370 ; AVX-NEXT: retq
391371 entry:
392372 %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x double> %v1, <8 x double> %v2
416396 ;
417397 ; SSE41-LABEL: vsel_i648:
418398 ; SSE41: # %bb.0: # %entry
419 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
420 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
399 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
400 ; SSE41-NEXT: blendps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3]
421401 ; SSE41-NEXT: movaps %xmm5, %xmm1
422402 ; SSE41-NEXT: movaps %xmm7, %xmm3
423403 ; SSE41-NEXT: retq
424404 ;
425 ; AVX1-LABEL: vsel_i648:
426 ; AVX1: # %bb.0: # %entry
427 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3]
428 ; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3]
429 ; AVX1-NEXT: retq
430 ;
431 ; AVX2-LABEL: vsel_i648:
432 ; AVX2: # %bb.0: # %entry
433 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
434 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7]
435 ; AVX2-NEXT: retq
405 ; AVX-LABEL: vsel_i648:
406 ; AVX: # %bb.0: # %entry
407 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
408 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7]
409 ; AVX-NEXT: retq
436410 entry:
437411 %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i64> %v1, <8 x i64> %v2
438412 ret <8 x i64> %vsel
457431 ;
458432 ; SSE41-LABEL: vsel_double4:
459433 ; SSE41: # %bb.0: # %entry
460 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm2[1]
461 ; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm1[0],xmm3[1]
434 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
435 ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
462436 ; SSE41-NEXT: retq
463437 ;
464438 ; AVX-LABEL: vsel_double4:
465439 ; AVX: # %bb.0: # %entry
466 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
440 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
467441 ; AVX-NEXT: retq
468442 entry:
469443 %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x double> %v1, <4 x double> %v2
567541 ;
568542 ; SSE41-LABEL: constant_blendvpd_avx:
569543 ; SSE41: # %bb.0: # %entry
570 ; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm1[0],xmm3[1]
544 ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
571545 ; SSE41-NEXT: movaps %xmm2, %xmm0
572546 ; SSE41-NEXT: retq
573547 ;
574548 ; AVX-LABEL: constant_blendvpd_avx:
575549 ; AVX: # %bb.0: # %entry
576 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3]
550 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
577551 ; AVX-NEXT: retq
578552 entry:
579553 %select = select <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x double> %xy, <4 x double> %ab
751725 ;
752726 ; SSE41-LABEL: blend_shufflevector_4xdouble:
753727 ; SSE41: # %bb.0: # %entry
754 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm2[1]
728 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
755729 ; SSE41-NEXT: retq
756730 ;
757731 ; AVX-LABEL: blend_shufflevector_4xdouble:
758732 ; AVX: # %bb.0: # %entry
759 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
733 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
760734 ; AVX-NEXT: retq
761735 entry:
762736 %select = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
778752 ;
779753 ; SSE41-LABEL: blend_shufflevector_4xi64:
780754 ; SSE41: # %bb.0: # %entry
781 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
755 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
782756 ; SSE41-NEXT: movaps %xmm3, %xmm1
783757 ; SSE41-NEXT: retq
784758 ;
785 ; AVX1-LABEL: blend_shufflevector_4xi64:
786 ; AVX1: # %bb.0: # %entry
787 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3]
788 ; AVX1-NEXT: retq
789 ;
790 ; AVX2-LABEL: blend_shufflevector_4xi64:
791 ; AVX2: # %bb.0: # %entry
792 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
793 ; AVX2-NEXT: retq
759 ; AVX-LABEL: blend_shufflevector_4xi64:
760 ; AVX: # %bb.0: # %entry
761 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
762 ; AVX-NEXT: retq
794763 entry:
795764 %select = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
796765 ret <4 x i64> %select
7171 ; X32-AVX-NEXT: subl $384, %esp # imm = 0x180
7272 ; X32-AVX-NEXT: movl 40(%ebp), %ecx
7373 ; X32-AVX-NEXT: vbroadcastsd 32(%ebp), %ymm0
74 ; X32-AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
75 ; X32-AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
76 ; X32-AVX-NEXT: vmovapd %ymm1, {{[0-9]+}}(%esp)
77 ; X32-AVX-NEXT: vmovapd %ymm1, {{[0-9]+}}(%esp)
78 ; X32-AVX-NEXT: vmovapd %ymm1, {{[0-9]+}}(%esp)
79 ; X32-AVX-NEXT: vmovapd %ymm0, {{[0-9]+}}(%esp)
80 ; X32-AVX-NEXT: vmovapd %ymm1, {{[0-9]+}}(%esp)
81 ; X32-AVX-NEXT: vmovapd %ymm1, {{[0-9]+}}(%esp)
82 ; X32-AVX-NEXT: vmovapd %ymm1, (%esp)
83 ; X32-AVX-NEXT: vmovapd %ymm0, {{[0-9]+}}(%esp)
74 ; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
75 ; X32-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
76 ; X32-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)
77 ; X32-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)
78 ; X32-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)
79 ; X32-AVX-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp)
80 ; X32-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)
81 ; X32-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)
82 ; X32-AVX-NEXT: vmovaps %ymm1, (%esp)
83 ; X32-AVX-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp)
8484 ; X32-AVX-NEXT: leal (%ecx,%ecx), %eax
8585 ; X32-AVX-NEXT: andl $31, %eax
8686 ; X32-AVX-NEXT: movl 128(%esp,%eax,4), %eax
100100 ; X64-AVX-NEXT: subq $256, %rsp # imm = 0x100
101101 ; X64-AVX-NEXT: # kill: def %edi killed %edi def %rdi
102102 ; X64-AVX-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[3,1,2,3]
103 ; X64-AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
104 ; X64-AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
105 ; X64-AVX-NEXT: vmovapd %ymm1, {{[0-9]+}}(%rsp)
106 ; X64-AVX-NEXT: vmovapd %ymm1, {{[0-9]+}}(%rsp)
107 ; X64-AVX-NEXT: vmovapd %ymm1, (%rsp)
108 ; X64-AVX-NEXT: vmovapd %ymm0, {{[0-9]+}}(%rsp)
103 ; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
104 ; X64-AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
105 ; X64-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
106 ; X64-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
107 ; X64-AVX-NEXT: vmovaps %ymm1, (%rsp)
108 ; X64-AVX-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
109109 ; X64-AVX-NEXT: andl $15, %edi
110110 ; X64-AVX-NEXT: movq (%rsp,%rdi,8), %rax
111111 ; X64-AVX-NEXT: movq %rbp, %rsp
239239 ;
240240 ; SSE41-LABEL: shuffle_v2f64_03:
241241 ; SSE41: # %bb.0:
242 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
242 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
243243 ; SSE41-NEXT: retq
244244 ;
245245 ; AVX1-LABEL: shuffle_v2f64_03:
246246 ; AVX1: # %bb.0:
247 ; AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
247 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
248248 ; AVX1-NEXT: retq
249249 ;
250250 ; AVX2-LABEL: shuffle_v2f64_03:
251251 ; AVX2: # %bb.0:
252 ; AVX2-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
252 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
253253 ; AVX2-NEXT: retq
254254 ;
255255 ; AVX512VL-LABEL: shuffle_v2f64_03:
277277 ;
278278 ; SSE41-LABEL: shuffle_v2f64_21:
279279 ; SSE41: # %bb.0:
280 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
280 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
281281 ; SSE41-NEXT: retq
282282 ;
283283 ; AVX1-LABEL: shuffle_v2f64_21:
284284 ; AVX1: # %bb.0:
285 ; AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
285 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
286286 ; AVX1-NEXT: retq
287287 ;
288288 ; AVX2-LABEL: shuffle_v2f64_21:
289289 ; AVX2: # %bb.0:
290 ; AVX2-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
290 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
291291 ; AVX2-NEXT: retq
292292 ;
293293 ; AVX512VL-LABEL: shuffle_v2f64_21:
388388 ;
389389 ; SSE41-LABEL: shuffle_v2i64_03:
390390 ; SSE41: # %bb.0:
391 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
392 ; SSE41-NEXT: retq
393 ;
394 ; AVX1-LABEL: shuffle_v2i64_03:
395 ; AVX1: # %bb.0:
396 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
397 ; AVX1-NEXT: retq
398 ;
399 ; AVX2-LABEL: shuffle_v2i64_03:
400 ; AVX2: # %bb.0:
401 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
402 ; AVX2-NEXT: retq
403 ;
404 ; AVX512VL-LABEL: shuffle_v2i64_03:
405 ; AVX512VL: # %bb.0:
406 ; AVX512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
407 ; AVX512VL-NEXT: retq
391 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
392 ; SSE41-NEXT: retq
393 ;
394 ; AVX-LABEL: shuffle_v2i64_03:
395 ; AVX: # %bb.0:
396 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
397 ; AVX-NEXT: retq
408398 %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
409399 ret <2 x i64> %shuffle
410400 }
429419 ;
430420 ; SSE41-LABEL: shuffle_v2i64_03_copy:
431421 ; SSE41: # %bb.0:
432 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
433 ; SSE41-NEXT: movdqa %xmm1, %xmm0
434 ; SSE41-NEXT: retq
435 ;
436 ; AVX1-LABEL: shuffle_v2i64_03_copy:
437 ; AVX1: # %bb.0:
438 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm2[4,5,6,7]
439 ; AVX1-NEXT: retq
440 ;
441 ; AVX2-LABEL: shuffle_v2i64_03_copy:
442 ; AVX2: # %bb.0:
443 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm2[2,3]
444 ; AVX2-NEXT: retq
445 ;
446 ; AVX512VL-LABEL: shuffle_v2i64_03_copy:
447 ; AVX512VL: # %bb.0:
448 ; AVX512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm2[2,3]
449 ; AVX512VL-NEXT: retq
422 ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
423 ; SSE41-NEXT: movaps %xmm1, %xmm0
424 ; SSE41-NEXT: retq
425 ;
426 ; AVX-LABEL: shuffle_v2i64_03_copy:
427 ; AVX: # %bb.0:
428 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm2[2,3]
429 ; AVX-NEXT: retq
450430 %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
451431 ret <2 x i64> %shuffle
452432 }
585565 ;
586566 ; SSE41-LABEL: shuffle_v2i64_21:
587567 ; SSE41: # %bb.0:
588 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
589 ; SSE41-NEXT: retq
590 ;
591 ; AVX1-LABEL: shuffle_v2i64_21:
592 ; AVX1: # %bb.0:
593 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
594 ; AVX1-NEXT: retq
595 ;
596 ; AVX2-LABEL: shuffle_v2i64_21:
597 ; AVX2: # %bb.0:
598 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
599 ; AVX2-NEXT: retq
600 ;
601 ; AVX512VL-LABEL: shuffle_v2i64_21:
602 ; AVX512VL: # %bb.0:
603 ; AVX512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
604 ; AVX512VL-NEXT: retq
568 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
569 ; SSE41-NEXT: retq
570 ;
571 ; AVX-LABEL: shuffle_v2i64_21:
572 ; AVX: # %bb.0:
573 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
574 ; AVX-NEXT: retq
605575 %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 1>
606576 ret <2 x i64> %shuffle
607577 }
626596 ;
627597 ; SSE41-LABEL: shuffle_v2i64_21_copy:
628598 ; SSE41: # %bb.0:
629 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
630 ; SSE41-NEXT: movdqa %xmm1, %xmm0
631 ; SSE41-NEXT: retq
632 ;
633 ; AVX1-LABEL: shuffle_v2i64_21_copy:
634 ; AVX1: # %bb.0:
635 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm1[4,5,6,7]
636 ; AVX1-NEXT: retq
637 ;
638 ; AVX2-LABEL: shuffle_v2i64_21_copy:
639 ; AVX2: # %bb.0:
640 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm1[2,3]
641 ; AVX2-NEXT: retq
642 ;
643 ; AVX512VL-LABEL: shuffle_v2i64_21_copy:
644 ; AVX512VL: # %bb.0:
645 ; AVX512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm1[2,3]
646 ; AVX512VL-NEXT: retq
599 ; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
600 ; SSE41-NEXT: movaps %xmm1, %xmm0
601 ; SSE41-NEXT: retq
602 ;
603 ; AVX-LABEL: shuffle_v2i64_21_copy:
604 ; AVX: # %bb.0:
605 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm1[2,3]
606 ; AVX-NEXT: retq
647607 %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 1>
648608 ret <2 x i64> %shuffle
649609 }
801761 ;
802762 ; SSE41-LABEL: shuffle_v2i64_z1:
803763 ; SSE41: # %bb.0:
804 ; SSE41-NEXT: pxor %xmm1, %xmm1
805 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
764 ; SSE41-NEXT: xorps %xmm1, %xmm1
765 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
806766 ; SSE41-NEXT: retq
807767 ;
808768 ; AVX1-LABEL: shuffle_v2i64_z1:
809769 ; AVX1: # %bb.0:
810 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
811 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
770 ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
771 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
812772 ; AVX1-NEXT: retq
813773 ;
814774 ; AVX2-LABEL: shuffle_v2i64_z1:
918878 ;
919879 ; SSE41-LABEL: shuffle_v2f64_z1:
920880 ; SSE41: # %bb.0:
921 ; SSE41-NEXT: xorpd %xmm1, %xmm1
922 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
881 ; SSE41-NEXT: xorps %xmm1, %xmm1
882 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
923883 ; SSE41-NEXT: retq
924884 ;
925885 ; AVX1-LABEL: shuffle_v2f64_z1:
926886 ; AVX1: # %bb.0:
927 ; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
928 ; AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
887 ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
888 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
929889 ; AVX1-NEXT: retq
930890 ;
931891 ; AVX2-LABEL: shuffle_v2f64_z1:
932892 ; AVX2: # %bb.0:
933 ; AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1
934 ; AVX2-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
893 ; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
894 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
935895 ; AVX2-NEXT: retq
936896 ;
937897 ; AVX512VL-LABEL: shuffle_v2f64_z1:
938898 ; AVX512VL: # %bb.0:
939899 ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
940 ; AVX512VL-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
900 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
941901 ; AVX512VL-NEXT: retq
942902 %shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 2, i32 1>
943903 ret <2 x double> %shuffle
992952 ;
993953 ; SSE41-LABEL: shuffle_v2i64_bitcast_z123:
994954 ; SSE41: # %bb.0:
995 ; SSE41-NEXT: pxor %xmm1, %xmm1
996 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
955 ; SSE41-NEXT: xorps %xmm1, %xmm1
956 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
997957 ; SSE41-NEXT: retq
998958 ;
999959 ; AVX1-LABEL: shuffle_v2i64_bitcast_z123:
1000960 ; AVX1: # %bb.0:
1001 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
1002 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
961 ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
962 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1003963 ; AVX1-NEXT: retq
1004964 ;
1005965 ; AVX2-LABEL: shuffle_v2i64_bitcast_z123:
12341194 ;
12351195 ; SSE41-LABEL: insert_reg_lo_v2f64:
12361196 ; SSE41: # %bb.0:
1237 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
1197 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
12381198 ; SSE41-NEXT: retq
12391199 ;
12401200 ; AVX1-LABEL: insert_reg_lo_v2f64:
12411201 ; AVX1: # %bb.0:
1242 ; AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
1202 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
12431203 ; AVX1-NEXT: retq
12441204 ;
12451205 ; AVX2-LABEL: insert_reg_lo_v2f64:
12461206 ; AVX2: # %bb.0:
1247 ; AVX2-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
1207 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
12481208 ; AVX2-NEXT: retq
12491209 ;
12501210 ; AVX512VL-LABEL: insert_reg_lo_v2f64:
355355 ;
356356 ; AVX1-LABEL: shuffle_v4i32_0124:
357357 ; AVX1: # %bb.0:
358 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
359 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
358 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0]
359 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
360360 ; AVX1-NEXT: retq
361361 ;
362362 ; AVX2OR512VL-LABEL: shuffle_v4i32_0124:
395395 ;
396396 ; AVX1-LABEL: shuffle_v4i32_0142:
397397 ; AVX1: # %bb.0:
398 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
399 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
400 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
398 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
399 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,2]
400 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
401401 ; AVX1-NEXT: retq
402402 ;
403403 ; AVX2OR512VL-LABEL: shuffle_v4i32_0142:
440440 ;
441441 ; AVX1-LABEL: shuffle_v4i32_0412:
442442 ; AVX1: # %bb.0:
443 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
444 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,2]
445 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
443 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
444 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2]
445 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
446446 ; AVX1-NEXT: retq
447447 ;
448448 ; AVX2OR512VL-LABEL: shuffle_v4i32_0412:
482482 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
483483 ; SSE41-NEXT: retq
484484 ;
485 ; AVX1-LABEL: shuffle_v4i32_4012:
486 ; AVX1: # %bb.0:
487 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,2]
488 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
489 ; AVX1-NEXT: retq
490 ;
491 ; AVX2OR512VL-LABEL: shuffle_v4i32_4012:
492 ; AVX2OR512VL: # %bb.0:
493 ; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,2]
494 ; AVX2OR512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
495 ; AVX2OR512VL-NEXT: retq
485 ; AVX-LABEL: shuffle_v4i32_4012:
486 ; AVX: # %bb.0:
487 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,2]
488 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
489 ; AVX-NEXT: retq
496490 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
497491 ret <4 x i32> %shuffle
498492 }
537531 ;
538532 ; AVX1-LABEL: shuffle_v4i32_0451:
539533 ; AVX1: # %bb.0:
540 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
541 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
542 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
534 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
535 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
536 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
543537 ; AVX1-NEXT: retq
544538 ;
545539 ; AVX2OR512VL-LABEL: shuffle_v4i32_0451:
593587 ;
594588 ; AVX1-LABEL: shuffle_v4i32_4015:
595589 ; AVX1: # %bb.0:
596 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
597 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
598 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
590 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
591 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
592 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
599593 ; AVX1-NEXT: retq
600594 ;
601595 ; AVX2OR512VL-LABEL: shuffle_v4i32_4015:
11901184 ;
11911185 ; SSE41-LABEL: shuffle_v4i32_4zzz:
11921186 ; SSE41: # %bb.0:
1193 ; SSE41-NEXT: pxor %xmm1, %xmm1
1194 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1187 ; SSE41-NEXT: xorps %xmm1, %xmm1
1188 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
11951189 ; SSE41-NEXT: retq
11961190 ;
11971191 ; AVX1OR2-LABEL: shuffle_v4i32_4zzz:
11981192 ; AVX1OR2: # %bb.0:
1199 ; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1
1200 ; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1193 ; AVX1OR2-NEXT: vxorps %xmm1, %xmm1, %xmm1
1194 ; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
12011195 ; AVX1OR2-NEXT: retq
12021196 ;
12031197 ; AVX512VL-LABEL: shuffle_v4i32_4zzz:
12401234 ;
12411235 ; AVX1-LABEL: shuffle_v4i32_z4zz:
12421236 ; AVX1: # %bb.0:
1243 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
1244 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1245 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
1237 ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
1238 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1239 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,1,1]
12461240 ; AVX1-NEXT: retq
12471241 ;
12481242 ; AVX2-SLOW-LABEL: shuffle_v4i32_z4zz:
12491243 ; AVX2-SLOW: # %bb.0:
1250 ; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
1251 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1252 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
1244 ; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1
1245 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1246 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,1,1]
12531247 ; AVX2-SLOW-NEXT: retq
12541248 ;
12551249 ; AVX2-FAST-LABEL: shuffle_v4i32_z4zz:
12961290 ;
12971291 ; AVX1-LABEL: shuffle_v4i32_zz4z:
12981292 ; AVX1: # %bb.0:
1299 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
1300 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1301 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
1293 ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
1294 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1295 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,1]
13021296 ; AVX1-NEXT: retq
13031297 ;
13041298 ; AVX2-SLOW-LABEL: shuffle_v4i32_zz4z:
13051299 ; AVX2-SLOW: # %bb.0:
1306 ; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
1307 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1308 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
1300 ; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1
1301 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1302 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,1]
13091303 ; AVX2-SLOW-NEXT: retq
13101304 ;
13111305 ; AVX2-FAST-LABEL: shuffle_v4i32_zz4z:
13661360 ;
13671361 ; AVX1-LABEL: shuffle_v4i32_z6zz:
13681362 ; AVX1: # %bb.0:
1369 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
1370 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
1371 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
1363 ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
1364 ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
1365 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
13721366 ; AVX1-NEXT: retq
13731367 ;
13741368 ; AVX2-SLOW-LABEL: shuffle_v4i32_z6zz:
15791573 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5,6,7]
15801574 ; SSE41-NEXT: retq
15811575 ;
1582 ; AVX1-LABEL: shuffle_v4i32_2456:
1583 ; AVX1: # %bb.0:
1584 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1585 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,2]
1586 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1587 ; AVX1-NEXT: retq
1588 ;
1589 ; AVX2OR512VL-LABEL: shuffle_v4i32_2456:
1590 ; AVX2OR512VL: # %bb.0:
1591 ; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
1592 ; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,2]
1593 ; AVX2OR512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1594 ; AVX2OR512VL-NEXT: retq
1576 ; AVX-LABEL: shuffle_v4i32_2456:
1577 ; AVX: # %bb.0:
1578 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
1579 ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,2]
1580 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1581 ; AVX-NEXT: retq
15951582 %s1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32>
15961583 %s2 = shufflevector <4 x i32> %s1, <4 x i32> %b, <4 x i32>
15971584 ret <4 x i32> %s2
17381725 ;
17391726 ; SSE41-LABEL: shuffle_v4i32_0z23:
17401727 ; SSE41: # %bb.0:
1741 ; SSE41-NEXT: pxor %xmm1, %xmm1
1742 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
1743 ; SSE41-NEXT: retq
1744 ;
1745 ; AVX1-LABEL: shuffle_v4i32_0z23:
1746 ; AVX1: # %bb.0:
1747 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
1748 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
1749 ; AVX1-NEXT: retq
1750 ;
1751 ; AVX2OR512VL-LABEL: shuffle_v4i32_0z23:
1752 ; AVX2OR512VL: # %bb.0:
1753 ; AVX2OR512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
1754 ; AVX2OR512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1755 ; AVX2OR512VL-NEXT: retq
1728 ; SSE41-NEXT: xorps %xmm1, %xmm1
1729 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1730 ; SSE41-NEXT: retq
1731 ;
1732 ; AVX-LABEL: shuffle_v4i32_0z23:
1733 ; AVX: # %bb.0:
1734 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
1735 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1736 ; AVX-NEXT: retq
17561737 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32>
17571738 ret <4 x i32> %shuffle
17581739 }
17751756 ;
17761757 ; SSE41-LABEL: shuffle_v4i32_01z3:
17771758 ; SSE41: # %bb.0:
1778 ; SSE41-NEXT: pxor %xmm1, %xmm1
1779 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
1780 ; SSE41-NEXT: retq
1781 ;
1782 ; AVX1-LABEL: shuffle_v4i32_01z3:
1783 ; AVX1: # %bb.0:
1784 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
1785 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
1786 ; AVX1-NEXT: retq
1787 ;
1788 ; AVX2OR512VL-LABEL: shuffle_v4i32_01z3:
1789 ; AVX2OR512VL: # %bb.0:
1790 ; AVX2OR512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
1791 ; AVX2OR512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
1792 ; AVX2OR512VL-NEXT: retq
1759 ; SSE41-NEXT: xorps %xmm1, %xmm1
1760 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
1761 ; SSE41-NEXT: retq
1762 ;
1763 ; AVX-LABEL: shuffle_v4i32_01z3:
1764 ; AVX: # %bb.0:
1765 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
1766 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
1767 ; AVX-NEXT: retq
17931768 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32>
17941769 ret <4 x i32> %shuffle
17951770 }
18121787 ;
18131788 ; SSE41-LABEL: shuffle_v4i32_012z:
18141789 ; SSE41: # %bb.0:
1815 ; SSE41-NEXT: pxor %xmm1, %xmm1
1816 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
1817 ; SSE41-NEXT: retq
1818 ;
1819 ; AVX1-LABEL: shuffle_v4i32_012z:
1820 ; AVX1: # %bb.0:
1821 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
1822 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
1823 ; AVX1-NEXT: retq
1824 ;
1825 ; AVX2OR512VL-LABEL: shuffle_v4i32_012z:
1826 ; AVX2OR512VL: # %bb.0:
1827 ; AVX2OR512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
1828 ; AVX2OR512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
1829 ; AVX2OR512VL-NEXT: retq
1790 ; SSE41-NEXT: xorps %xmm1, %xmm1
1791 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
1792 ; SSE41-NEXT: retq
1793 ;
1794 ; AVX-LABEL: shuffle_v4i32_012z:
1795 ; AVX: # %bb.0:
1796 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
1797 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
1798 ; AVX-NEXT: retq
18301799 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32>
18311800 ret <4 x i32> %shuffle
18321801 }
18491818 ;
18501819 ; SSE41-LABEL: shuffle_v4i32_0zz3:
18511820 ; SSE41: # %bb.0:
1852 ; SSE41-NEXT: pxor %xmm1, %xmm1
1853 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
1854 ; SSE41-NEXT: retq
1855 ;
1856 ; AVX1-LABEL: shuffle_v4i32_0zz3:
1857 ; AVX1: # %bb.0:
1858 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
1859 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
1860 ; AVX1-NEXT: retq
1861 ;
1862 ; AVX2OR512VL-LABEL: shuffle_v4i32_0zz3:
1863 ; AVX2OR512VL: # %bb.0:
1864 ; AVX2OR512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
1865 ; AVX2OR512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
1866 ; AVX2OR512VL-NEXT: retq
1821 ; SSE41-NEXT: xorps %xmm1, %xmm1
1822 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
1823 ; SSE41-NEXT: retq
1824 ;
1825 ; AVX-LABEL: shuffle_v4i32_0zz3:
1826 ; AVX: # %bb.0:
1827 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
1828 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
1829 ; AVX-NEXT: retq
18671830 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32>
18681831 ret <4 x i32> %shuffle
18691832 }
20301993 ;
20311994 ; SSE41-LABEL: mask_v4i32_0127:
20321995 ; SSE41: # %bb.0:
2033 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
2034 ; SSE41-NEXT: retq
2035 ;
2036 ; AVX1-LABEL: mask_v4i32_0127:
2037 ; AVX1: # %bb.0:
2038 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
2039 ; AVX1-NEXT: retq
2040 ;
2041 ; AVX2OR512VL-LABEL: mask_v4i32_0127:
2042 ; AVX2OR512VL: # %bb.0:
2043 ; AVX2OR512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
2044 ; AVX2OR512VL-NEXT: retq
1996 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
1997 ; SSE41-NEXT: retq
1998 ;
1999 ; AVX-LABEL: mask_v4i32_0127:
2000 ; AVX: # %bb.0:
2001 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
2002 ; AVX-NEXT: retq
20452003 %1 = bitcast <4 x i32> %a to <2 x i64>
20462004 %2 = bitcast <4 x i32> %b to <2 x i64>
20472005 %3 = and <2 x i64> %1,
22332191 ;
22342192 ; SSE41-LABEL: insert_mem_lo_v4i32:
22352193 ; SSE41: # %bb.0:
2236 ; SSE41-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
2237 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
2238 ; SSE41-NEXT: retq
2239 ;
2240 ; AVX1-LABEL: insert_mem_lo_v4i32:
2241 ; AVX1: # %bb.0:
2242 ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
2243 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
2244 ; AVX1-NEXT: retq
2245 ;
2246 ; AVX2OR512VL-LABEL: insert_mem_lo_v4i32:
2247 ; AVX2OR512VL: # %bb.0:
2248 ; AVX2OR512VL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
2249 ; AVX2OR512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2250 ; AVX2OR512VL-NEXT: retq
2194 ; SSE41-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
2195 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2196 ; SSE41-NEXT: retq
2197 ;
2198 ; AVX-LABEL: insert_mem_lo_v4i32:
2199 ; AVX: # %bb.0:
2200 ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
2201 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2202 ; AVX-NEXT: retq
22512203 %a = load <2 x i32>, <2 x i32>* %ptr
22522204 %v = shufflevector <2 x i32> %a, <2 x i32> undef, <4 x i32>
22532205 %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32>
23112263 ;
23122264 ; SSE41-LABEL: insert_reg_lo_v4f32:
23132265 ; SSE41: # %bb.0:
2314 ; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
2266 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
23152267 ; SSE41-NEXT: retq
23162268 ;
23172269 ; AVX1OR2-LABEL: insert_reg_lo_v4f32:
23182270 ; AVX1OR2: # %bb.0:
2319 ; AVX1OR2-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
2271 ; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
23202272 ; AVX1OR2-NEXT: retq
23212273 ;
23222274 ; AVX512VL-LABEL: insert_reg_lo_v4f32:
24922492 ;
24932493 ; SSE41-LABEL: mask_v8i16_012345ef:
24942494 ; SSE41: # %bb.0:
2495 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
2496 ; SSE41-NEXT: retq
2497 ;
2498 ; AVX1-LABEL: mask_v8i16_012345ef:
2499 ; AVX1: # %bb.0:
2500 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
2501 ; AVX1-NEXT: retq
2502 ;
2503 ; AVX2OR512VL-LABEL: mask_v8i16_012345ef:
2504 ; AVX2OR512VL: # %bb.0:
2505 ; AVX2OR512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
2506 ; AVX2OR512VL-NEXT: retq
2495 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
2496 ; SSE41-NEXT: retq
2497 ;
2498 ; AVX-LABEL: mask_v8i16_012345ef:
2499 ; AVX: # %bb.0:
2500 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
2501 ; AVX-NEXT: retq
25072502 %1 = bitcast <8 x i16> %a to <2 x i64>
25082503 %2 = bitcast <8 x i16> %b to <2 x i64>
25092504 %3 = and <2 x i64> %1,
886886 }
887887
888888 define <16 x i16> @shuffle_v16i16_16_17_18_19_04_05_06_07_24_25_26_27_12_13_14_15(<16 x i16> %a, <16 x i16> %b) {
889 ; AVX1-LABEL: shuffle_v16i16_16_17_18_19_04_05_06_07_24_25_26_27_12_13_14_15:
890 ; AVX1: # %bb.0:
891 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3]
892 ; AVX1-NEXT: retq
893 ;
894 ; AVX2OR512VL-LABEL: shuffle_v16i16_16_17_18_19_04_05_06_07_24_25_26_27_12_13_14_15:
895 ; AVX2OR512VL: # %bb.0:
896 ; AVX2OR512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
897 ; AVX2OR512VL-NEXT: retq
889 ; ALL-LABEL: shuffle_v16i16_16_17_18_19_04_05_06_07_24_25_26_27_12_13_14_15:
890 ; ALL: # %bb.0:
891 ; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
892 ; ALL-NEXT: retq
898893 %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32>
899894 ret <16 x i16> %shuffle
900895 }
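In this 256-bit hunk the separate AVX1 vblendpd and AVX2OR512VL vblendps checks were folded into a single ALL check using vblendps; each f64 lane choice simply becomes two identical f32 lane choices. A minimal sketch of that immediate widening (not part of the test file), using a hypothetical per-double mask value because the test comments show lanes rather than encodings:

// Widen a per-f64 blend immediate to the equivalent per-f32 immediate by
// duplicating each selection bit, e.g. 0xa (doubles 1,3) -> 0xcc (floats
// 2,3,6,7), so the vblendpd and vblendps checks describe the same bytes.
#include <cstdio>

int main() {
  unsigned PdMask = 0xA; // hypothetical: take doubles 1 and 3 from the source
  unsigned PsMask = 0;
  for (unsigned i = 0; i != 4; ++i)
    if (PdMask & (1u << i))
      PsMask |= 0x3u << (2 * i);
  printf("vblendps immediate: 0x%x\n", PsMask); // prints 0xcc
}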
31123107 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,2,3,4,5,6,7]
31133108 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
31143109 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4,5,6,7]
3115 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
3110 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
31163111 ; AVX1-NEXT: retq
31173112 ;
31183113 ; AVX2OR512VL-LABEL: shuffle_v16i16_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15:
44684463 }
44694464
44704465 define <16 x i16> @concat_v16i16_0_1_2_3_4_5_6_7_24_25_26_27_28_29_30_31(<16 x i16> %a, <16 x i16> %b) {
4471 ; AVX1-LABEL: concat_v16i16_0_1_2_3_4_5_6_7_24_25_26_27_28_29_30_31:
4472 ; AVX1: # %bb.0:
4473 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
4474 ; AVX1-NEXT: retq
4475 ;
4476 ; AVX2OR512VL-LABEL: concat_v16i16_0_1_2_3_4_5_6_7_24_25_26_27_28_29_30_31:
4477 ; AVX2OR512VL: # %bb.0:
4478 ; AVX2OR512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4479 ; AVX2OR512VL-NEXT: retq
4466 ; ALL-LABEL: concat_v16i16_0_1_2_3_4_5_6_7_24_25_26_27_28_29_30_31:
4467 ; ALL: # %bb.0:
4468 ; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4469 ; ALL-NEXT: retq
44804470 %alo = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32>
44814471 %bhi = shufflevector <16 x i16> %b, <16 x i16> undef, <8 x i32>
44824472 %shuf = shufflevector <8 x i16> %alo, <8 x i16> %bhi, <16 x i32>
12831283 ; AVX1-LABEL: shuffle_v32i8_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15_u6_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31:
12841284 ; AVX1: # %bb.0:
12851285 ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1],zero,xmm0[2],zero,xmm0[4,u,6,7,8,9,10,11,12,13,14,15]
1286 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
1286 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12871287 ; AVX1-NEXT: retq
12881288 ;
12891289 ; AVX2OR512VL-LABEL: shuffle_v32i8_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15_u6_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31:
366366 define <4 x double> @shuffle_v4f64_0527(<4 x double> %a, <4 x double> %b) {
367367 ; ALL-LABEL: shuffle_v4f64_0527:
368368 ; ALL: # %bb.0:
369 ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
369 ; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
370370 ; ALL-NEXT: retq
371371 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32>
372372 ret <4 x double> %shuffle
375375 define <4 x double> @shuffle_v4f64_4163(<4 x double> %a, <4 x double> %b) {
376376 ; ALL-LABEL: shuffle_v4f64_4163:
377377 ; ALL: # %bb.0:
378 ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3]
378 ; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
379379 ; ALL-NEXT: retq
380380 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32>
381381 ret <4 x double> %shuffle
402402 define <4 x double> @shuffle_v4f64_0167(<4 x double> %a, <4 x double> %b) {
403403 ; ALL-LABEL: shuffle_v4f64_0167:
404404 ; ALL: # %bb.0:
405 ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
405 ; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
406406 ; ALL-NEXT: retq
407407 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32>
408408 ret <4 x double> %shuffle
460460 ; AVX2: # %bb.0:
461461 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
462462 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
463 ; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
463 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
464464 ; AVX2-NEXT: retq
465465 ;
466466 ; AVX512VL-LABEL: shuffle_v4f64_0415:
587587 ;
588588 ; AVX2-SLOW-LABEL: shuffle_v4f64_1z2z:
589589 ; AVX2-SLOW: # %bb.0:
590 ; AVX2-SLOW-NEXT: vxorpd %xmm1, %xmm1, %xmm1
591 ; AVX2-SLOW-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
590 ; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1
591 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
592592 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0]
593593 ; AVX2-SLOW-NEXT: retq
594594 ;
600600 ; AVX512VL-SLOW-LABEL: shuffle_v4f64_1z2z:
601601 ; AVX512VL-SLOW: # %bb.0:
602602 ; AVX512VL-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
603 ; AVX512VL-SLOW-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
604 ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0]
603 ; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
604 ; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,2,0]
605605 ; AVX512VL-SLOW-NEXT: retq
606606 ;
607607 ; AVX512VL-FAST-LABEL: shuffle_v4f64_1z2z:
824824 ; AVX1: # %bb.0:
825825 ; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
826826 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
827 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
827 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
828828 ; AVX1-NEXT: retq
829829 ;
830830 ; AVX2-LABEL: shuffle_v4i64_0124:
914914 ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
915915 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
916916 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
917 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
917 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
918918 ; AVX1-NEXT: retq
919919 ;
920920 ; AVX2-LABEL: shuffle_v4i64_4012:
13381338 ; AVX1-LABEL: insert_reg_and_zero_v4f64:
13391339 ; AVX1: # %bb.0:
13401340 ; AVX1-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
1341 ; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
1342 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
1341 ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
1342 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
13431343 ; AVX1-NEXT: retq
13441344 ;
13451345 ; AVX2-LABEL: insert_reg_and_zero_v4f64:
13461346 ; AVX2: # %bb.0:
13471347 ; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
1348 ; AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1
1349 ; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
1348 ; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
1349 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
13501350 ; AVX2-NEXT: retq
13511351 ;
13521352 ; AVX512VL-LABEL: insert_reg_and_zero_v4f64:
15111511 }
15121512
15131513 define <4 x i64> @concat_v4i64_0167(<4 x i64> %a0, <4 x i64> %a1) {
1514 ; AVX1-LABEL: concat_v4i64_0167:
1515 ; AVX1: # %bb.0:
1516 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
1517 ; AVX1-NEXT: retq
1518 ;
1519 ; AVX2-LABEL: concat_v4i64_0167:
1520 ; AVX2: # %bb.0:
1521 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
1522 ; AVX2-NEXT: retq
1523 ;
1524 ; AVX512VL-LABEL: concat_v4i64_0167:
1525 ; AVX512VL: # %bb.0:
1526 ; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
1527 ; AVX512VL-NEXT: retq
1514 ; ALL-LABEL: concat_v4i64_0167:
1515 ; ALL: # %bb.0:
1516 ; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
1517 ; ALL-NEXT: retq
15281518 %a0lo = shufflevector <4 x i64> %a0, <4 x i64> %a1, <2 x i32>
15291519 %a1hi = shufflevector <4 x i64> %a0, <4 x i64> %a1, <2 x i32>
15301520 %shuffle64 = shufflevector <2 x i64> %a0lo, <2 x i64> %a1hi, <4 x i32>
17761766 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm2[0]
17771767 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
17781768 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
1779 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm0[0],xmm4[0]
1780 ; AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3]
1769 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm5 = xmm0[0],xmm4[0]
1770 ; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
17811771 ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
17821772 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
17831773 ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1]
1784 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
1774 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
17851775 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
17861776 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
17871777 ; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1
18291819 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm0[0],xmm2[0]
18301820 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
18311821 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
1832 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm5 = xmm1[0],xmm4[0]
1833 ; AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3]
1822 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm4[0]
1823 ; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
18341824 ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
18351825 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
18361826 ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
1837 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
1827 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
18381828 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
18391829 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
18401830 ; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1
109109 ; AVX1-LABEL: shuffle_v8f32_06000000:
110110 ; AVX1: # %bb.0:
111111 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
112 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
112 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
113113 ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,0,0,4,4,4,4]
114114 ; AVX1-NEXT: retq
115115 ;
126126 ; AVX1-LABEL: shuffle_v8f32_70000000:
127127 ; AVX1: # %bb.0:
128128 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
129 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
129 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
130130 ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,0,0,4,4,4,4]
131131 ; AVX1-NEXT: retq
132132 ;
662662 ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm2[0,0],ymm0[4,7],ymm2[4,4]
663663 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
664664 ; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
665 ; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3]
665 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7]
666666 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7]
667667 ; AVX1-NEXT: retq
668668 ;
829829 define <8 x float> @shuffle_v8f32_3210fedc(<8 x float> %a, <8 x float> %b) {
830830 ; ALL-LABEL: shuffle_v8f32_3210fedc:
831831 ; ALL: # %bb.0:
832 ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
832 ; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
833833 ; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
834834 ; ALL-NEXT: retq
835835 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32>
864864 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
865865 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
866866 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
867 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
867 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
868868 ; AVX1-NEXT: retq
869869 ;
870870 ; AVX2-LABEL: PR21138:
891891 define <8 x float> @shuffle_v8f32_ba987654(<8 x float> %a, <8 x float> %b) {
892892 ; ALL-LABEL: shuffle_v8f32_ba987654:
893893 ; ALL: # %bb.0:
894 ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
894 ; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
895895 ; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
896896 ; ALL-NEXT: retq
897897 %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32>
11051105 ; AVX1-LABEL: shuffle_v8i32_06000000:
11061106 ; AVX1: # %bb.0:
11071107 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
1108 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
1108 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
11091109 ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,0,0,4,4,4,4]
11101110 ; AVX1-NEXT: retq
11111111 ;
11221122 ; AVX1-LABEL: shuffle_v8i32_70000000:
11231123 ; AVX1: # %bb.0:
11241124 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
1125 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
1125 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
11261126 ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,0,0,4,4,4,4]
11271127 ; AVX1-NEXT: retq
11281128 ;
19121912 }
19131913
19141914 define <8 x i32> @shuffle_v8i32_3210fedc(<8 x i32> %a, <8 x i32> %b) {
1915 ; AVX1-LABEL: shuffle_v8i32_3210fedc:
1916 ; AVX1: # %bb.0:
1917 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
1918 ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
1919 ; AVX1-NEXT: retq
1920 ;
1921 ; AVX2OR512VL-LABEL: shuffle_v8i32_3210fedc:
1922 ; AVX2OR512VL: # %bb.0:
1923 ; AVX2OR512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
1924 ; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
1925 ; AVX2OR512VL-NEXT: retq
1915 ; ALL-LABEL: shuffle_v8i32_3210fedc:
1916 ; ALL: # %bb.0:
1917 ; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
1918 ; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
1919 ; ALL-NEXT: retq
19261920 %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32>
19271921 ret <8 x i32> %shuffle
19281922 }
19601954 }
19611955
19621956 define <8 x i32> @shuffle_v8i32_ba987654(<8 x i32> %a, <8 x i32> %b) {
1963 ; AVX1-LABEL: shuffle_v8i32_ba987654:
1964 ; AVX1: # %bb.0:
1965 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
1966 ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
1967 ; AVX1-NEXT: retq
1968 ;
1969 ; AVX2OR512VL-LABEL: shuffle_v8i32_ba987654:
1970 ; AVX2OR512VL: # %bb.0:
1971 ; AVX2OR512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
1972 ; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
1973 ; AVX2OR512VL-NEXT: retq
1957 ; ALL-LABEL: shuffle_v8i32_ba987654:
1958 ; ALL: # %bb.0:
1959 ; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
1960 ; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
1961 ; ALL-NEXT: retq
19741962 %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32>
19751963 ret <8 x i32> %shuffle
19761964 }
19771965
19781966 define <8 x i32> @shuffle_v8i32_ba983210(<8 x i32> %a, <8 x i32> %b) {
1979 ; AVX1-LABEL: shuffle_v8i32_ba983210:
1980 ; AVX1: # %bb.0:
1981 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
1982 ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
1983 ; AVX1-NEXT: retq
1984 ;
1985 ; AVX2OR512VL-LABEL: shuffle_v8i32_ba983210:
1986 ; AVX2OR512VL: # %bb.0:
1987 ; AVX2OR512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
1988 ; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
1989 ; AVX2OR512VL-NEXT: retq
1967 ; ALL-LABEL: shuffle_v8i32_ba983210:
1968 ; ALL: # %bb.0:
1969 ; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
1970 ; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
1971 ; ALL-NEXT: retq
19901972 %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32>
19911973 ret <8 x i32> %shuffle
19921974 }
23122294 }
23132295
23142296 define <8 x i32> @concat_v8i32_0123CDEF(<8 x i32> %a, <8 x i32> %b) {
2315 ; AVX1-LABEL: concat_v8i32_0123CDEF:
2316 ; AVX1: # %bb.0:
2317 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
2318 ; AVX1-NEXT: retq
2319 ;
2320 ; AVX2OR512VL-LABEL: concat_v8i32_0123CDEF:
2321 ; AVX2OR512VL: # %bb.0:
2322 ; AVX2OR512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2323 ; AVX2OR512VL-NEXT: retq
2297 ; ALL-LABEL: concat_v8i32_0123CDEF:
2298 ; ALL: # %bb.0:
2299 ; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2300 ; ALL-NEXT: retq
23242301 %alo = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32>
23252302 %bhi = shufflevector <8 x i32> %b, <8 x i32> undef, <4 x i32>
23262303 %shuf = shufflevector <4 x i32> %alo, <4 x i32> %bhi, <8 x i32>
24272404 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
24282405 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
24292406 ; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm0[0,2],xmm4[0,2]
2430 ; AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3]
2407 ; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
24312408 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
24322409 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
24332410 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3]
2434 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
2411 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
24352412 ; AVX1-NEXT: vaddps %ymm0, %ymm3, %ymm0
24362413 ; AVX1-NEXT: retq
24372414 ;
24762453 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
24772454 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
24782455 ; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm1[0,2],xmm4[0,2]
2479 ; AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3]
2456 ; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
24802457 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
24812458 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
24822459 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm4[1,3]
2483 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
2460 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
24842461 ; AVX1-NEXT: vaddps %ymm0, %ymm3, %ymm0
24852462 ; AVX1-NEXT: retq
24862463 ;
25252502 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
25262503 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
25272504 ; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm0[0,2],xmm4[0,2]
2528 ; AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3]
2505 ; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
25292506 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
25302507 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
25312508 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3]
2532 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
2509 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
25332510 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
25342511 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
25352512 ; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
25782555 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
25792556 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
25802557 ; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm1[0,2],xmm4[0,2]
2581 ; AVX1-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3]
2558 ; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
25822559 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
25832560 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
25842561 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm4[1,3]
2585 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
2562 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
25862563 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
25872564