commit bda8c85
[AVX-512] Simplify X86InstrInfo::copyPhysReg for 128/256-bit vectors with AVX512, but not VLX.

We should use the VEX opcodes and trust the register allocator not to use the extended XMM/YMM register space. Previously we extended such copies to the whole ZMM register. The register allocator shouldn't use XMM16-31 or YMM16-31 in this configuration, as the instructions to spill them aren't available.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@280648 91177308-0d34-0410-b5e6-96231b3b80d8

Author: Craig Topper
14 changed files with 112 additions and 168 deletions.
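Before the diff, a condensed sketch of the selection rule this change puts in place may be easier to follow. This is only an illustration, not the LLVM code: copyOpcodeForWidth and the Opcode enum are hypothetical stand-ins for X86's generated opcode enum, and the real copyPhysReg dispatches on register classes (VR128X/VR256X/VR512) rather than an explicit bit width.

#include <cassert>

// Stand-in opcode tags; the real values live in X86's generated opcode enum.
enum Opcode {
  MOVAPSrr,      // SSE,  128-bit
  VMOVAPSrr,     // VEX,  128-bit
  VMOVAPSYrr,    // VEX,  256-bit
  VMOVAPSZ128rr, // EVEX, 128-bit (requires VLX)
  VMOVAPSZ256rr, // EVEX, 256-bit (requires VLX)
  VMOVAPSZrr,    // EVEX, 512-bit
};

// After this commit, a 128/256-bit register-to-register copy without VLX
// falls back to the VEX (or SSE) encoding instead of widening both operands
// to ZMM; the register allocator is trusted never to hand out XMM16-31 or
// YMM16-31 in that configuration, since the instructions to spill them
// aren't available.
Opcode copyOpcodeForWidth(unsigned Bits, bool HasAVX, bool HasVLX) {
  if (Bits == 128)
    return HasVLX ? VMOVAPSZ128rr : HasAVX ? VMOVAPSrr : MOVAPSrr;
  if (Bits == 256)
    return HasVLX ? VMOVAPSZ256rr : VMOVAPSYrr;
  assert(Bits == 512 && "unexpected vector copy width");
  return VMOVAPSZrr; // 512-bit copies always use the EVEX move
}

int main() {
  // A KNL-like target (AVX512F, no VLX) now selects the plain VEX move for a
  // 128-bit copy, which is what the vmovaps %xmm updates in the tests below
  // check for.
  assert(copyOpcodeForWidth(128, /*HasAVX=*/true, /*HasVLX=*/false) == VMOVAPSrr);
  assert(copyOpcodeForWidth(256, /*HasAVX=*/true, /*HasVLX=*/true) == VMOVAPSZ256rr);
  return 0;
}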
@@ -4498,22 +4498,6 @@
   if (Subtarget.hasBWI())
     if (auto Opc = copyPhysRegOpcode_AVX512_BW(DestReg, SrcReg))
       return Opc;
-  if (X86::VR128XRegClass.contains(DestReg, SrcReg)) {
-    if (Subtarget.hasVLX())
-      return X86::VMOVAPSZ128rr;
-    DestReg = get512BitSuperRegister(DestReg);
-    SrcReg = get512BitSuperRegister(SrcReg);
-    return X86::VMOVAPSZrr;
-  }
-  if (X86::VR256XRegClass.contains(DestReg, SrcReg)) {
-    if (Subtarget.hasVLX())
-      return X86::VMOVAPSZ256rr;
-    DestReg = get512BitSuperRegister(DestReg);
-    SrcReg = get512BitSuperRegister(SrcReg);
-    return X86::VMOVAPSZrr;
-  }
-  if (X86::VR512RegClass.contains(DestReg, SrcReg))
-    return X86::VMOVAPSZrr;
   if (MaskRegClassContains(DestReg) && MaskRegClassContains(SrcReg))
     return X86::KMOVWkk;
   if (MaskRegClassContains(DestReg) && GRRegClassContains(SrcReg)) {
@@ -4534,6 +4518,7 @@
   // First deal with the normal symmetric copies.
   bool HasAVX = Subtarget.hasAVX();
   bool HasAVX512 = Subtarget.hasAVX512();
+  bool HasVLX = Subtarget.hasVLX();
   unsigned Opc = 0;
   if (X86::GR64RegClass.contains(DestReg, SrcReg))
     Opc = X86::MOV64rr;
@@ -4555,12 +4540,14 @@
   }
   else if (X86::VR64RegClass.contains(DestReg, SrcReg))
     Opc = X86::MMX_MOVQ64rr;
+  else if (X86::VR128XRegClass.contains(DestReg, SrcReg))
+    Opc = HasVLX ? X86::VMOVAPSZ128rr : HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
+  else if (X86::VR256XRegClass.contains(DestReg, SrcReg))
+    Opc = HasVLX ? X86::VMOVAPSZ256rr : X86::VMOVAPSYrr;
+  else if (X86::VR512RegClass.contains(DestReg, SrcReg))
+    Opc = X86::VMOVAPSZrr;
   else if (HasAVX512)
     Opc = copyPhysRegOpcode_AVX512(DestReg, SrcReg, Subtarget);
-  else if (X86::VR128RegClass.contains(DestReg, SrcReg))
-    Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
-  else if (X86::VR256RegClass.contains(DestReg, SrcReg))
-    Opc = X86::VMOVAPSYrr;
   if (!Opc)
     Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget);
 
@@ -690,13 +690,3 @@
     FrameReg = getX86SubSuperRegister(FrameReg, 32);
   return FrameReg;
 }
-
-unsigned llvm::get512BitSuperRegister(unsigned Reg) {
-  if (Reg >= X86::XMM0 && Reg <= X86::XMM31)
-    return X86::ZMM0 + (Reg - X86::XMM0);
-  if (Reg >= X86::YMM0 && Reg <= X86::YMM31)
-    return X86::ZMM0 + (Reg - X86::YMM0);
-  if (Reg >= X86::ZMM0 && Reg <= X86::ZMM31)
-    return Reg;
-  llvm_unreachable("Unexpected SIMD register");
-}
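For contrast, the offset arithmetic the deleted helper relied on is easy to replicate outside LLVM. The snippet below is a hypothetical, self-contained illustration: the register numbers are made-up contiguous values (the real X86 enum values differ), and to512BitSuperRegister merely mirrors the removed get512BitSuperRegister.

#include <cassert>

// Made-up contiguous numbering for illustration only. In LLVM's X86 enum the
// XMM0-31, YMM0-31 and ZMM0-31 runs are likewise contiguous, which is what
// reduced the removed helper to simple offset arithmetic.
enum : unsigned { XMM0 = 0, YMM0 = 32, ZMM0 = 64 };

// Hypothetical mirror of the deleted get512BitSuperRegister: XMMn/YMMn -> ZMMn.
unsigned to512BitSuperRegister(unsigned Reg) {
  if (Reg < XMM0 + 32)
    return ZMM0 + (Reg - XMM0);
  if (Reg >= YMM0 && Reg < YMM0 + 32)
    return ZMM0 + (Reg - YMM0);
  assert(Reg >= ZMM0 && Reg < ZMM0 + 32 && "Unexpected SIMD register");
  return Reg;
}

int main() {
  // The old no-VLX path rewrote both operands of a 128/256-bit copy this way
  // and then emitted a full 512-bit VMOVAPSZrr; the new code never needs to.
  assert(to512BitSuperRegister(XMM0 + 3) == ZMM0 + 3);   // XMM3  -> ZMM3
  assert(to512BitSuperRegister(YMM0 + 17) == ZMM0 + 17); // YMM17 -> ZMM17
  return 0;
}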
@@ -136,9 +136,6 @@
   unsigned getSlotSize() const { return SlotSize; }
 };
 
-//get512BitRegister - X86 utility - returns 512-bit super register
-unsigned get512BitSuperRegister(unsigned Reg);
-
 } // End llvm namespace
 
 #endif
@@ -334,30 +334,21 @@
 }
 
 define <16 x i8> @test8(<16 x i8> %a1, <16 x i8> %a2, i1 %cond) {
-; KNL-LABEL: test8:
-; KNL: ## BB#0:
-; KNL-NEXT: testb $1, %dil
-; KNL-NEXT: jne LBB8_2
-; KNL-NEXT: ## BB#1:
-; KNL-NEXT: vmovaps %zmm1, %zmm0
-; KNL-NEXT: LBB8_2:
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test8:
-; SKX: ## BB#0:
-; SKX-NEXT: testb $1, %dil
-; SKX-NEXT: jne LBB8_2
-; SKX-NEXT: ## BB#1:
-; SKX-NEXT: vmovaps %xmm1, %xmm0
-; SKX-NEXT: LBB8_2:
-; SKX-NEXT: retq
+; ALL_X64-LABEL: test8:
+; ALL_X64: ## BB#0:
+; ALL_X64-NEXT: testb $1, %dil
+; ALL_X64-NEXT: jne LBB8_2
+; ALL_X64-NEXT: ## BB#1:
+; ALL_X64-NEXT: vmovaps %xmm1, %xmm0
+; ALL_X64-NEXT: LBB8_2:
+; ALL_X64-NEXT: retq
 ;
 ; KNL_X32-LABEL: test8:
 ; KNL_X32: ## BB#0:
 ; KNL_X32-NEXT: testb $1, {{[0-9]+}}(%esp)
 ; KNL_X32-NEXT: jne LBB8_2
 ; KNL_X32-NEXT: ## BB#1:
-; KNL_X32-NEXT: vmovaps %zmm1, %zmm0
+; KNL_X32-NEXT: vmovaps %xmm1, %xmm0
 ; KNL_X32-NEXT: LBB8_2:
 ; KNL_X32-NEXT: retl
   %res = select i1 %cond, <16 x i8> %a1, <16 x i8> %a2
@@ -162,7 +162,7 @@
 ; KNL-NEXT: vpsllw $15, %ymm0, %ymm0
 ; KNL-NEXT: vpsraw $15, %ymm0, %ymm0
 ; KNL-NEXT: vpand %ymm1, %ymm0, %ymm1
-; KNL-NEXT: vmovdqa64 %zmm2, %zmm0
+; KNL-NEXT: vmovdqa %ymm2, %ymm0
 ; KNL-NEXT: retq
 ;
 ; SKX-LABEL: zext_32x8mem_to_32x16:
@@ -191,7 +191,7 @@
 ; KNL-NEXT: vpsllw $15, %ymm0, %ymm0
 ; KNL-NEXT: vpsraw $15, %ymm0, %ymm0
 ; KNL-NEXT: vpand %ymm1, %ymm0, %ymm1
-; KNL-NEXT: vmovdqa64 %zmm2, %zmm0
+; KNL-NEXT: vmovdqa %ymm2, %ymm0
 ; KNL-NEXT: retq
 ;
 ; SKX-LABEL: sext_32x8mem_to_32x16:
@@ -212,7 +212,7 @@
 ; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; KNL-NEXT: vmovdqa64 %zmm2, %zmm0
+; KNL-NEXT: vmovdqa %ymm2, %ymm0
 ; KNL-NEXT: retq
 ;
 ; SKX-LABEL: zext_32x8_to_32x16:
@@ -257,7 +257,7 @@
 ; KNL-NEXT: vpmovsxbw %xmm0, %ymm2
 ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; KNL-NEXT: vpmovsxbw %xmm0, %ymm1
-; KNL-NEXT: vmovdqa64 %zmm2, %zmm0
+; KNL-NEXT: vmovdqa %ymm2, %ymm0
 ; KNL-NEXT: retq
 ;
 ; SKX-LABEL: sext_32x8_to_32x16:
@@ -197,7 +197,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: andl $1, %edi
 ; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vmovaps %xmm2, %xmm3
 ; CHECK-NEXT: vsqrtss %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT: vsqrtss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT: vsqrtss {ru-sae}, %xmm1, %xmm0, %xmm4 {%k1} {z}
@@ -224,7 +224,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: andl $1, %edi
 ; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vmovaps %xmm2, %xmm3
 ; CHECK-NEXT: vsqrtsd %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT: vsqrtsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT: vsqrtsd {ru-sae}, %xmm1, %xmm0, %xmm4 {%k1} {z}
@@ -2680,7 +2680,7 @@
 ; CHECK-NEXT: andl $1, %edi
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vaddss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
 ; CHECK-NEXT: retq
   %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 0)
   ret <4 x float> %res
@@ -2692,7 +2692,7 @@
 ; CHECK-NEXT: andl $1, %edi
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vaddss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
 ; CHECK-NEXT: retq
   %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 1)
   ret <4 x float> %res
@@ -2704,7 +2704,7 @@
 ; CHECK-NEXT: andl $1, %edi
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vaddss {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
 ; CHECK-NEXT: retq
   %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 2)
   ret <4 x float> %res
@@ -2716,7 +2716,7 @@
 ; CHECK-NEXT: andl $1, %edi
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vaddss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
 ; CHECK-NEXT: retq
   %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 3)
   ret <4 x float> %res
@@ -2728,7 +2728,7 @@
 ; CHECK-NEXT: andl $1, %edi
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
 ; CHECK-NEXT: retq
   %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
   ret <4 x float> %res
@@ -2762,7 +2762,7 @@
 ; CHECK-NEXT: andl $1, %edi
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vaddsd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vmovapd %zmm2, %zmm0
+; CHECK-NEXT: vmovapd %xmm2, %xmm0
 ; CHECK-NEXT: retq
   %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 0)
   ret <2 x double> %res
@@ -2774,7 +2774,7 @@
 ; CHECK-NEXT: andl $1, %edi
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vaddsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vmovapd %zmm2, %zmm0
+; CHECK-NEXT: vmovapd %xmm2, %xmm0
 ; CHECK-NEXT: retq
   %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 1)
   ret <2 x double> %res
@@ -2786,7 +2786,7 @@
 ; CHECK-NEXT: andl $1, %edi
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vaddsd {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vmovapd %zmm2, %zmm0
+; CHECK-NEXT: vmovapd %xmm2, %xmm0
 ; CHECK-NEXT: retq
   %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 2)
   ret <2 x double> %res
@@ -2798,7 +2798,7 @@
 ; CHECK-NEXT: andl $1, %edi
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vaddsd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vmovapd %zmm2, %zmm0
+; CHECK-NEXT: vmovapd %xmm2, %xmm0
 ; CHECK-NEXT: retq
   %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 3)
   ret <2 x double> %res
@@ -2810,7 +2810,7 @@
 ; CHECK-NEXT: andl $1, %edi
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vaddsd %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vmovapd %zmm2, %zmm0
+; CHECK-NEXT: vmovapd %xmm2, %xmm0
 ; CHECK-NEXT: retq
   %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
   ret <2 x double> %res
@@ -2844,7 +2844,7 @@
 ; CHECK-NEXT: andl $1, %edi
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmaxss {sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
 ; CHECK-NEXT: retq
   %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8)
   ret <4 x float> %res
@@ -2876,7 +2876,7 @@
 ; CHECK-NEXT: andl $1, %edi
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmaxss %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
 ; CHECK-NEXT: retq
   %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
   ret <4 x float> %res
@@ -2909,7 +2909,7 @@
 ; CHECK-NEXT: andl $1, %edi
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmaxsd {sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vmovapd %zmm2, %zmm0
+; CHECK-NEXT: vmovapd %xmm2, %xmm0
 ; CHECK-NEXT: retq
   %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8)
   ret <2 x double> %res
@@ -2941,7 +2941,7 @@
 ; CHECK-NEXT: andl $1, %edi
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmaxsd %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vmovapd %zmm2, %zmm0
+; CHECK-NEXT: vmovapd %xmm2, %xmm0
 ; CHECK-NEXT: retq
   %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
   ret <2 x double> %res
@@ -4190,7 +4190,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: andl $1, %edi
 ; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vmovaps %xmm2, %xmm3
 ; CHECK-NEXT: vgetexpss %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm4 {%k1} {z}
@@ -4217,7 +4217,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: andl $1, %edi
 ; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vmovaps %xmm2, %xmm3
 ; CHECK-NEXT: vgetexpsd %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT: vgetexpsd %xmm1, %xmm0, %xmm4
 ; CHECK-NEXT: vgetexpsd {sae}, %xmm1, %xmm0, %xmm2 {%k1}
@@ -4437,7 +4437,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: andl $1, %edi
 ; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovapd %zmm2, %zmm3
+; CHECK-NEXT: vmovapd %xmm2, %xmm3
 ; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm4 {%k1} {z}
 ; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm5
@@ -4841,7 +4841,7 @@
 ; CHECK-NEXT: andl $1, %edi
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
 ; CHECK-NEXT: retq
   %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
   ret <4 x float> %res
@@ -4894,7 +4894,7 @@
 ; CHECK-NEXT: andl $1, %edi
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vmovapd %zmm2, %zmm0
+; CHECK-NEXT: vmovapd %xmm2, %xmm0
 ; CHECK-NEXT: retq
   %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
   ret <2 x double> %res
@@ -5522,10 +5522,10 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: andl $1, %edi
 ; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vmovaps %xmm0, %xmm3
 ; CHECK-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm3 {%k1}
 ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vmovaps %zmm0, %zmm5
+; CHECK-NEXT: vmovaps %xmm0, %xmm5
 ; CHECK-NEXT: vfixupimmss $5, %xmm4, %xmm1, %xmm5 {%k1}
 ; CHECK-NEXT: vfixupimmss $5, {sae}, %xmm2, %xmm1, %xmm0
 ; CHECK-NEXT: vaddps %xmm5, %xmm3, %xmm1
@@ -5546,9 +5546,9 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: andl $1, %edi
 ; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vmovaps %xmm0, %xmm3
 ; CHECK-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm3 {%k1} {z}
-; CHECK-NEXT: vmovaps %zmm0, %zmm4
+; CHECK-NEXT: vmovaps %xmm0, %xmm4
 ; CHECK-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm4
 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vfixupimmss $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
@@ -5616,9 +5616,9 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: andl $1, %edi
 ; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovapd %zmm0, %zmm3
+; CHECK-NEXT: vmovapd %xmm0, %xmm3
 ; CHECK-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm3 {%k1}
-; CHECK-NEXT: vmovapd %zmm0, %zmm4
+; CHECK-NEXT: vmovapd %xmm0, %xmm4
 ; CHECK-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm4
 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vfixupimmsd $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1}
@@ -5640,10 +5640,10 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: andl $1, %edi
 ; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovapd %zmm0, %zmm3
+; CHECK-NEXT: vmovapd %xmm0, %xmm3
 ; CHECK-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm3 {%k1} {z}
 ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vmovapd %zmm0, %zmm5
+; CHECK-NEXT: vmovapd %xmm0, %xmm5
 ; CHECK-NEXT: vfixupimmsd $5, {sae}, %xmm4, %xmm1, %xmm5 {%k1} {z}
 ; CHECK-NEXT: vfixupimmsd $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT: vaddpd %xmm5, %xmm3, %xmm1
@@ -5740,11 +5740,11 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: andl $1, %edi
 ; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vmovaps %xmm0, %xmm3
 ; CHECK-NEXT: vfmadd132sd %xmm1, %xmm2, %xmm3 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm4
+; CHECK-NEXT: vmovaps %xmm1, %xmm4
 ; CHECK-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm4
-; CHECK-NEXT: vmovaps %zmm0, %zmm5
+; CHECK-NEXT: vmovaps %xmm0, %xmm5
 ; CHECK-NEXT: vfmadd132sd {rz-sae}, %xmm1, %xmm2, %xmm5 {%k1}
 ; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm0, %xmm1
 ; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0
@@ -5768,11 +5768,11 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: andl $1, %edi
 ; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vmovaps %xmm0, %xmm3
 ; CHECK-NEXT: vfmadd132ss %xmm1, %xmm2, %xmm3 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm4
+; CHECK-NEXT: vmovaps %xmm1, %xmm4
 ; CHECK-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm4
-; CHECK-NEXT: vmovaps %zmm0, %zmm5
+; CHECK-NEXT: vmovaps %xmm0, %xmm5
 ; CHECK-NEXT: vfmadd132ss {rz-sae}, %xmm1, %xmm2, %xmm5 {%k1}
 ; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm0, %xmm1
 ; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0
@@ -5796,7 +5796,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: andl $1, %edi
 ; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm1, %zmm3
+; CHECK-NEXT: vmovaps %xmm1, %xmm3
 ; CHECK-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm3 {%k1} {z}
 ; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm0, %xmm1 {%k1} {z}
 ; CHECK-NEXT: vaddpd %xmm1, %xmm3, %xmm0
@@ -5815,7 +5815,7 @@
 ; CHECK-NEXT: andl $1, %edi
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1 {%k1} {z}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
 ; CHECK-NEXT: retq
   %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
   %res1 = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3)
@@ -5829,11 +5829,11 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: andl $1, %edi
 ; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vmovaps %xmm2, %xmm3
 ; CHECK-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm4
+; CHECK-NEXT: vmovaps %xmm1, %xmm4
 ; CHECK-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm4
-; CHECK-NEXT: vmovaps %zmm2, %zmm5
+; CHECK-NEXT: vmovaps %xmm2, %xmm5
 ; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
 ; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm0, %xmm1
 ; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0
@@ -5857,11 +5857,11 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: andl $1, %edi
 ; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %zmm2, %zmm3
+; CHECK-NEXT: vmovaps %xmm2, %xmm3
 ; CHECK-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm4
+; CHECK-NEXT: vmovaps %xmm1, %xmm4
 ; CHECK-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm4
-; CHECK-NEXT: vmovaps %zmm2, %zmm5
+; CHECK-NEXT: vmovaps %xmm2, %xmm5
 ; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
 ; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm0, %xmm1
 ; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0
@@ -5884,7 +5884,7 @@
 ; CHECK-NEXT: andl $1, %esi
 ; CHECK-NEXT: kmovw %esi, %k1
 ; CHECK-NEXT: vfmadd231ss (%rdi), %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
 ; CHECK-NEXT: retq
   %q = load float, float* %ptr_b
   %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
@@ -5911,7 +5911,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: kxorw %k0, %k0, %k1
 ; CHECK-NEXT: vfmadd213ss (%rdi), %xmm0, %xmm1 {%k1} {z}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
 ; CHECK-NEXT: retq
   %q = load float, float* %ptr_b
   %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
@@ -417,7 +417,7 @@
 ; KNL-NEXT: cmpl %esi, %edi
 ; KNL-NEXT: jg LBB20_2
 ; KNL-NEXT: ## BB#1:
-; KNL-NEXT: vmovaps %zmm1, %zmm0
+; KNL-NEXT: vmovaps %xmm1, %xmm0
 ; KNL-NEXT: LBB20_2:
 ; KNL-NEXT: retq
 ;
@@ -39,7 +39,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vcmpless %xmm0, %xmm3, %k1
 ; CHECK-NEXT: vmovss %xmm2, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
 ; CHECK-NEXT: retq
   %cmp = fcmp oge float %a, %eps
   %cond = select i1 %cmp, float %c, float %b
@@ -51,7 +51,7 @@
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: vcmplesd %xmm0, %xmm3, %k1
 ; CHECK-NEXT: vmovsd %xmm2, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: vmovapd %xmm1, %xmm0
 ; CHECK-NEXT: retq
   %cmp = fcmp oge double %a, %eps
   %cond = select i1 %cmp, double %c, double %b
@@ -358,7 +358,7 @@
 ; AVX512F-LABEL: _invec32xi8:
 ; AVX512F: # BB#0:
 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1
+; AVX512F-NEXT: vmovdqa %ymm0, %ymm1
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: _invec32xi8:
@@ -373,7 +373,7 @@
 ; AVX512F-LABEL: _invec16xi16:
 ; AVX512F: # BB#0:
 ; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1
+; AVX512F-NEXT: vmovdqa %ymm0, %ymm1
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: _invec16xi16:
@@ -290,7 +290,7 @@
 ; KNL_64-NEXT: kxnorw %k0, %k0, %k2
 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
-; KNL_64-NEXT: vmovdqa64 %zmm2, %zmm0
+; KNL_64-NEXT: vmovdqa %ymm2, %ymm0
 ; KNL_64-NEXT: retq
 ;
 ; KNL_32-LABEL: test6:
@@ -300,7 +300,7 @@
 ; KNL_32-NEXT: kxnorw %k0, %k0, %k2
 ; KNL_32-NEXT: vpgatherqd (,%zmm2), %ymm1 {%k2}
 ; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm2) {%k1}
-; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm0
+; KNL_32-NEXT: vmovdqa %ymm1, %ymm0
 ; KNL_32-NEXT: retl
 ;
 ; SKX-LABEL: test6:
@@ -335,7 +335,7 @@
 ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
 ; KNL_64-NEXT: kmovw %k1, %k2
 ; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm1 {%k2}
-; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm2
+; KNL_64-NEXT: vmovdqa %ymm1, %ymm2
 ; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm2 {%k1}
 ; KNL_64-NEXT: vpaddd %ymm2, %ymm1, %ymm0
 ; KNL_64-NEXT: retq
@@ -348,7 +348,7 @@
 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
 ; KNL_32-NEXT: kmovw %k1, %k2
 ; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm1 {%k2}
-; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm2
+; KNL_32-NEXT: vmovdqa %ymm1, %ymm2
 ; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm2 {%k1}
 ; KNL_32-NEXT: vpaddd %ymm2, %ymm1, %ymm0
 ; KNL_32-NEXT: retl
@@ -850,7 +850,7 @@
 ; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
 ; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
-; KNL_64-NEXT: vmovapd %zmm2, %zmm0
+; KNL_64-NEXT: vmovapd %ymm2, %ymm0
 ; KNL_64-NEXT: retq
 ;
 ; KNL_32-LABEL: test16:
@@ -867,7 +867,7 @@
 ; KNL_32-NEXT: vpsllvq {{\.LCPI.*}}, %zmm1, %zmm1
 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
 ; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
-; KNL_32-NEXT: vmovapd %zmm2, %zmm0
+; KNL_32-NEXT: vmovapd %ymm2, %ymm0
 ; KNL_32-NEXT: retl
 ;
 ; SKX-LABEL: test16:
@@ -904,7 +904,7 @@
 ; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
 ; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
-; KNL_64-NEXT: vmovapd %zmm2, %zmm0
+; KNL_64-NEXT: vmovapd %xmm2, %xmm0
 ; KNL_64-NEXT: retq
 ;
 ; KNL_32-LABEL: test17:
@@ -917,7 +917,7 @@
 ; KNL_32-NEXT: vpsllvq {{\.LCPI.*}}, %zmm1, %zmm1
 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
 ; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
-; KNL_32-NEXT: vmovapd %zmm2, %zmm0
+; KNL_32-NEXT: vmovapd %xmm2, %xmm0
 ; KNL_32-NEXT: retl
 ;
 ; SKX-LABEL: test17:
@@ -1164,7 +1164,7 @@
 ; KNL_64-NEXT: vpslld $31, %ymm1, %ymm1
 ; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k1
 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1}
-; KNL_64-NEXT: vmovaps %zmm2, %zmm0
+; KNL_64-NEXT: vmovaps %xmm2, %xmm0
 ; KNL_64-NEXT: retq
 ;
 ; KNL_32-LABEL: test22:
@@ -1180,7 +1180,7 @@
 ; KNL_32-NEXT: vpslld $31, %ymm1, %ymm1
 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
 ; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k1}
-; KNL_32-NEXT: vmovaps %zmm2, %zmm0
+; KNL_32-NEXT: vmovaps %xmm2, %xmm0
 ; KNL_32-NEXT: retl
 ;
 ; SKX-LABEL: test22:
@@ -1225,7 +1225,7 @@
 ; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
-; KNL_64-NEXT: vmovdqa64 %zmm2, %zmm0
+; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
 ; KNL_64-NEXT: retq
 ;
 ; KNL_32-LABEL: test23:
@@ -1238,7 +1238,7 @@
 ; KNL_32-NEXT: vpsllvq {{\.LCPI.*}}, %zmm1, %zmm1
 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
-; KNL_32-NEXT: vmovdqa64 %zmm2, %zmm0
+; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
 ; KNL_32-NEXT: retl
 ;
 ; SKX-LABEL: test23:
@@ -1270,7 +1270,7 @@
 ; KNL_64-NEXT: movb $3, %al
 ; KNL_64-NEXT: kmovw %eax, %k1
 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
-; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm0
+; KNL_64-NEXT: vmovdqa %xmm1, %xmm0
 ; KNL_64-NEXT: retq
 ;
 ; KNL_32-LABEL: test24:
@@ -1282,7 +1282,7 @@
 ; KNL_32-NEXT: vpsllvq {{\.LCPI.*}}, %zmm1, %zmm1
 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
-; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm0
+; KNL_32-NEXT: vmovdqa %xmm1, %xmm0
 ; KNL_32-NEXT: retl
 ;
 ; SKX-LABEL: test24:
@@ -1316,7 +1316,7 @@
 ; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
-; KNL_64-NEXT: vmovdqa64 %zmm2, %zmm0
+; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
 ; KNL_64-NEXT: retq
 ;
 ; KNL_32-LABEL: test25:
@@ -1329,7 +1329,7 @@
 ; KNL_32-NEXT: vpsllvq {{\.LCPI.*}}, %zmm1, %zmm1
 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
-; KNL_32-NEXT: vmovdqa64 %zmm2, %zmm0
+; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
 ; KNL_32-NEXT: retl
 ;
 ; SKX-LABEL: test25:
@@ -1363,7 +1363,7 @@
 ; KNL_64-NEXT: movb $3, %al
 ; KNL_64-NEXT: kmovw %eax, %k1
 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
-; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm0
+; KNL_64-NEXT: vmovdqa %xmm1, %xmm0
 ; KNL_64-NEXT: retq
 ;
 ; KNL_32-LABEL: test26:
@@ -1376,7 +1376,7 @@
 ; KNL_32-NEXT: vpsllvq {{\.LCPI.*}}, %zmm2, %zmm2
 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
-; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm0
+; KNL_32-NEXT: vmovdqa %xmm1, %xmm0
 ; KNL_32-NEXT: retl
 ;
 ; SKX-LABEL: test26:
@@ -453,7 +453,7 @@
 ; AVX512F-NEXT: kshiftlw $8, %k0, %k0
 ; AVX512F-NEXT: kshiftrw $8, %k0, %k1
 ; AVX512F-NEXT: vmovups (%rdi), %zmm1 {%k1}
-; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: vmovaps %ymm1, %ymm0
 ; AVX512F-NEXT: retq
 ;
 ; SKX-LABEL: test11a:
@@ -500,7 +500,7 @@
 ; AVX512F-NEXT: kshiftlw $8, %k0, %k0
 ; AVX512F-NEXT: kshiftrw $8, %k0, %k1
 ; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm1 {%k1}
-; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
+; AVX512F-NEXT: vmovdqa %ymm1, %ymm0
 ; AVX512F-NEXT: retq
 ;
 ; SKX-LABEL: test11b:
@@ -359,47 +359,26 @@
 ; SSE-NEXT: addq $40, %rsp
 ; SSE-NEXT: retq
 ;
-; AVX2-LABEL: mul_v2i64spill:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: subq $40, %rsp
-; AVX2-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX2-NEXT: callq foo
-; AVX2-NEXT: vmovdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; AVX2-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload
-; AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm0
-; AVX2-NEXT: vpsrlq $32, %xmm2, %xmm1
-; AVX2-NEXT: vmovdqa %xmm2, %xmm3
-; AVX2-NEXT: vpmuludq %xmm1, %xmm4, %xmm1
-; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1
-; AVX2-NEXT: vpsrlq $32, %xmm4, %xmm2
-; AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX2-NEXT: vpaddq %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: addq $40, %rsp
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: mul_v2i64spill:
-; AVX512: # BB#0: # %entry
-; AVX512-NEXT: subq $40, %rsp
-; AVX512-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX512-NEXT: callq foo
-; AVX512-NEXT: vmovdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; AVX512-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload
-; AVX512-NEXT: vpmuludq %xmm2, %xmm4, %xmm0
-; AVX512-NEXT: vpsrlq $32, %xmm2, %xmm1
-; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3
-; AVX512-NEXT: vpmuludq %xmm1, %xmm4, %xmm1
-; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1
-; AVX512-NEXT: vpsrlq $32, %xmm4, %xmm2
-; AVX512-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; AVX512-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX512-NEXT: vpaddq %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: addq $40, %rsp
-; AVX512-NEXT: retq
+; AVX-LABEL: mul_v2i64spill:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: subq $40, %rsp
+; AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: callq foo
+; AVX-NEXT: vmovdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; AVX-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload
+; AVX-NEXT: vpmuludq %xmm2, %xmm4, %xmm0
+; AVX-NEXT: vpsrlq $32, %xmm2, %xmm1
+; AVX-NEXT: vmovdqa %xmm2, %xmm3
+; AVX-NEXT: vpmuludq %xmm1, %xmm4, %xmm1
+; AVX-NEXT: vpsllq $32, %xmm1, %xmm1
+; AVX-NEXT: vpsrlq $32, %xmm4, %xmm2
+; AVX-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpsllq $32, %xmm2, %xmm2
+; AVX-NEXT: vpaddq %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: addq $40, %rsp
+; AVX-NEXT: retq
 entry:
 ; Use a call to force spills.
   call void @foo()
@@ -167,7 +167,7 @@
 ; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm2
 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm1
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
+; AVX512F-NEXT: vmovdqa %ymm2, %ymm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: sext_32i8_to_32i16:
@@ -144,7 +144,7 @@
 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
+; AVX512F-NEXT: vmovdqa %ymm2, %ymm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512BW-LABEL: zext_32i8_to_32i16: