llvm.org GIT mirror llvm / f940424
[X86][XOP] Support for VPERMIL2PD/VPERMIL2PS 2-input shuffle instructions. This patch begins adding support for lowering to the XOP VPERMIL2PD/VPERMIL2PS shuffle instructions - adding the X86ISD::VPERMIL2 opcode and cleaning up the usage. The internal llvm intrinsics were assuming the shuffle mask operand was the same type as the float/double input operands (I guess to simplify the intrinsic definitions in X86InstrXOP.td to a single value type). These needed changing to integer types (matching the clang builtin and the AMD intrinsics definitions); an auto upgrade path is added to convert old calls. Mask decoding/target shuffle support will be added in future patches. Differential Revision: http://reviews.llvm.org/D20049 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@271633 91177308-0d34-0410-b5e6-96231b3b80d8 Simon Pilgrim 4 years ago
11 changed file(s) with 221 addition(s) and 92 deletion(s). Raw diff Collapse all Expand all
38723872
38733873 def int_x86_xop_vpermil2pd : GCCBuiltin<"__builtin_ia32_vpermil2pd">,
38743874 Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
3875 llvm_v2f64_ty, llvm_i8_ty],
3875 llvm_v2i64_ty, llvm_i8_ty],
38763876 [IntrNoMem]>;
38773877
38783878 def int_x86_xop_vpermil2pd_256 :
38793879 GCCBuiltin<"__builtin_ia32_vpermil2pd256">,
38803880 Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
3881 llvm_v4f64_ty, llvm_i8_ty],
3881 llvm_v4i64_ty, llvm_i8_ty],
38823882 [IntrNoMem]>;
38833883
38843884 def int_x86_xop_vpermil2ps : GCCBuiltin<"__builtin_ia32_vpermil2ps">,
38853885 Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
3886 llvm_v4f32_ty, llvm_i8_ty],
3886 llvm_v4i32_ty, llvm_i8_ty],
38873887 [IntrNoMem]>;
38883888 def int_x86_xop_vpermil2ps_256 :
38893889 GCCBuiltin<"__builtin_ia32_vpermil2ps256">,
38903890 Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
3891 llvm_v8f32_ty, llvm_i8_ty],
3891 llvm_v8i32_ty, llvm_i8_ty],
38923892 [IntrNoMem]>;
38933893
38943894 def int_x86_xop_vfrcz_pd : GCCBuiltin<"__builtin_ia32_vfrczpd">,
281281 NewFn = F;
282282 return true;
283283 }
284 // Upgrade any XOP PERMIL2 index operand still using a float/double vector.
285 if (Name.startswith("x86.xop.vpermil2")) {
286 auto Params = F->getFunctionType()->params();
287 auto Idx = Params[2];
288 if (Idx->getScalarType()->isFloatingPointTy()) {
289 F->setName(Name + ".old");
290 unsigned IdxSize = Idx->getPrimitiveSizeInBits();
291 unsigned EltSize = Idx->getScalarSizeInBits();
292 Intrinsic::ID Permil2ID;
293 if (EltSize == 64 && IdxSize == 128)
294 Permil2ID = Intrinsic::x86_xop_vpermil2pd;
295 else if (EltSize == 32 && IdxSize == 128)
296 Permil2ID = Intrinsic::x86_xop_vpermil2ps;
297 else if (EltSize == 64 && IdxSize == 256)
298 Permil2ID = Intrinsic::x86_xop_vpermil2pd_256;
299 else
300 Permil2ID = Intrinsic::x86_xop_vpermil2ps_256;
301 NewFn = Intrinsic::getDeclaration(F->getParent(), Permil2ID);
302 return true;
303 }
304 }
284305 break;
285306 }
286307 }
910931 CI->eraseFromParent();
911932 return;
912933
934 case Intrinsic::x86_xop_vpermil2pd:
935 case Intrinsic::x86_xop_vpermil2ps:
936 case Intrinsic::x86_xop_vpermil2pd_256:
937 case Intrinsic::x86_xop_vpermil2ps_256: {
938 SmallVector<Value *, 4> Args(CI->arg_operands().begin(),
939 CI->arg_operands().end());
940 VectorType *FltIdxTy = cast<VectorType>(Args[2]->getType());
941 VectorType *IntIdxTy = VectorType::getInteger(FltIdxTy);
942 Args[2] = Builder.CreateBitCast(Args[2], IntIdxTy);
943 CI->replaceAllUsesWith(Builder.CreateCall(NewFn, Args, Name));
944 CI->eraseFromParent();
945 return;
946 }
947
913948 case Intrinsic::x86_sse41_ptestc:
914949 case Intrinsic::x86_sse41_ptestz:
915950 case Intrinsic::x86_sse41_ptestnzc: {
2194621946 case X86ISD::VPSHL: return "X86ISD::VPSHL";
2194721947 case X86ISD::VPCOM: return "X86ISD::VPCOM";
2194821948 case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
21949 case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
2194921950 case X86ISD::FMADD: return "X86ISD::FMADD";
2195021951 case X86ISD::FMSUB: return "X86ISD::FMSUB";
2195121952 case X86ISD::FNMADD: return "X86ISD::FNMADD";
450450 VPCOM, VPCOMU,
451451 // XOP packed permute bytes.
452452 VPPERM,
453 // XOP two source permutation.
454 VPERMIL2,
453455
454456 // Vector multiply packed unsigned doubleword integers.
455457 PMULUDQ,
244244 SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
245245 SDTCisSameAs<0,2>,
246246 SDTCisVT<3, i8>]>>;
247
247 def X86vpermil2 : SDNode<"X86ISD::VPERMIL2",
248 SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>,
249 SDTCisSameAs<0,2>,
250 SDTCisSameSizeAs<0,3>,
251 SDTCisSameNumEltsAs<0, 3>,
252 SDTCisVT<4, i8>]>>;
248253 def X86vpperm : SDNode<"X86ISD::VPPERM",
249254 SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
250255 SDTCisSameAs<0,2>]>>;
341341 (VPCMOVrrrY VR256:$src1, VR256:$src2, VR256:$src3)>;
342342 }
343343
344 multiclass xop5op<bits<8> opc, string OpcodeStr, Intrinsic Int128,
345 Intrinsic Int256, PatFrag ld_128, PatFrag ld_256> {
344 multiclass xop5op<bits<8> opc, string OpcodeStr, SDNode OpNode,
345 ValueType vt128, ValueType vt256,
346 ValueType id128, ValueType id256,
347 PatFrag ld_128, PatFrag ld_256> {
346348 def rr : IXOP5
347349 (ins VR128:$src1, VR128:$src2, VR128:$src3, u8imm:$src4),
348350 !strconcat(OpcodeStr,
349351 "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
350352 [(set VR128:$dst,
351 (Int128 VR128:$src1, VR128:$src2, VR128:$src3, imm:$src4))]>;
353 (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
354 (id128 VR128:$src3), (i8 imm:$src4))))]>;
352355 def rm : IXOP5
353 (ins VR128:$src1, VR128:$src2, f128mem:$src3, u8imm:$src4),
356 (ins VR128:$src1, VR128:$src2, i128mem:$src3, u8imm:$src4),
354357 !strconcat(OpcodeStr,
355358 "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
356359 [(set VR128:$dst,
357 (Int128 VR128:$src1, VR128:$src2, (ld_128 addr:$src3), imm:$src4))]>,
360 (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
361 (id128 (bitconvert (loadv2i64 addr:$src3))),
362 (i8 imm:$src4))))]>,
358363 VEX_W, MemOp4;
359364 def mr : IXOP5
360365 (ins VR128:$src1, f128mem:$src2, VR128:$src3, u8imm:$src4),
361366 !strconcat(OpcodeStr,
362367 "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
363368 [(set VR128:$dst,
364 (Int128 VR128:$src1, (ld_128 addr:$src2), VR128:$src3, imm:$src4))]>;
369 (vt128 (OpNode (vt128 VR128:$src1),
370 (vt128 (bitconvert (ld_128 addr:$src2))),
371 (id128 VR128:$src3), (i8 imm:$src4))))]>;
365372 // For disassembler
366373 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
367374 def rr_REV : IXOP5
375382 !strconcat(OpcodeStr,
376383 "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
377384 [(set VR256:$dst,
378 (Int256 VR256:$src1, VR256:$src2, VR256:$src3, imm:$src4))]>, VEX_L;
385 (vt256 (OpNode (vt256 VR256:$src1), (vt256 VR256:$src2),
386 (id256 VR256:$src3), (i8 imm:$src4))))]>, VEX_L;
379387 def rmY : IXOP5
380 (ins VR256:$src1, VR256:$src2, f256mem:$src3, u8imm:$src4),
388 (ins VR256:$src1, VR256:$src2, i256mem:$src3, u8imm:$src4),
381389 !strconcat(OpcodeStr,
382390 "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
383391 [(set VR256:$dst,
384 (Int256 VR256:$src1, VR256:$src2, (ld_256 addr:$src3), imm:$src4))]>,
385 VEX_W, MemOp4, VEX_L;
392 (vt256 (OpNode (vt256 VR256:$src1), (vt256 VR256:$src2),
393 (id256 (bitconvert (loadv4i64 addr:$src3))),
394 (i8 imm:$src4))))]>, VEX_W, MemOp4, VEX_L;
386395 def mrY : IXOP5
387396 (ins VR256:$src1, f256mem:$src2, VR256:$src3, u8imm:$src4),
388397 !strconcat(OpcodeStr,
389398 "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
390399 [(set VR256:$dst,
391 (Int256 VR256:$src1, (ld_256 addr:$src2), VR256:$src3, imm:$src4))]>,
392 VEX_L;
400 (vt256 (OpNode (vt256 VR256:$src1),
401 (vt256 (bitconvert (ld_256 addr:$src2))),
402 (id256 VR256:$src3), (i8 imm:$src4))))]>, VEX_L;
393403 // For disassembler
394404 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
395405 def rrY_REV : IXOP5
400410 }
401411
402412 let ExeDomain = SSEPackedDouble in
403 defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", int_x86_xop_vpermil2pd,
404 int_x86_xop_vpermil2pd_256, loadv2f64, loadv4f64>;
413 defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", X86vpermil2, v2f64, v4f64,
414 v2i64, v4i64, loadv2f64, loadv4f64>;
405415
406416 let ExeDomain = SSEPackedSingle in
407 defm VPERMIL2PS : xop5op<0x48, "vpermil2ps", int_x86_xop_vpermil2ps,
408 int_x86_xop_vpermil2ps_256, loadv4f32, loadv8f32>;
409
417 defm VPERMIL2PS : xop5op<0x48, "vpermil2ps", X86vpermil2, v4f32, v8f32,
418 v4i32, v8i32, loadv4f32, loadv8f32>;
419
22332233 X86_INTRINSIC_DATA(xop_vpcomuq, INTR_TYPE_3OP, X86ISD::VPCOMU, 0),
22342234 X86_INTRINSIC_DATA(xop_vpcomuw, INTR_TYPE_3OP, X86ISD::VPCOMU, 0),
22352235 X86_INTRINSIC_DATA(xop_vpcomw, INTR_TYPE_3OP, X86ISD::VPCOM, 0),
2236 X86_INTRINSIC_DATA(xop_vpermil2pd, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0),
2237 X86_INTRINSIC_DATA(xop_vpermil2pd_256, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0),
2238 X86_INTRINSIC_DATA(xop_vpermil2ps, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0),
2239 X86_INTRINSIC_DATA(xop_vpermil2ps_256, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0),
22362240 X86_INTRINSIC_DATA(xop_vpperm, INTR_TYPE_3OP, X86ISD::VPPERM, 0),
22372241 X86_INTRINSIC_DATA(xop_vprotb, INTR_TYPE_2OP, X86ISD::VPROT, 0),
22382242 X86_INTRINSIC_DATA(xop_vprotbi, INTR_TYPE_2OP, X86ISD::VPROTI, 0),
165165 }
166166 declare <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16>, <8 x i16>, i8) nounwind readnone
167167
168 define <2 x double> @stack_fold_vpermil2pd_rm(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
168 define <2 x double> @stack_fold_vpermil2pd_rm(<2 x double> %a0, <2 x double> %a1, <2 x i64> %a2) {
169169 ;CHECK-LABEL: stack_fold_vpermil2pd_rm
170170 ;CHECK: vpermil2pd $0, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
171171 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
172 %2 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 0)
172 %2 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x i64> %a2, i8 0)
173173 ret <2 x double> %2
174174 }
175 define <2 x double> @stack_fold_vpermil2pd_mr(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
175 define <2 x double> @stack_fold_vpermil2pd_mr(<2 x double> %a0, <2 x i64> %a1, <2 x double> %a2) {
176176 ;CHECK-LABEL: stack_fold_vpermil2pd_mr
177177 ;CHECK: vpermil2pd $0, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
178178 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
179 %2 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a2, <2 x double> %a1, i8 0)
179 %2 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a2, <2 x i64> %a1, i8 0)
180180 ret <2 x double> %2
181181 }
182 declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
183
184 define <4 x double> @stack_fold_vpermil2pd_rm_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
182 declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x i64>, i8) nounwind readnone
183
184 define <4 x double> @stack_fold_vpermil2pd_rm_ymm(<4 x double> %a0, <4 x double> %a1, <4 x i64> %a2) {
185185 ;CHECK-LABEL: stack_fold_vpermil2pd_rm
186186 ;CHECK: vpermil2pd $0, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
187187 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
188 %2 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 0)
188 %2 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x i64> %a2, i8 0)
189189 ret <4 x double> %2
190190 }
191 define <4 x double> @stack_fold_vpermil2pd_mr_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
191 define <4 x double> @stack_fold_vpermil2pd_mr_ymm(<4 x double> %a0, <4 x i64> %a1, <4 x double> %a2) {
192192 ;CHECK-LABEL: stack_fold_vpermil2pd_mr
193193 ;CHECK: vpermil2pd $0, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
194194 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
195 %2 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a2, <4 x double> %a1, i8 0)
195 %2 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a2, <4 x i64> %a1, i8 0)
196196 ret <4 x double> %2
197197 }
198 declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
199
200 define <4 x float> @stack_fold_vpermil2ps_rm(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
198 declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x i64>, i8) nounwind readnone
199
200 define <4 x float> @stack_fold_vpermil2ps_rm(<4 x float> %a0, <4 x float> %a1, <4 x i32> %a2) {
201201 ;CHECK-LABEL: stack_fold_vpermil2ps_rm
202202 ;CHECK: vpermil2ps $0, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
203203 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
204 %2 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 0)
204 %2 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> %a2, i8 0)
205205 ret <4 x float> %2
206206 }
207 define <4 x float> @stack_fold_vpermil2ps_mr(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
207 define <4 x float> @stack_fold_vpermil2ps_mr(<4 x float> %a0, <4 x i32> %a1, <4 x float> %a2) {
208208 ;CHECK-LABEL: stack_fold_vpermil2ps_mr
209209 ;CHECK: vpermil2ps $0, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
210210 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
211 %2 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a2, <4 x float> %a1, i8 0)
211 %2 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a2, <4 x i32> %a1, i8 0)
212212 ret <4 x float> %2
213213 }
214 declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
215
216 define <8 x float> @stack_fold_vpermil2ps_rm_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
214 declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x i32>, i8) nounwind readnone
215
216 define <8 x float> @stack_fold_vpermil2ps_rm_ymm(<8 x float> %a0, <8 x float> %a1, <8 x i32> %a2) {
217217 ;CHECK-LABEL: stack_fold_vpermil2ps_rm
218218 ;CHECK: vpermil2ps $0, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
219219 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
220 %2 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 0)
220 %2 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x i32> %a2, i8 0)
221221 ret <8 x float> %2
222222 }
223 define <8 x float> @stack_fold_vpermil2ps_mr_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
223 define <8 x float> @stack_fold_vpermil2ps_mr_ymm(<8 x float> %a0, <8 x i32> %a1, <8 x float> %a2) {
224224 ;CHECK-LABEL: stack_fold_vpermil2ps_mr
225225 ;CHECK: vpermil2ps $0, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
226226 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
227 %2 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a2, <8 x float> %a1, i8 0)
227 %2 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a2, <8 x i32> %a1, i8 0)
228228 ret <8 x float> %2
229229 }
230 declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
230 declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x i32>, i8) nounwind readnone
231231
232232 define <4 x i32> @stack_fold_vphaddbd(<16 x i8> %a0) {
233233 ;CHECK-LABEL: stack_fold_vphaddbd
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s
22 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+xop | FileCheck %s
33
4 declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
5 declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
4 declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x i64>, i8) nounwind readnone
5 declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x i64>, i8) nounwind readnone
66
7 declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
8 declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
7 declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x i32>, i8) nounwind readnone
8 declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x i32>, i8) nounwind readnone
99
1010 declare <16 x i8> @llvm.x86.xop.vpperm(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
1111
1212 define <2 x double> @combine_vpermil2pd_identity(<2 x double> %a0, <2 x double> %a1) {
1313 ; CHECK-LABEL: combine_vpermil2pd_identity:
1414 ; CHECK: # BB#0:
15 ; CHECK-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
15 ; CHECK-NEXT: movl $2, %eax
16 ; CHECK-NEXT: vmovq %rax, %xmm2
1617 ; CHECK-NEXT: vpermil2pd $0, %xmm2, %xmm0, %xmm1, %xmm0
1718 ; CHECK-NEXT: vpermil2pd $0, %xmm2, %xmm0, %xmm0, %xmm0
1819 ; CHECK-NEXT: retq
19 %mask = bitcast <2 x i64> <i64 2, i64 0> to <2 x double>
20 %res0 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a1, <2 x double> %a0, <2 x double> %mask, i8 0)
21 %res1 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %res0, <2 x double> undef, <2 x double> %mask, i8 0)
20 %res0 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a1, <2 x double> %a0, <2 x i64> <i64 2, i64 0>, i8 0)
21 %res1 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %res0, <2 x double> undef, <2 x i64> <i64 2, i64 0>, i8 0)
2222 ret <2 x double> %res1
2323 }
2424
2525 define <4 x double> @combine_vpermil2pd256_identity(<4 x double> %a0, <4 x double> %a1) {
2626 ; CHECK-LABEL: combine_vpermil2pd256_identity:
2727 ; CHECK: # BB#0:
28 ; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [9.881313e-324,0.000000e+00,9.881313e-324,0.000000e+00]
28 ; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [2,0,2,0]
2929 ; CHECK-NEXT: vpermil2pd $0, %ymm2, %ymm0, %ymm1, %ymm0
3030 ; CHECK-NEXT: vpermil2pd $0, %ymm2, %ymm0, %ymm0, %ymm0
3131 ; CHECK-NEXT: retq
32 %mask = bitcast <4 x i64> <i64 2, i64 0, i64 2, i64 0> to <4 x double>
33 %res0 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a1, <4 x double> %a0, <4 x double> %mask, i8 0)
34 %res1 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %res0, <4 x double> undef, <4 x double> %mask, i8 0)
32 %res0 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a1, <4 x double> %a0, <4 x i64> <i64 2, i64 0, i64 2, i64 0>, i8 0)
33 %res1 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %res0, <4 x double> undef, <4 x i64> <i64 2, i64 0, i64 2, i64 0>, i8 0)
3534 ret <4 x double> %res1
3635 }
3736
3837 define <4 x float> @combine_vpermil2ps_identity(<4 x float> %a0, <4 x float> %a1) {
3938 ; CHECK-LABEL: combine_vpermil2ps_identity:
4039 ; CHECK: # BB#0:
41 ; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [4.203895e-45,2.802597e-45,1.401298e-45,0.000000e+00]
40 ; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,2,1,0]
4241 ; CHECK-NEXT: vpermil2ps $0, %xmm2, %xmm0, %xmm1, %xmm0
4342 ; CHECK-NEXT: vpermil2ps $0, %xmm2, %xmm0, %xmm0, %xmm0
4443 ; CHECK-NEXT: retq
45 %mask = bitcast <4 x i32> <i32 3, i32 2, i32 1, i32 0> to <4 x float>
46 %res0 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a1, <4 x float> %a0, <4 x float> %mask, i8 0)
47 %res1 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %res0, <4 x float> undef, <4 x float> %mask, i8 0)
44 %res0 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a1, <4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>, i8 0)
45 %res1 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %res0, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>, i8 0)
4846 ret <4 x float> %res1
4947 }
5048
5149 define <8 x float> @combine_vpermil2ps256_identity(<8 x float> %a0, <8 x float> %a1) {
5250 ; CHECK-LABEL: combine_vpermil2ps256_identity:
5351 ; CHECK: # BB#0:
54 ; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [4.203895e-45,2.802597e-45,1.401298e-45,0.000000e+00,1.401298e-45,0.000000e+00,4.203895e-45,2.802597e-45]
52 ; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [3,2,1,0,1,0,3,2]
5553 ; CHECK-NEXT: vpermil2ps $0, %ymm2, %ymm0, %ymm1, %ymm0
5654 ; CHECK-NEXT: vpermil2ps $0, %ymm2, %ymm0, %ymm0, %ymm0
5755 ; CHECK-NEXT: retq
58 %mask = bitcast <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 1, i32 0, i32 3, i32 2> to <8 x float>
59 %res0 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a1, <8 x float> %a0, <8 x float> %mask, i8 0)
60 %res1 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %res0, <8 x float> undef, <8 x float> %mask, i8 0)
56 %res0 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a1, <8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 1, i32 0, i32 3, i32 2>, i8 0)
57 %res1 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %res0, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 1, i32 0, i32 3, i32 2>, i8 0)
6158 ret <8 x float> %res1
6259 }
6360
6663 ; CHECK: # BB#0:
6764 ; CHECK-NEXT: vpermil2ps $2, {{.*}}(%rip), %xmm1, %xmm0, %xmm0
6865 ; CHECK-NEXT: retq
69 %mask = bitcast <4 x i32> to <4 x float>
70 %res0 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %mask, i8 2)
66 %res0 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> , i8 2)
7167 ret <4 x float> %res0
7268 }
7369
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+xop | FileCheck %s
22
3 define <2 x double> @test_int_x86_xop_vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
4 ; CHECK-LABEL: test_int_x86_xop_vpermil2pd:
5 ; CHECK: # BB#0:
6 ; CHECK-NEXT: vpermil2pd $1, %xmm2, %xmm1, %xmm0, %xmm0
7 ; CHECK-NEXT: retq
8 %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 1) ; [#uses=1]
9 ret <2 x double> %res
10 }
11 define <2 x double> @test_int_x86_xop_vpermil2pd_mr(<2 x double> %a0, <2 x double>* %a1, <2 x double> %a2) {
12 ; CHECK-LABEL: test_int_x86_xop_vpermil2pd_mr:
13 ; CHECK: # BB#0:
14 ; CHECK-NEXT: vpermil2pd $1, %xmm1, (%rdi), %xmm0, %xmm0
15 ; CHECK-NEXT: retq
16 %vec = load <2 x double>, <2 x double>* %a1
17 %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %vec, <2 x double> %a2, i8 1) ; [#uses=1]
18 ret <2 x double> %res
19 }
20 define <2 x double> @test_int_x86_xop_vpermil2pd_rm(<2 x double> %a0, <2 x double> %a1, <2 x double>* %a2) {
21 ; CHECK-LABEL: test_int_x86_xop_vpermil2pd_rm:
22 ; CHECK: # BB#0:
23 ; CHECK-NEXT: vpermil2pd $1, (%rdi), %xmm1, %xmm0, %xmm0
24 ; CHECK-NEXT: retq
25 %vec = load <2 x double>, <2 x double>* %a2
26 %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %vec, i8 1) ; [#uses=1]
27 ret <2 x double> %res
28 }
29 declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
30
31 define <4 x double> @test_int_x86_xop_vpermil2pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
32 ; CHECK-LABEL: test_int_x86_xop_vpermil2pd_256:
33 ; CHECK: # BB#0:
34 ; CHECK-NEXT: vpermil2pd $2, %ymm2, %ymm1, %ymm0, %ymm0
35 ; CHECK-NEXT: retq
36 %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 2) ;
37 ret <4 x double> %res
38 }
39 define <4 x double> @test_int_x86_xop_vpermil2pd_256_mr(<4 x double> %a0, <4 x double>* %a1, <4 x double> %a2) {
40 ; CHECK-LABEL: test_int_x86_xop_vpermil2pd_256_mr:
41 ; CHECK: # BB#0:
42 ; CHECK-NEXT: vpermil2pd $2, %ymm1, (%rdi), %ymm0, %ymm0
43 ; CHECK-NEXT: retq
44 %vec = load <4 x double>, <4 x double>* %a1
45 %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %vec, <4 x double> %a2, i8 2) ;
46 ret <4 x double> %res
47 }
48 define <4 x double> @test_int_x86_xop_vpermil2pd_256_rm(<4 x double> %a0, <4 x double> %a1, <4 x double>* %a2) {
49 ; CHECK-LABEL: test_int_x86_xop_vpermil2pd_256_rm:
50 ; CHECK: # BB#0:
51 ; CHECK-NEXT: vpermil2pd $2, (%rdi), %ymm1, %ymm0, %ymm0
52 ; CHECK-NEXT: retq
53 %vec = load <4 x double>, <4 x double>* %a2
54 %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %vec, i8 2) ;
55 ret <4 x double> %res
56 }
57 declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
58
59 define <4 x float> @test_int_x86_xop_vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
60 ; CHECK-LABEL: test_int_x86_xop_vpermil2ps:
61 ; CHECK: # BB#0:
62 ; CHECK-NEXT: vpermil2ps $3, %xmm2, %xmm1, %xmm0, %xmm0
63 ; CHECK-NEXT: retq
64 %res = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 3) ;
65 ret <4 x float> %res
66 }
67 declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
68
69 define <8 x float> @test_int_x86_xop_vpermil2ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
70 ; CHECK-LABEL: test_int_x86_xop_vpermil2ps_256:
71 ; CHECK: # BB#0:
72 ; CHECK-NEXT: vpermil2ps $4, %ymm2, %ymm1, %ymm0, %ymm0
73 ; CHECK-NEXT: retq
74 %res = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 4) ;
75 ret <8 x float> %res
76 }
77 declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
78
379 define <16 x i8> @test_int_x86_xop_vpcomeqb(<16 x i8> %a0, <16 x i8> %a1) {
480 ; CHECK-LABEL: test_int_x86_xop_vpcomeqb:
581 ; CHECK: # BB#0:
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+xop | FileCheck %s
22
3 define <2 x double> @test_int_x86_xop_vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
3 define <2 x double> @test_int_x86_xop_vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x i64> %a2) {
44 ; CHECK-LABEL: test_int_x86_xop_vpermil2pd:
55 ; CHECK: # BB#0:
66 ; CHECK-NEXT: vpermil2pd $1, %xmm2, %xmm1, %xmm0, %xmm0
77 ; CHECK-NEXT: retq
8 %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 1) ; [#uses=1]
8 %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x i64> %a2, i8 1) ; [#uses=1]
99 ret <2 x double> %res
1010 }
11 define <2 x double> @test_int_x86_xop_vpermil2pd_mr(<2 x double> %a0, <2 x double>* %a1, <2 x double> %a2) {
11 define <2 x double> @test_int_x86_xop_vpermil2pd_mr(<2 x double> %a0, <2 x double>* %a1, <2 x i64> %a2) {
1212 ; CHECK-LABEL: test_int_x86_xop_vpermil2pd_mr:
1313 ; CHECK: # BB#0:
1414 ; CHECK-NEXT: vpermil2pd $1, %xmm1, (%rdi), %xmm0, %xmm0
1515 ; CHECK-NEXT: retq
1616 %vec = load <2 x double>, <2 x double>* %a1
17 %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %vec, <2 x double> %a2, i8 1) ; [#uses=1]
17 %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %vec, <2 x i64> %a2, i8 1) ; [#uses=1]
1818 ret <2 x double> %res
1919 }
20 define <2 x double> @test_int_x86_xop_vpermil2pd_rm(<2 x double> %a0, <2 x double> %a1, <2 x double>* %a2) {
20 define <2 x double> @test_int_x86_xop_vpermil2pd_rm(<2 x double> %a0, <2 x double> %a1, <2 x i64>* %a2) {
2121 ; CHECK-LABEL: test_int_x86_xop_vpermil2pd_rm:
2222 ; CHECK: # BB#0:
2323 ; CHECK-NEXT: vpermil2pd $1, (%rdi), %xmm1, %xmm0, %xmm0
2424 ; CHECK-NEXT: retq
25 %vec = load <2 x double>, <2 x double>* %a2
26 %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %vec, i8 1) ; [#uses=1]
25 %vec = load <2 x i64>, <2 x i64>* %a2
26 %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x i64> %vec, i8 1) ; [#uses=1]
2727 ret <2 x double> %res
2828 }
29 declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
30
31 define <4 x double> @test_int_x86_xop_vpermil2pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
29 declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x i64>, i8) nounwind readnone
30
31 define <4 x double> @test_int_x86_xop_vpermil2pd_256(<4 x double> %a0, <4 x double> %a1, <4 x i64> %a2) {
3232 ; CHECK-LABEL: test_int_x86_xop_vpermil2pd_256:
3333 ; CHECK: # BB#0:
3434 ; CHECK-NEXT: vpermil2pd $2, %ymm2, %ymm1, %ymm0, %ymm0
3535 ; CHECK-NEXT: retq
36 %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 2) ;
36 %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x i64> %a2, i8 2) ;
3737 ret <4 x double> %res
3838 }
39 define <4 x double> @test_int_x86_xop_vpermil2pd_256_mr(<4 x double> %a0, <4 x double>* %a1, <4 x double> %a2) {
39 define <4 x double> @test_int_x86_xop_vpermil2pd_256_mr(<4 x double> %a0, <4 x double>* %a1, <4 x i64> %a2) {
4040 ; CHECK-LABEL: test_int_x86_xop_vpermil2pd_256_mr:
4141 ; CHECK: # BB#0:
4242 ; CHECK-NEXT: vpermil2pd $2, %ymm1, (%rdi), %ymm0, %ymm0
4343 ; CHECK-NEXT: retq
4444 %vec = load <4 x double>, <4 x double>* %a1
45 %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %vec, <4 x double> %a2, i8 2) ;
45 %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %vec, <4 x i64> %a2, i8 2) ;
4646 ret <4 x double> %res
4747 }
48 define <4 x double> @test_int_x86_xop_vpermil2pd_256_rm(<4 x double> %a0, <4 x double> %a1, <4 x double>* %a2) {
48 define <4 x double> @test_int_x86_xop_vpermil2pd_256_rm(<4 x double> %a0, <4 x double> %a1, <4 x i64>* %a2) {
4949 ; CHECK-LABEL: test_int_x86_xop_vpermil2pd_256_rm:
5050 ; CHECK: # BB#0:
5151 ; CHECK-NEXT: vpermil2pd $2, (%rdi), %ymm1, %ymm0, %ymm0
5252 ; CHECK-NEXT: retq
53 %vec = load <4 x double>, <4 x double>* %a2
54 %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %vec, i8 2) ;
53 %vec = load <4 x i64>, <4 x i64>* %a2
54 %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x i64> %vec, i8 2) ;
5555 ret <4 x double> %res
5656 }
57 declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
58
59 define <4 x float> @test_int_x86_xop_vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
57 declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x i64>, i8) nounwind readnone
58
59 define <4 x float> @test_int_x86_xop_vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> %a2) {
6060 ; CHECK-LABEL: test_int_x86_xop_vpermil2ps:
6161 ; CHECK: # BB#0:
6262 ; CHECK-NEXT: vpermil2ps $3, %xmm2, %xmm1, %xmm0, %xmm0
6363 ; CHECK-NEXT: retq
64 %res = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 3) ;
64 %res = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> %a2, i8 3) ;
6565 ret <4 x float> %res
6666 }
67 declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
68
69 define <8 x float> @test_int_x86_xop_vpermil2ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
67 declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x i32>, i8) nounwind readnone
68
69 define <8 x float> @test_int_x86_xop_vpermil2ps_256(<8 x float> %a0, <8 x float> %a1, <8 x i32> %a2) {
7070 ; CHECK-LABEL: test_int_x86_xop_vpermil2ps_256:
7171 ; CHECK: # BB#0:
7272 ; CHECK-NEXT: vpermil2ps $4, %ymm2, %ymm1, %ymm0, %ymm0
7373 ; CHECK-NEXT: retq
74 %res = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 4) ;
74 %res = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x i32> %a2, i8 4) ;
7575 ret <8 x float> %res
7676 }
77 declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
77 declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x i32>, i8) nounwind readnone
7878
7979 define <2 x i64> @test_int_x86_xop_vpcmov(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
8080 ; CHECK-LABEL: test_int_x86_xop_vpcmov: