llvm.org GIT mirror llvm / a947b2a
[X86] Attempt to pre-truncate arithmetic operations if useful

In some cases it's more efficient to combine TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) ) if the binop is legal for the truncated types.

This is true for vector integer multiplication (especially vXi64), as well as ADD/AND/XOR/OR in cases where we only need to truncate one of the inputs at runtime (e.g. a duplicated input or a one-use constant we can fold).

Further work could be done here - scalar cases (especially i64) could often benefit (if we avoid partial registers etc.), other opcodes, and better analysis of when truncating the inputs reduces costs.

I have considered implementing this for all targets within the DAGCombiner but wasn't sure we could devise a suitable cost model system that would give us the range we need.

Differential Revision: https://reviews.llvm.org/D28219

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@290947 91177308-0d34-0410-b5e6-96231b3b80d8

Simon Pilgrim 3 years ago
4 changed file(s) with 517 addition(s) and 1017 deletion(s).
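For context on why the combine is legal: truncation distributes over wrapping integer ADD/MUL and the bitwise ops, so truncating the operands first produces the same low bits as truncating the result. The standalone C++ sketch below (illustrative only, not part of the patch) checks that identity for the opcodes handled by the new combine. The vXi64 MUL case is the biggest win because x86 has no packed 64-bit multiply before AVX512DQ, so the legalizer otherwise expands it into the pmuludq/psrlq/psllq/paddq sequences visible in the removed test lines further down.

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t X = 0x123456789ABCDEF0ULL;
  const uint64_t Y = 0x0FEDCBA987654321ULL;
  const uint32_t TX = static_cast<uint32_t>(X); // TRUNC( X )
  const uint32_t TY = static_cast<uint32_t>(Y); // TRUNC( Y )

  // TRUNC( BINOP( X, Y ) ) == BINOP( TRUNC( X ), TRUNC( Y ) )
  assert(static_cast<uint32_t>(X * Y) == static_cast<uint32_t>(TX * TY)); // MUL
  assert(static_cast<uint32_t>(X + Y) == static_cast<uint32_t>(TX + TY)); // ADD
  assert(static_cast<uint32_t>(X & Y) == (TX & TY));                      // AND
  assert(static_cast<uint32_t>(X | Y) == (TX | TY));                      // OR
  assert(static_cast<uint32_t>(X ^ Y) == (TX ^ TY));                      // XOR
  return 0;
}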
3183231832 return SDValue();
3183331833 }
3183431834
31835 /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
31836 /// the codegen.
31837 /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
31838 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
31839 const X86Subtarget &Subtarget,
31840 SDLoc &DL) {
31841 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
31842 SDValue Src = N->getOperand(0);
31843 unsigned Opcode = Src.getOpcode();
31844 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31845
31846 EVT VT = N->getValueType(0);
31847 EVT SrcVT = Src.getValueType();
31848
31849 auto IsRepeatedOpOrOneUseConstant = [](SDValue Op0, SDValue Op1) {
31850 // TODO: Add extra cases where we can truncate both inputs for the
31851 // cost of one (or none).
31852 // e.g. TRUNC( BINOP( EXT( X ), EXT( Y ) ) ) --> BINOP( X, Y )
31853 if (Op0 == Op1)
31854 return true;
31855
31856 SDValue BC0 = peekThroughOneUseBitcasts(Op0);
31857 SDValue BC1 = peekThroughOneUseBitcasts(Op1);
31858 return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
31859 ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
31860 };
31861
31862 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
31863 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
31864 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
31865 return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
31866 };
31867
31868 // Don't combine if the operation has other uses.
31869 if (!N->isOnlyUserOf(Src.getNode()))
31870 return SDValue();
31871
31872 // Only support vector truncation for now.
31873 // TODO: i64 scalar math would benefit as well.
31874 if (!VT.isVector())
31875 return SDValue();
31876
31877 // In most cases it's only worth pre-truncating if we're only facing the cost
31878 // of one truncation.
31879 // i.e. if one of the inputs will constant fold or the input is repeated.
31880 switch (Opcode) {
31881 case ISD::AND:
31882 case ISD::XOR:
31883 case ISD::OR: {
31884 SDValue Op0 = Src.getOperand(0);
31885 SDValue Op1 = Src.getOperand(1);
31886 if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
31887 IsRepeatedOpOrOneUseConstant(Op0, Op1))
31888 return TruncateArithmetic(Op0, Op1);
31889 break;
31890 }
31891
31892 case ISD::MUL:
31893 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
31894 // better to truncate if we have the chance.
31895 if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
31896 !TLI.isOperationLegal(Opcode, SrcVT))
31897 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
31898 LLVM_FALLTHROUGH;
31899 case ISD::ADD: {
31900 SDValue Op0 = Src.getOperand(0);
31901 SDValue Op1 = Src.getOperand(1);
31902 if (TLI.isOperationLegal(Opcode, VT) &&
31903 IsRepeatedOpOrOneUseConstant(Op0, Op1))
31904 return TruncateArithmetic(Op0, Op1);
31905 break;
31906 }
31907 }
31908
31909 return SDValue();
31910 }
31911
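To make the cost heuristic in the new function concrete: for ADD/AND/XOR/OR the combine only fires when at most one truncate has to be paid for at runtime - either the operand is repeated, so a single truncate feeds both sides of the binop, or one operand is a build-vector of constants whose truncation folds away. A scalar C++ analogue (hedged, illustrative only; not taken from the patch):

#include <cstdint>

// Repeated operand: a single runtime truncate replaces the post-op truncate.
uint32_t add_repeated(uint64_t X) {
  uint32_t T = static_cast<uint32_t>(X); // the one TRUNC we pay for
  return T + T;                          // same bits as uint32_t(X + X)
}

// Constant operand: the constant's truncation happens at compile time.
uint32_t and_constant(uint64_t X) {
  return static_cast<uint32_t>(X) & 0xFFFFu; // same bits as uint32_t(X & 0xFFFFull)
}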
3183531912 /// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
3183631913 static SDValue
3183731914 combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
3201732094 EVT VT = N->getValueType(0);
3201832095 SDValue Src = N->getOperand(0);
3201932096 SDLoc DL(N);
32097
32098 // Attempt to pre-truncate inputs to arithmetic ops instead.
32099 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
32100 return V;
3202032101
3202132102 // Try to detect AVG pattern first.
3202232103 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
2121 define void @any_extend_load_v8i32(<8 x i8> * %ptr) {
2222 ; KNL-LABEL: any_extend_load_v8i32:
2323 ; KNL: # BB#0:
24 ; KNL-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
25 ; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
26 ; KNL-NEXT: vpaddd %ymm1, %ymm0, %ymm0
27 ; KNL-NEXT: vpmovdw %zmm0, %ymm0
24 ; KNL-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
25 ; KNL-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
2826 ; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
2927 ; KNL-NEXT: vmovq %xmm0, (%rdi)
3028 ; KNL-NEXT: retq
7070 define <4 x float> @mask_sitofp_4i64_4f32(<4 x i64> %a) nounwind {
7171 ; X32-SSE-LABEL: mask_sitofp_4i64_4f32:
7272 ; X32-SSE: # BB#0:
73 ; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm1
73 ; X32-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
7474 ; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm0
75 ; X32-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
7675 ; X32-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
7776 ; X32-SSE-NEXT: retl
7877 ;
7978 ; X32-AVX-LABEL: mask_sitofp_4i64_4f32:
8079 ; X32-AVX: # BB#0:
81 ; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
8280 ; X32-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
8381 ; X32-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
82 ; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %xmm0, %xmm0
8483 ; X32-AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
8584 ; X32-AVX-NEXT: vzeroupper
8685 ; X32-AVX-NEXT: retl
8786 ;
8887 ; X64-SSE-LABEL: mask_sitofp_4i64_4f32:
8988 ; X64-SSE: # BB#0:
90 ; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm1
89 ; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
9190 ; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm0
92 ; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
9391 ; X64-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
9492 ; X64-SSE-NEXT: retq
9593 ;
9694 ; X64-AVX-LABEL: mask_sitofp_4i64_4f32:
9795 ; X64-AVX: # BB#0:
98 ; X64-AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
9996 ; X64-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
10097 ; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
98 ; X64-AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
10199 ; X64-AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
102100 ; X64-AVX-NEXT: vzeroupper
103101 ; X64-AVX-NEXT: retq
109107 define <4 x float> @mask_uitofp_4i64_4f32(<4 x i64> %a) nounwind {
110108 ; X32-SSE-LABEL: mask_uitofp_4i64_4f32:
111109 ; X32-SSE: # BB#0:
112 ; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm1
110 ; X32-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
113111 ; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm0
114 ; X32-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
115112 ; X32-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
116113 ; X32-SSE-NEXT: retl
117114 ;
118115 ; X32-AVX-LABEL: mask_uitofp_4i64_4f32:
119116 ; X32-AVX: # BB#0:
120 ; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
121117 ; X32-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
122118 ; X32-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
119 ; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %xmm0, %xmm0
123120 ; X32-AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
124121 ; X32-AVX-NEXT: vzeroupper
125122 ; X32-AVX-NEXT: retl
126123 ;
127124 ; X64-SSE-LABEL: mask_uitofp_4i64_4f32:
128125 ; X64-SSE: # BB#0:
129 ; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm1
126 ; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
130127 ; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm0
131 ; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
132128 ; X64-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
133129 ; X64-SSE-NEXT: retq
134130 ;
135131 ; X64-AVX-LABEL: mask_uitofp_4i64_4f32:
136132 ; X64-AVX: # BB#0:
137 ; X64-AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
138133 ; X64-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
139134 ; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
135 ; X64-AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
140136 ; X64-AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
141137 ; X64-AVX-NEXT: vzeroupper
142138 ; X64-AVX-NEXT: retq
418418 define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
419419 ; SSE-LABEL: trunc_add_const_v4i64_v4i32:
420420 ; SSE: # BB#0:
421 ; SSE-NEXT: movl $1, %eax
422 ; SSE-NEXT: movd %rax, %xmm2
423 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
424 ; SSE-NEXT: paddq %xmm2, %xmm0
425 ; SSE-NEXT: paddq {{.*}}(%rip), %xmm1
426421 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
422 ; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
427423 ; SSE-NEXT: retq
428424 ;
429425 ; AVX1-LABEL: trunc_add_const_v4i64_v4i32:
430426 ; AVX1: # BB#0:
431 ; AVX1-NEXT: movl $1, %eax
432 ; AVX1-NEXT: vmovq %rax, %xmm1
433 ; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
434 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1
435 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
436 ; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
437 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[0,2]
427 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
428 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
429 ; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
438430 ; AVX1-NEXT: vzeroupper
439431 ; AVX1-NEXT: retq
440432 ;
441433 ; AVX2-LABEL: trunc_add_const_v4i64_v4i32:
442434 ; AVX2: # BB#0:
443 ; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
444435 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
445436 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
446 ; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
437 ; AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
447438 ; AVX2-NEXT: vzeroupper
448439 ; AVX2-NEXT: retq
449440 ;
450441 ; AVX512-LABEL: trunc_add_const_v4i64_v4i32:
451442 ; AVX512: # BB#0:
452 ; AVX512-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
443 ; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0
453444 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
454 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0
445 ; AVX512-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
455446 ; AVX512-NEXT: retq
456447 %1 = add <4 x i64> %a0,
457448 %2 = trunc <4 x i64> %1 to <4 x i32>
461452 define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
462453 ; SSE-LABEL: trunc_add_const_v8i64_v8i16:
463454 ; SSE: # BB#0:
464 ; SSE-NEXT: movl $1, %eax
465 ; SSE-NEXT: movd %rax, %xmm4
466 ; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
467 ; SSE-NEXT: paddq %xmm0, %xmm4
468 ; SSE-NEXT: paddq {{.*}}(%rip), %xmm1
469 ; SSE-NEXT: paddq {{.*}}(%rip), %xmm2
470 ; SSE-NEXT: paddq {{.*}}(%rip), %xmm3
471 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
472 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7]
473 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
474 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
475 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
476 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
477 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
455 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
456 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
457 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
458 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
459 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
478460 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
479461 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
480 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
481 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
462 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
463 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
464 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
465 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
466 ; SSE-NEXT: paddw {{.*}}(%rip), %xmm2
467 ; SSE-NEXT: movdqa %xmm2, %xmm0
482468 ; SSE-NEXT: retq
483469 ;
484470 ; AVX1-LABEL: trunc_add_const_v8i64_v8i16:
485471 ; AVX1: # BB#0:
486 ; AVX1-NEXT: movl $1, %eax
487 ; AVX1-NEXT: vmovq %rax, %xmm2
488 ; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
489 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm2
490 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
491 ; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
492 ; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm3
493 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
494 ; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1
495 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
496 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
497 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
498 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
499 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
500 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
501 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
472 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
473 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
474 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
475 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
476 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
477 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
478 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
479 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
480 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
502481 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
482 ; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
503483 ; AVX1-NEXT: vzeroupper
504484 ; AVX1-NEXT: retq
505485 ;
506486 ; AVX2-LABEL: trunc_add_const_v8i64_v8i16:
507487 ; AVX2: # BB#0:
508 ; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm1, %ymm1
509 ; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
510488 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
511489 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
512490 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
514492 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
515493 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
516494 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
517 ; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
495 ; AVX2-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
518496 ; AVX2-NEXT: vzeroupper
519497 ; AVX2-NEXT: retq
520498 ;
521499 ; AVX512-LABEL: trunc_add_const_v8i64_v8i16:
522500 ; AVX512: # BB#0:
523 ; AVX512-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
524501 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
502 ; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
525503 ; AVX512-NEXT: retq
526504 %1 = add <8 x i64> %a0,
527505 %2 = trunc <8 x i64> %1 to <8 x i16>
531509 define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
532510 ; SSE-LABEL: trunc_add_const_v8i32_v8i16:
533511 ; SSE: # BB#0:
534 ; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
535 ; SSE-NEXT: paddd {{.*}}(%rip), %xmm1
536512 ; SSE-NEXT: pslld $16, %xmm1
537513 ; SSE-NEXT: psrad $16, %xmm1
538514 ; SSE-NEXT: pslld $16, %xmm0
539515 ; SSE-NEXT: psrad $16, %xmm0
540516 ; SSE-NEXT: packssdw %xmm1, %xmm0
517 ; SSE-NEXT: paddw {{.*}}(%rip), %xmm0
541518 ; SSE-NEXT: retq
542519 ;
543520 ; AVX1-LABEL: trunc_add_const_v8i32_v8i16:
544521 ; AVX1: # BB#0:
545 ; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm1
546 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
547 ; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
522 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
548523 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
524 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
549525 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
550 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
551 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
526 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
527 ; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
552528 ; AVX1-NEXT: vzeroupper
553529 ; AVX1-NEXT: retq
554530 ;
555531 ; AVX2-LABEL: trunc_add_const_v8i32_v8i16:
556532 ; AVX2: # BB#0:
557 ; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
558533 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
559534 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
560 ; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
535 ; AVX2-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
561536 ; AVX2-NEXT: vzeroupper
562537 ; AVX2-NEXT: retq
563538 ;
564539 ; AVX512-LABEL: trunc_add_const_v8i32_v8i16:
565540 ; AVX512: # BB#0:
566 ; AVX512-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
541 ; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0
567542 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
568 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0
543 ; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
569544 ; AVX512-NEXT: retq
570545 %1 = add <8 x i32> %a0,
571546 %2 = trunc <8 x i32> %1 to <8 x i16>
575550 define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
576551 ; SSE-LABEL: trunc_add_const_v16i64_v16i8:
577552 ; SSE: # BB#0:
578 ; SSE-NEXT: movl $1, %eax
579 ; SSE-NEXT: movd %rax, %xmm8
580 ; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
581 ; SSE-NEXT: paddq %xmm8, %xmm0
582 ; SSE-NEXT: paddq {{.*}}(%rip), %xmm1
583 ; SSE-NEXT: paddq {{.*}}(%rip), %xmm2
584 ; SSE-NEXT: paddq {{.*}}(%rip), %xmm3
585 ; SSE-NEXT: paddq {{.*}}(%rip), %xmm4
586 ; SSE-NEXT: paddq {{.*}}(%rip), %xmm5
587 ; SSE-NEXT: paddq {{.*}}(%rip), %xmm6
588 ; SSE-NEXT: paddq {{.*}}(%rip), %xmm7
589553 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
590554 ; SSE-NEXT: pand %xmm8, %xmm7
591555 ; SSE-NEXT: pand %xmm8, %xmm6
602566 ; SSE-NEXT: packuswb %xmm1, %xmm0
603567 ; SSE-NEXT: packuswb %xmm2, %xmm0
604568 ; SSE-NEXT: packuswb %xmm4, %xmm0
569 ; SSE-NEXT: paddb {{.*}}(%rip), %xmm0
605570 ; SSE-NEXT: retq
606571 ;
607572 ; AVX1-LABEL: trunc_add_const_v16i64_v16i8:
608573 ; AVX1: # BB#0:
609 ; AVX1-NEXT: movl $1, %eax
610 ; AVX1-NEXT: vmovq %rax, %xmm4
611 ; AVX1-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
612 ; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm8
613 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
614 ; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
615 ; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm5
616 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
617 ; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1
618 ; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm2, %xmm6
619 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
620 ; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm2, %xmm2
621 ; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm3, %xmm7
622 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
623 ; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm3, %xmm3
624 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
625 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
626 ; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7
627 ; AVX1-NEXT: vpackuswb %xmm3, %xmm7, %xmm3
628 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
629 ; AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6
630 ; AVX1-NEXT: vpackuswb %xmm2, %xmm6, %xmm2
574 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
575 ; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
576 ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
577 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
578 ; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
579 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
580 ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
581 ; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2
582 ; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
631583 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
632 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
633 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3
634 ; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
635 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
636 ; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3
637 ; AVX1-NEXT: vpackuswb %xmm0, %xmm3, %xmm0
584 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
585 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
586 ; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1
587 ; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
588 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
589 ; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
590 ; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0
591 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
638592 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
639593 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
594 ; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
640595 ; AVX1-NEXT: vzeroupper
641596 ; AVX1-NEXT: retq
642597 ;
643598 ; AVX2-LABEL: trunc_add_const_v16i64_v16i8:
644599 ; AVX2: # BB#0:
645 ; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm1, %ymm1
646 ; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
647 ; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm3, %ymm3
648 ; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm2, %ymm2
649600 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
650601 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
651602 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
665616 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
666617 ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
667618 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
619 ; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
668620 ; AVX2-NEXT: vzeroupper
669621 ; AVX2-NEXT: retq
670622 ;
671623 ; AVX512F-LABEL: trunc_add_const_v16i64_v16i8:
672624 ; AVX512F: # BB#0:
673 ; AVX512F-NEXT: vpaddq {{.*}}(%rip), %zmm1, %zmm1
674 ; AVX512F-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
675625 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
676626 ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
677627 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
678628 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
629 ; AVX512F-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
679630 ; AVX512F-NEXT: retq
680631 ;
681632 ; AVX512BW-LABEL: trunc_add_const_v16i64_v16i8:
682633 ; AVX512BW: # BB#0:
683 ; AVX512BW-NEXT: vpaddq {{.*}}(%rip), %zmm1, %zmm1
684 ; AVX512BW-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
685634 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
686635 ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
687636 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
688637 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
638 ; AVX512BW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
689639 ; AVX512BW-NEXT: retq
690640 ;
691641 ; AVX512DQ-LABEL: trunc_add_const_v16i64_v16i8:
692642 ; AVX512DQ: # BB#0:
693 ; AVX512DQ-NEXT: vpaddq {{.*}}(%rip), %zmm1, %zmm1
694 ; AVX512DQ-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
695643 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
696644 ; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
697645 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
698646 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
647 ; AVX512DQ-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
699648 ; AVX512DQ-NEXT: retq
700649 %1 = add <16 x i64> %a0,
701650 %2 = trunc <16 x i64> %1 to <16 x i8>
705654 define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
706655 ; SSE-LABEL: trunc_add_const_v16i32_v16i8:
707656 ; SSE: # BB#0:
708 ; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
709 ; SSE-NEXT: paddd {{.*}}(%rip), %xmm1
710 ; SSE-NEXT: paddd {{.*}}(%rip), %xmm2
711 ; SSE-NEXT: paddd {{.*}}(%rip), %xmm3
712657 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
713658 ; SSE-NEXT: pand %xmm4, %xmm3
714659 ; SSE-NEXT: pand %xmm4, %xmm2
717662 ; SSE-NEXT: pand %xmm4, %xmm0
718663 ; SSE-NEXT: packuswb %xmm1, %xmm0
719664 ; SSE-NEXT: packuswb %xmm2, %xmm0
665 ; SSE-NEXT: paddb {{.*}}(%rip), %xmm0
720666 ; SSE-NEXT: retq
721667 ;
722668 ; AVX1-LABEL: trunc_add_const_v16i32_v16i8:
723669 ; AVX1: # BB#0:
724 ; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm2
725 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
726 ; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
727 ; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm3
728 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
729 ; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
730 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
731 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
732 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
733 ; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
734 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
735 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
736 ; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
670 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
671 ; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
672 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
673 ; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1
674 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
675 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
676 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
677 ; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
678 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
737679 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
680 ; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
738681 ; AVX1-NEXT: vzeroupper
739682 ; AVX1-NEXT: retq
740683 ;
741684 ; AVX2-LABEL: trunc_add_const_v16i32_v16i8:
742685 ; AVX2: # BB#0:
743 ; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
744 ; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm1, %ymm1
745686 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
746687 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
747688 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
751692 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
752693 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
753694 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
695 ; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
754696 ; AVX2-NEXT: vzeroupper
755697 ; AVX2-NEXT: retq
756698 ;
757699 ; AVX512-LABEL: trunc_add_const_v16i32_v16i8:
758700 ; AVX512: # BB#0:
759 ; AVX512-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0
760701 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
702 ; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
761703 ; AVX512-NEXT: retq
762704 %1 = add <16 x i32> %a0,
763705 %2 = trunc <16 x i32> %1 to <16 x i8>
767709 define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
768710 ; SSE-LABEL: trunc_add_const_v16i16_v16i8:
769711 ; SSE: # BB#0:
770 ; SSE-NEXT: paddw {{.*}}(%rip), %xmm0
771 ; SSE-NEXT: paddw {{.*}}(%rip), %xmm1
772712 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
773713 ; SSE-NEXT: pand %xmm2, %xmm1
774714 ; SSE-NEXT: pand %xmm2, %xmm0
775715 ; SSE-NEXT: packuswb %xmm1, %xmm0
716 ; SSE-NEXT: paddb {{.*}}(%rip), %xmm0
776717 ; SSE-NEXT: retq
777718 ;
778719 ; AVX1-LABEL: trunc_add_const_v16i16_v16i8:
779720 ; AVX1: # BB#0:
780 ; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm1
781 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
782 ; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
721 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
783722 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
723 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
784724 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
785 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
786 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
725 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
726 ; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
787727 ; AVX1-NEXT: vzeroupper
788728 ; AVX1-NEXT: retq
789729 ;
790730 ; AVX2-LABEL: trunc_add_const_v16i16_v16i8:
791731 ; AVX2: # BB#0:
792 ; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
793732 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
794733 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
795734 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
796735 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
797736 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
737 ; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
798738 ; AVX2-NEXT: vzeroupper
799739 ; AVX2-NEXT: retq
800740 ;
801741 ; AVX512F-LABEL: trunc_add_const_v16i16_v16i8:
802742 ; AVX512F: # BB#0:
803 ; AVX512F-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
804743 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
805744 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
745 ; AVX512F-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
806746 ; AVX512F-NEXT: retq
807747 ;
808748 ; AVX512BW-LABEL: trunc_add_const_v16i16_v16i8:
809749 ; AVX512BW: # BB#0:
810 ; AVX512BW-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
750 ; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0
811751 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
812 ; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0
752 ; AVX512BW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
813753 ; AVX512BW-NEXT: retq
814754 ;
815755 ; AVX512DQ-LABEL: trunc_add_const_v16i16_v16i8:
816756 ; AVX512DQ: # BB#0:
817 ; AVX512DQ-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
818757 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
819758 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
759 ; AVX512DQ-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
820760 ; AVX512DQ-NEXT: retq
821761 %1 = add <16 x i16> %a0,
822762 %2 = trunc <16 x i16> %1 to <16 x i8>
16751615 ; AVX1-LABEL: trunc_mul_v4i64_v4i32:
16761616 ; AVX1: # BB#0:
16771617 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1678 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1679 ; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm4
1680 ; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm4
1681 ; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm5
1682 ; AVX1-NEXT: vpmuludq %xmm5, %xmm3, %xmm5
1683 ; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4
1684 ; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
1685 ; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
1686 ; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
1687 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3
1688 ; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm3
1689 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
1690 ; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm4
1691 ; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3
1692 ; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
1693 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
1694 ; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0
1618 ; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
1619 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
16951620 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
1621 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
16961622 ; AVX1-NEXT: vzeroupper
16971623 ; AVX1-NEXT: retq
16981624 ;
16991625 ; AVX2-LABEL: trunc_mul_v4i64_v4i32:
17001626 ; AVX2: # BB#0:
1701 ; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2
1702 ; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
1703 ; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
1704 ; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
1705 ; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
1706 ; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
1707 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
1708 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
1627 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
1628 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
17091629 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
17101630 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1711 ; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
1631 ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
17121632 ; AVX2-NEXT: vzeroupper
17131633 ; AVX2-NEXT: retq
17141634 ;
17151635 ; AVX512F-LABEL: trunc_mul_v4i64_v4i32:
17161636 ; AVX512F: # BB#0:
1717 ; AVX512F-NEXT: vpsrlq $32, %ymm0, %ymm2
1718 ; AVX512F-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
1719 ; AVX512F-NEXT: vpsrlq $32, %ymm1, %ymm3
1720 ; AVX512F-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
1721 ; AVX512F-NEXT: vpaddq %ymm2, %ymm3, %ymm2
1722 ; AVX512F-NEXT: vpsllq $32, %ymm2, %ymm2
1723 ; AVX512F-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
1724 ; AVX512F-NEXT: vpaddq %ymm2, %ymm0, %ymm0
1637 ; AVX512F-NEXT: # kill: %YMM1 %YMM1 %ZMM1
1638 ; AVX512F-NEXT: # kill: %YMM0 %YMM0 %ZMM0
1639 ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
17251640 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
1726 ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %YMM0
1641 ; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0
17271642 ; AVX512F-NEXT: retq
17281643 ;
17291644 ; AVX512BW-LABEL: trunc_mul_v4i64_v4i32:
17301645 ; AVX512BW: # BB#0:
1731 ; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm2
1732 ; AVX512BW-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
1733 ; AVX512BW-NEXT: vpsrlq $32, %ymm1, %ymm3
1734 ; AVX512BW-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
1735 ; AVX512BW-NEXT: vpaddq %ymm2, %ymm3, %ymm2
1736 ; AVX512BW-NEXT: vpsllq $32, %ymm2, %ymm2
1737 ; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
1738 ; AVX512BW-NEXT: vpaddq %ymm2, %ymm0, %ymm0
1646 ; AVX512BW-NEXT: # kill: %YMM1 %YMM1 %ZMM1
1647 ; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0
1648 ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
17391649 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
1740 ; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0
1650 ; AVX512BW-NEXT: vpmulld %xmm1, %xmm0, %xmm0
17411651 ; AVX512BW-NEXT: retq
17421652 ;
17431653 ; AVX512DQ-LABEL: trunc_mul_v4i64_v4i32:
17561666 define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
17571667 ; SSE-LABEL: trunc_mul_v8i64_v8i16:
17581668 ; SSE: # BB#0:
1759 ; SSE-NEXT: movdqa %xmm0, %xmm8
1760 ; SSE-NEXT: psrlq $32, %xmm8
1761 ; SSE-NEXT: pmuludq %xmm4, %xmm8
1762 ; SSE-NEXT: movdqa %xmm4, %xmm9
1763 ; SSE-NEXT: psrlq $32, %xmm9
1764 ; SSE-NEXT: pmuludq %xmm0, %xmm9
1765 ; SSE-NEXT: paddq %xmm8, %xmm9
1766 ; SSE-NEXT: psllq $32, %xmm9
1767 ; SSE-NEXT: pmuludq %xmm4, %xmm0
1768 ; SSE-NEXT: paddq %xmm9, %xmm0
1769 ; SSE-NEXT: movdqa %xmm1, %xmm8
1770 ; SSE-NEXT: psrlq $32, %xmm8
1771 ; SSE-NEXT: pmuludq %xmm5, %xmm8
1772 ; SSE-NEXT: movdqa %xmm5, %xmm4
1773 ; SSE-NEXT: psrlq $32, %xmm4
1774 ; SSE-NEXT: pmuludq %xmm1, %xmm4
1775 ; SSE-NEXT: paddq %xmm8, %xmm4
1776 ; SSE-NEXT: psllq $32, %xmm4
1777 ; SSE-NEXT: pmuludq %xmm5, %xmm1
1778 ; SSE-NEXT: paddq %xmm4, %xmm1
1779 ; SSE-NEXT: movdqa %xmm2, %xmm4
1780 ; SSE-NEXT: psrlq $32, %xmm4
1781 ; SSE-NEXT: pmuludq %xmm6, %xmm4
1782 ; SSE-NEXT: movdqa %xmm6, %xmm5
1783 ; SSE-NEXT: psrlq $32, %xmm5
1784 ; SSE-NEXT: pmuludq %xmm2, %xmm5
1785 ; SSE-NEXT: paddq %xmm4, %xmm5
1786 ; SSE-NEXT: psllq $32, %xmm5
1787 ; SSE-NEXT: pmuludq %xmm6, %xmm2
1788 ; SSE-NEXT: paddq %xmm5, %xmm2
1789 ; SSE-NEXT: movdqa %xmm3, %xmm4
1790 ; SSE-NEXT: psrlq $32, %xmm4
1791 ; SSE-NEXT: pmuludq %xmm7, %xmm4
1792 ; SSE-NEXT: movdqa %xmm7, %xmm5
1793 ; SSE-NEXT: psrlq $32, %xmm5
1794 ; SSE-NEXT: pmuludq %xmm3, %xmm5
1795 ; SSE-NEXT: paddq %xmm4, %xmm5
1796 ; SSE-NEXT: psllq $32, %xmm5
1797 ; SSE-NEXT: pmuludq %xmm7, %xmm3
1798 ; SSE-NEXT: paddq %xmm5, %xmm3
1669 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
1670 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,2,4,5,6,7]
1671 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
1672 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,2,4,5,6,7]
1673 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
1674 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
1675 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
1676 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
1677 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
1678 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
1679 ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1]
17991680 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
18001681 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
18011682 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
18071688 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
18081689 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
18091690 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
1810 ; SSE-NEXT: movapd %xmm2, %xmm0
1691 ; SSE-NEXT: pmullw %xmm6, %xmm2
1692 ; SSE-NEXT: movdqa %xmm2, %xmm0
18111693 ; SSE-NEXT: retq
18121694 ;
18131695 ; AVX1-LABEL: trunc_mul_v8i64_v8i16:
18141696 ; AVX1: # BB#0:
1815 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm4
1816 ; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm4
1817 ; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm5
1818 ; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm5
1819 ; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4
1820 ; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
1821 ; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm5
1822 ; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4
1823 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
1824 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1825 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm5
1826 ; AVX1-NEXT: vpmuludq %xmm2, %xmm5, %xmm5
1827 ; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm6
1828 ; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm6
1829 ; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5
1830 ; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5
1831 ; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
1832 ; AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0
1833 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm2
1834 ; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
1835 ; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5
1836 ; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm5
1837 ; AVX1-NEXT: vpaddq %xmm2, %xmm5, %xmm2
1838 ; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
1839 ; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm5
1840 ; AVX1-NEXT: vpaddq %xmm2, %xmm5, %xmm2
1841 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
1842 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1843 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm5
1844 ; AVX1-NEXT: vpmuludq %xmm3, %xmm5, %xmm5
1845 ; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm6
1846 ; AVX1-NEXT: vpmuludq %xmm6, %xmm1, %xmm6
1847 ; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5
1848 ; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5
1849 ; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
1850 ; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1
1851 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
1852 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
1853 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
1854 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
1855 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
1856 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
1857 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
1697 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
1698 ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
1699 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6,7]
1700 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7]
1701 ; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
1702 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
1703 ; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6,7]
1704 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1,2,3],xmm2[4],xmm5[5,6,7]
1705 ; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
1706 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1707 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
1708 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7]
1709 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1,2,3],xmm1[4],xmm5[5,6,7]
1710 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
1711 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
1712 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7]
1713 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1,2,3],xmm0[4],xmm5[5,6,7]
1714 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
18581715 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
1716 ; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
18591717 ; AVX1-NEXT: vzeroupper
18601718 ; AVX1-NEXT: retq
18611719 ;
18621720 ; AVX2-LABEL: trunc_mul_v8i64_v8i16:
18631721 ; AVX2: # BB#0:
1864 ; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm4
1865 ; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm4
1866 ; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm5
1867 ; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm5
1868 ; AVX2-NEXT: vpaddq %ymm4, %ymm5, %ymm4
1869 ; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4
1870 ; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1
1871 ; AVX2-NEXT: vpaddq %ymm4, %ymm1, %ymm1
1872 ; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3
1873 ; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm3
1874 ; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm4
1875 ; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm4
1876 ; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3
1877 ; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3
1878 ; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0
1879 ; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0
1722 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
1723 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
1724 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
1725 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
1726 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
1727 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
1728 ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
1729 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
18801730 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
18811731 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
18821732 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
18831733 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
18841734 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1885 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
1886 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1887 ; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
1735 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
1736 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
1737 ; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0
18881738 ; AVX2-NEXT: vzeroupper
18891739 ; AVX2-NEXT: retq
18901740 ;
18911741 ; AVX512F-LABEL: trunc_mul_v8i64_v8i16:
18921742 ; AVX512F: # BB#0:
1893 ; AVX512F-NEXT: vpsrlq $32, %zmm0, %zmm2
1894 ; AVX512F-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
1895 ; AVX512F-NEXT: vpsrlq $32, %zmm1, %zmm3
1896 ; AVX512F-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
1897 ; AVX512F-NEXT: vpaddq %zmm2, %zmm3, %zmm2
1898 ; AVX512F-NEXT: vpsllq $32, %zmm2, %zmm2
1899 ; AVX512F-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
1900 ; AVX512F-NEXT: vpaddq %zmm2, %zmm0, %zmm0
1743 ; AVX512F-NEXT: vpmovqw %zmm1, %xmm1
19011744 ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
1745 ; AVX512F-NEXT: vpmullw %xmm1, %xmm0, %xmm0
19021746 ; AVX512F-NEXT: retq
19031747 ;
19041748 ; AVX512BW-LABEL: trunc_mul_v8i64_v8i16:
19051749 ; AVX512BW: # BB#0:
1906 ; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
1907 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
1908 ; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
1909 ; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
1910 ; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
1911 ; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
1912 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
1913 ; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
1750 ; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1
19141751 ; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
1752 ; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
19151753 ; AVX512BW-NEXT: retq
19161754 ;
19171755 ; AVX512DQ-LABEL: trunc_mul_v8i64_v8i16:
21852023 ;
21862024 ; AVX2-LABEL: trunc_mul_v16i64_v16i8:
21872025 ; AVX2: # BB#0:
2188 ; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm8
2189 ; AVX2-NEXT: vpmuludq %ymm5, %ymm8, %ymm8
2190 ; AVX2-NEXT: vpsrlq $32, %ymm5, %ymm9
2191 ; AVX2-NEXT: vpmuludq %ymm9, %ymm1, %ymm9
2192 ; AVX2-NEXT: vpaddq %ymm8, %ymm9, %ymm8
2193 ; AVX2-NEXT: vpsllq $32, %ymm8, %ymm8
2194 ; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm1
2195 ; AVX2-NEXT: vpaddq %ymm8, %ymm1, %ymm1
2196 ; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm5
2197 ; AVX2-NEXT: vpmuludq %ymm4, %ymm5, %ymm5
2198 ; AVX2-NEXT: vpsrlq $32, %ymm4, %ymm8
2199 ; AVX2-NEXT: vpmuludq %ymm8, %ymm0, %ymm8
2200 ; AVX2-NEXT: vpaddq %ymm5, %ymm8, %ymm5
2201 ; AVX2-NEXT: vpsllq $32, %ymm5, %ymm5
2202 ; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm0
2203 ; AVX2-NEXT: vpaddq %ymm5, %ymm0, %ymm0
2204 ; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm4
2205 ; AVX2-NEXT: vpmuludq %ymm7, %ymm4, %ymm4
2206 ; AVX2-NEXT: vpsrlq $32, %ymm7, %ymm5
2207 ; AVX2-NEXT: vpmuludq %ymm5, %ymm3, %ymm5
2208 ; AVX2-NEXT: vpaddq %ymm4, %ymm5, %ymm4
2209 ; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4
2210 ; AVX2-NEXT: vpmuludq %ymm7, %ymm3, %ymm3
2211 ; AVX2-NEXT: vpaddq %ymm4, %ymm3, %ymm3
2212 ; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm4
2213 ; AVX2-NEXT: vpmuludq %ymm6, %ymm4, %ymm4
2214 ; AVX2-NEXT: vpsrlq $32, %ymm6, %ymm5
2215 ; AVX2-NEXT: vpmuludq %ymm5, %ymm2, %ymm5
2216 ; AVX2-NEXT: vpaddq %ymm4, %ymm5, %ymm4
2217 ; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4
2218 ; AVX2-NEXT: vpmuludq %ymm6, %ymm2, %ymm2
2219 ; AVX2-NEXT: vpaddq %ymm4, %ymm2, %ymm2
2026 ; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7]
2027 ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3]
2028 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
2029 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
2030 ; AVX2-NEXT: vpmulld %xmm7, %xmm3, %xmm3
2031 ; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7]
2032 ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
22202033 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
22212034 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
2222 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
2223 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
2035 ; AVX2-NEXT: vpmulld %xmm6, %xmm2, %xmm2
22242036 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
22252037 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
22262038 ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
22272039 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
2228 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
2229 ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
2230 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
2231 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2040 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
2041 ; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2
2042 ; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7]
2043 ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
22322044 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
22332045 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
2046 ; AVX2-NEXT: vpmulld %xmm5, %xmm1, %xmm1
2047 ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
2048 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
2049 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
2050 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2051 ; AVX2-NEXT: vpmulld %xmm4, %xmm0, %xmm0
22342052 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
22352053 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
22362054 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2237 ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
2055 ; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm0
22382056 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
22392057 ; AVX2-NEXT: vzeroupper
22402058 ; AVX2-NEXT: retq
22412059 ;
22422060 ; AVX512F-LABEL: trunc_mul_v16i64_v16i8:
22432061 ; AVX512F: # BB#0:
2244 ; AVX512F-NEXT: vpsrlq $32, %zmm1, %zmm4
2245 ; AVX512F-NEXT: vpmuludq %zmm3, %zmm4, %zmm4
2246 ; AVX512F-NEXT: vpsrlq $32, %zmm3, %zmm5
2247 ; AVX512F-NEXT: vpmuludq %zmm5, %zmm1, %zmm5
2248 ; AVX512F-NEXT: vpaddq %zmm4, %zmm5, %zmm4
2249 ; AVX512F-NEXT: vpsllq $32, %zmm4, %zmm4
2250 ; AVX512F-NEXT: vpmuludq %zmm3, %zmm1, %zmm1
2251 ; AVX512F-NEXT: vpaddq %zmm4, %zmm1, %zmm1
2252 ; AVX512F-NEXT: vpsrlq $32, %zmm0, %zmm3
2253 ; AVX512F-NEXT: vpmuludq %zmm2, %zmm3, %zmm3
2254 ; AVX512F-NEXT: vpsrlq $32, %zmm2, %zmm4
2255 ; AVX512F-NEXT: vpmuludq %zmm4, %zmm0, %zmm4
2256 ; AVX512F-NEXT: vpaddq %zmm3, %zmm4, %zmm3
2257 ; AVX512F-NEXT: vpsllq $32, %zmm3, %zmm3
2258 ; AVX512F-NEXT: vpmuludq %zmm2, %zmm0, %zmm0
2259 ; AVX512F-NEXT: vpaddq %zmm3, %zmm0, %zmm0
2062 ; AVX512F-NEXT: vpmovqd %zmm3, %ymm3
2063 ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
2064 ; AVX512F-NEXT: vpmulld %ymm3, %ymm1, %ymm1
2065 ; AVX512F-NEXT: vpmovqd %zmm2, %ymm2
22602066 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
2261 ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
2067 ; AVX512F-NEXT: vpmulld %ymm2, %ymm0, %ymm0
22622068 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
22632069 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
22642070 ; AVX512F-NEXT: retq
22652071 ;
22662072 ; AVX512BW-LABEL: trunc_mul_v16i64_v16i8:
22672073 ; AVX512BW: # BB#0:
2268 ; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm4
2269 ; AVX512BW-NEXT: vpmuludq %zmm3, %zmm4, %zmm4
2270 ; AVX512BW-NEXT: vpsrlq $32, %zmm3, %zmm5
2271 ; AVX512BW-NEXT: vpmuludq %zmm5, %zmm1, %zmm5
2272 ; AVX512BW-NEXT: vpaddq %zmm4, %zmm5, %zmm4
2273 ; AVX512BW-NEXT: vpsllq $32, %zmm4, %zmm4
2274 ; AVX512BW-NEXT: vpmuludq %zmm3, %zmm1, %zmm1
2275 ; AVX512BW-NEXT: vpaddq %zmm4, %zmm1, %zmm1
2276 ; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm3
2277 ; AVX512BW-NEXT: vpmuludq %zmm2, %zmm3, %zmm3
2278 ; AVX512BW-NEXT: vpsrlq $32, %zmm2, %zmm4
2279 ; AVX512BW-NEXT: vpmuludq %zmm4, %zmm0, %zmm4
2280 ; AVX512BW-NEXT: vpaddq %zmm3, %zmm4, %zmm3
2281 ; AVX512BW-NEXT: vpsllq $32, %zmm3, %zmm3
2282 ; AVX512BW-NEXT: vpmuludq %zmm2, %zmm0, %zmm0
2283 ; AVX512BW-NEXT: vpaddq %zmm3, %zmm0, %zmm0
2074 ; AVX512BW-NEXT: vpmovqd %zmm3, %ymm3
2075 ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
2076 ; AVX512BW-NEXT: vpmulld %ymm3, %ymm1, %ymm1
2077 ; AVX512BW-NEXT: vpmovqd %zmm2, %ymm2
22842078 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
2285 ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
2079 ; AVX512BW-NEXT: vpmulld %ymm2, %ymm0, %ymm0
22862080 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
22872081 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
22882082 ; AVX512BW-NEXT: retq
24782272 ; AVX1-LABEL: trunc_mul_const_v4i64_v4i32:
24792273 ; AVX1: # BB#0:
24802274 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2481 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3]
2482 ; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm3
2483 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
2484 ; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
2485 ; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
2486 ; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1
2487 ; AVX1-NEXT: movl $1, %eax
2488 ; AVX1-NEXT: vmovq %rax, %xmm2
2489 ; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
2490 ; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm3
2491 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
2492 ; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
2493 ; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
2494 ; AVX1-NEXT: vpaddq %xmm0, %xmm3, %xmm0
24952275 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2276 ; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
24962277 ; AVX1-NEXT: vzeroupper
24972278 ; AVX1-NEXT: retq
24982279 ;
24992280 ; AVX2-LABEL: trunc_mul_const_v4i64_v4i32:
25002281 ; AVX2: # BB#0:
2501 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3]
2502 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
2503 ; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
2504 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
2505 ; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
2506 ; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
25072282 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
25082283 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2509 ; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
2510 ; AVX2-NEXT: vzeroupper
2511 ; AVX2-NEXT: retq
2512 ;
2513 ; AVX512F-LABEL: trunc_mul_const_v4i64_v4i32:
2514 ; AVX512F: # BB#0:
2515 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3]
2516 ; AVX512F-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
2517 ; AVX512F-NEXT: vpsrlq $32, %ymm0, %ymm0
2518 ; AVX512F-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
2519 ; AVX512F-NEXT: vpsllq $32, %ymm0, %ymm0
2520 ; AVX512F-NEXT: vpaddq %ymm0, %ymm2, %ymm0
2521 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
2522 ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %YMM0
2523 ; AVX512F-NEXT: retq
2524 ;
2525 ; AVX512BW-LABEL: trunc_mul_const_v4i64_v4i32:
2526 ; AVX512BW: # BB#0:
2527 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3]
2528 ; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
2529 ; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm0
2530 ; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
2531 ; AVX512BW-NEXT: vpsllq $32, %ymm0, %ymm0
2532 ; AVX512BW-NEXT: vpaddq %ymm0, %ymm2, %ymm0
2533 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
2534 ; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0
2535 ; AVX512BW-NEXT: retq
2536 ;
2537 ; AVX512DQ-LABEL: trunc_mul_const_v4i64_v4i32:
2538 ; AVX512DQ: # BB#0:
2539 ; AVX512DQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0
2540 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3]
2541 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
2542 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
2543 ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0
2544 ; AVX512DQ-NEXT: retq
2284 ; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
2285 ; AVX2-NEXT: vzeroupper
2286 ; AVX2-NEXT: retq
2287 ;
2288 ; AVX512-LABEL: trunc_mul_const_v4i64_v4i32:
2289 ; AVX512: # BB#0:
2290 ; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0
2291 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
2292 ; AVX512-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
2293 ; AVX512-NEXT: retq
25452294 %1 = mul <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
25462295 %2 = trunc <4 x i64> %1 to <4 x i32>
25472296 ret <4 x i32> %2
25502299 define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
25512300 ; SSE-LABEL: trunc_mul_const_v8i64_v8i16:
25522301 ; SSE: # BB#0:
2553 ; SSE-NEXT: movl $1, %eax
2554 ; SSE-NEXT: movd %rax, %xmm4
2555 ; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
2556 ; SSE-NEXT: movdqa %xmm0, %xmm5
2557 ; SSE-NEXT: pmuludq %xmm4, %xmm5
2558 ; SSE-NEXT: psrlq $32, %xmm0
2559 ; SSE-NEXT: pmuludq %xmm4, %xmm0
2560 ; SSE-NEXT: psllq $32, %xmm0
2561 ; SSE-NEXT: paddq %xmm5, %xmm0
2562 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [2,3]
2563 ; SSE-NEXT: movdqa %xmm1, %xmm5
2564 ; SSE-NEXT: pmuludq %xmm4, %xmm5
2565 ; SSE-NEXT: psrlq $32, %xmm1
2566 ; SSE-NEXT: pmuludq %xmm4, %xmm1
2567 ; SSE-NEXT: psllq $32, %xmm1
2568 ; SSE-NEXT: paddq %xmm5, %xmm1
2569 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [4,5]
2570 ; SSE-NEXT: movdqa %xmm2, %xmm5
2571 ; SSE-NEXT: pmuludq %xmm4, %xmm5
2572 ; SSE-NEXT: psrlq $32, %xmm2
2573 ; SSE-NEXT: pmuludq %xmm4, %xmm2
2574 ; SSE-NEXT: psllq $32, %xmm2
2575 ; SSE-NEXT: paddq %xmm5, %xmm2
2576 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [6,7]
2577 ; SSE-NEXT: movdqa %xmm3, %xmm5
2578 ; SSE-NEXT: pmuludq %xmm4, %xmm5
2579 ; SSE-NEXT: psrlq $32, %xmm3
2580 ; SSE-NEXT: pmuludq %xmm4, %xmm3
2581 ; SSE-NEXT: psllq $32, %xmm3
2582 ; SSE-NEXT: paddq %xmm5, %xmm3
25832302 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
25842303 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
25852304 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
25912310 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
25922311 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
25932312 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
2594 ; SSE-NEXT: movapd %xmm2, %xmm0
2313 ; SSE-NEXT: pmullw {{.*}}(%rip), %xmm2
2314 ; SSE-NEXT: movdqa %xmm2, %xmm0
25952315 ; SSE-NEXT: retq
25962316 ;
25972317 ; AVX1-LABEL: trunc_mul_const_v8i64_v8i16:
25982318 ; AVX1: # BB#0:
2599 ; AVX1-NEXT: movl $1, %eax
2600 ; AVX1-NEXT: vmovq %rax, %xmm2
2601 ; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
2602 ; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm3
2603 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm4
2604 ; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
2605 ; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
2606 ; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
2607 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2608 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3]
2609 ; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm4
2610 ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
2611 ; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm0
2612 ; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
2613 ; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
2614 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5]
2615 ; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm4
2616 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm5
2617 ; AVX1-NEXT: vpmuludq %xmm3, %xmm5, %xmm3
2618 ; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
2619 ; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3
2620 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2621 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [6,7]
2622 ; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm5
2623 ; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
2624 ; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm1
2625 ; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
2626 ; AVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1
2627 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
2628 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
2629 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
2630 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
2631 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
2632 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
2633 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
2319 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2320 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
2321 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
2322 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
2323 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
2324 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
2325 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
2326 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
2327 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
26342328 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
2329 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
26352330 ; AVX1-NEXT: vzeroupper
26362331 ; AVX1-NEXT: retq
26372332 ;
26382333 ; AVX2-LABEL: trunc_mul_const_v8i64_v8i16:
26392334 ; AVX2: # BB#0:
2640 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,6,7]
2641 ; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm3
2642 ; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
2643 ; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1
2644 ; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1
2645 ; AVX2-NEXT: vpaddq %ymm1, %ymm3, %ymm1
2646 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3]
2647 ; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm3
2648 ; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
2649 ; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0
2650 ; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
2651 ; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0
26522335 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
26532336 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
26542337 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
26562339 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
26572340 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
26582341 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2659 ; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
2660 ; AVX2-NEXT: vzeroupper
2661 ; AVX2-NEXT: retq
2662 ;
2663 ; AVX512F-LABEL: trunc_mul_const_v8i64_v8i16:
2664 ; AVX512F: # BB#0:
2665 ; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7]
2666 ; AVX512F-NEXT: vpmuludq %zmm1, %zmm0, %zmm2
2667 ; AVX512F-NEXT: vpsrlq $32, %zmm0, %zmm0
2668 ; AVX512F-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
2669 ; AVX512F-NEXT: vpsllq $32, %zmm0, %zmm0
2670 ; AVX512F-NEXT: vpaddq %zmm0, %zmm2, %zmm0
2671 ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
2672 ; AVX512F-NEXT: retq
2673 ;
2674 ; AVX512BW-LABEL: trunc_mul_const_v8i64_v8i16:
2675 ; AVX512BW: # BB#0:
2676 ; AVX512BW-NEXT: vmovdqa32 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7]
2677 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm2
2678 ; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm0
2679 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
2680 ; AVX512BW-NEXT: vpsllq $32, %zmm0, %zmm0
2681 ; AVX512BW-NEXT: vpaddq %zmm0, %zmm2, %zmm0
2682 ; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
2683 ; AVX512BW-NEXT: retq
2684 ;
2685 ; AVX512DQ-LABEL: trunc_mul_const_v8i64_v8i16:
2686 ; AVX512DQ: # BB#0:
2687 ; AVX512DQ-NEXT: vpmullq {{.*}}(%rip), %zmm0, %zmm0
2688 ; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0
2689 ; AVX512DQ-NEXT: retq
2342 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
2343 ; AVX2-NEXT: vzeroupper
2344 ; AVX2-NEXT: retq
2345 ;
2346 ; AVX512-LABEL: trunc_mul_const_v8i64_v8i16:
2347 ; AVX512: # BB#0:
2348 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
2349 ; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
2350 ; AVX512-NEXT: retq
26902351 %1 = mul <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
26912352 %2 = trunc <8 x i64> %1 to <8 x i16>
26922353 ret <8 x i16> %2
26952356 define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
26962357 ; SSE-LABEL: trunc_mul_const_v8i32_v8i16:
26972358 ; SSE: # BB#0:
2698 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,1,2,3]
2699 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
2700 ; SSE-NEXT: pmuludq %xmm2, %xmm0
2701 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2702 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
2703 ; SSE-NEXT: pmuludq %xmm3, %xmm2
2704 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2705 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2706 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,5,6,7]
2707 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
2708 ; SSE-NEXT: pmuludq %xmm2, %xmm1
2709 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2710 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
2711 ; SSE-NEXT: pmuludq %xmm3, %xmm2
2712 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2713 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
27142359 ; SSE-NEXT: pslld $16, %xmm1
27152360 ; SSE-NEXT: psrad $16, %xmm1
27162361 ; SSE-NEXT: pslld $16, %xmm0
27172362 ; SSE-NEXT: psrad $16, %xmm0
27182363 ; SSE-NEXT: packssdw %xmm1, %xmm0
2364 ; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
27192365 ; SSE-NEXT: retq
27202366 ;
27212367 ; AVX1-LABEL: trunc_mul_const_v8i32_v8i16:
27222368 ; AVX1: # BB#0:
2723 ; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1
2724 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2725 ; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
2369 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
27262370 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
2371 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
27272372 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2728 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
2729 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2373 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2374 ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
27302375 ; AVX1-NEXT: vzeroupper
27312376 ; AVX1-NEXT: retq
27322377 ;
27332378 ; AVX2-LABEL: trunc_mul_const_v8i32_v8i16:
27342379 ; AVX2: # BB#0:
2735 ; AVX2-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
27362380 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
27372381 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2738 ; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
2382 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
27392383 ; AVX2-NEXT: vzeroupper
27402384 ; AVX2-NEXT: retq
27412385 ;
27422386 ; AVX512-LABEL: trunc_mul_const_v8i32_v8i16:
27432387 ; AVX512: # BB#0:
2744 ; AVX512-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
2388 ; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0
27452389 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
2746 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0
2390 ; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
27472391 ; AVX512-NEXT: retq
27482392 %1 = mul <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
27492393 %2 = trunc <8 x i32> %1 to <8 x i16>
29062550 ;
29072551 ; AVX2-LABEL: trunc_mul_const_v16i64_v16i8:
29082552 ; AVX2: # BB#0:
2909 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,6,7]
2910 ; AVX2-NEXT: vpmuludq %ymm4, %ymm1, %ymm5
2911 ; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
2912 ; AVX2-NEXT: vpmuludq %ymm4, %ymm1, %ymm1
2913 ; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1
2914 ; AVX2-NEXT: vpaddq %ymm1, %ymm5, %ymm1
2915 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3]
2916 ; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm5
2917 ; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
2918 ; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm0
2919 ; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
2920 ; AVX2-NEXT: vpaddq %ymm0, %ymm5, %ymm0
2921 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [12,13,14,15]
2922 ; AVX2-NEXT: vpmuludq %ymm4, %ymm3, %ymm5
2923 ; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm3
2924 ; AVX2-NEXT: vpmuludq %ymm4, %ymm3, %ymm3
2925 ; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3
2926 ; AVX2-NEXT: vpaddq %ymm3, %ymm5, %ymm3
2927 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,10,11]
2928 ; AVX2-NEXT: vpmuludq %ymm4, %ymm2, %ymm5
2929 ; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm2
2930 ; AVX2-NEXT: vpmuludq %ymm4, %ymm2, %ymm2
2931 ; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
2932 ; AVX2-NEXT: vpaddq %ymm2, %ymm5, %ymm2
29332553 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
29342554 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
2555 ; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2
29352556 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
29362557 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
2558 ; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3
29372559 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
29382560 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
29392561 ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
29422564 ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
29432565 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
29442566 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2567 ; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
29452568 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
29462569 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
2570 ; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
29472571 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
29482572 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
29492573 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
29542578 ;
29552579 ; AVX512F-LABEL: trunc_mul_const_v16i64_v16i8:
29562580 ; AVX512F: # BB#0:
2957 ; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm2 = [8,9,10,11,12,13,14,15]
2958 ; AVX512F-NEXT: vpmuludq %zmm2, %zmm1, %zmm3
2959 ; AVX512F-NEXT: vpsrlq $32, %zmm1, %zmm1
2960 ; AVX512F-NEXT: vpmuludq %zmm2, %zmm1, %zmm1
2961 ; AVX512F-NEXT: vpsllq $32, %zmm1, %zmm1
2962 ; AVX512F-NEXT: vpaddq %zmm1, %zmm3, %zmm1
2963 ; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7]
2964 ; AVX512F-NEXT: vpmuludq %zmm2, %zmm0, %zmm3
2965 ; AVX512F-NEXT: vpsrlq $32, %zmm0, %zmm0
2966 ; AVX512F-NEXT: vpmuludq %zmm2, %zmm0, %zmm0
2967 ; AVX512F-NEXT: vpsllq $32, %zmm0, %zmm0
2968 ; AVX512F-NEXT: vpaddq %zmm0, %zmm3, %zmm0
29692581 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
2582 ; AVX512F-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
29702583 ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
2584 ; AVX512F-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1
29712585 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
29722586 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
29732587 ; AVX512F-NEXT: retq
29742588 ;
29752589 ; AVX512BW-LABEL: trunc_mul_const_v16i64_v16i8:
29762590 ; AVX512BW: # BB#0:
2977 ; AVX512BW-NEXT: vmovdqa32 {{.*#+}} zmm2 = [8,9,10,11,12,13,14,15]
2978 ; AVX512BW-NEXT: vpmuludq %zmm2, %zmm1, %zmm3
2979 ; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm1
2980 ; AVX512BW-NEXT: vpmuludq %zmm2, %zmm1, %zmm1
2981 ; AVX512BW-NEXT: vpsllq $32, %zmm1, %zmm1
2982 ; AVX512BW-NEXT: vpaddq %zmm1, %zmm3, %zmm1
2983 ; AVX512BW-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7]
2984 ; AVX512BW-NEXT: vpmuludq %zmm2, %zmm0, %zmm3
2985 ; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm0
2986 ; AVX512BW-NEXT: vpmuludq %zmm2, %zmm0, %zmm0
2987 ; AVX512BW-NEXT: vpsllq $32, %zmm0, %zmm0
2988 ; AVX512BW-NEXT: vpaddq %zmm0, %zmm3, %zmm0
29892591 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
2592 ; AVX512BW-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
29902593 ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
2594 ; AVX512BW-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1
29912595 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
29922596 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
29932597 ; AVX512BW-NEXT: retq
29942598 ;
29952599 ; AVX512DQ-LABEL: trunc_mul_const_v16i64_v16i8:
29962600 ; AVX512DQ: # BB#0:
2997 ; AVX512DQ-NEXT: vpmullq {{.*}}(%rip), %zmm1, %zmm1
2998 ; AVX512DQ-NEXT: vpmullq {{.*}}(%rip), %zmm0, %zmm0
29992601 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
2602 ; AVX512DQ-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
30002603 ; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
2604 ; AVX512DQ-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1
30012605 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
30022606 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
30032607 ; AVX512DQ-NEXT: retq
30722676 ;
30732677 ; AVX2-LABEL: trunc_mul_const_v16i32_v16i8:
30742678 ; AVX2: # BB#0:
3075 ; AVX2-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
3076 ; AVX2-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1
30772679 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
30782680 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
30792681 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
2682 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1
30802683 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
30812684 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
30822685 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
30832686 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
2687 ; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
30842688 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
30852689 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
30862690 ; AVX2-NEXT: vzeroupper
35463150 define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
35473151 ; SSE-LABEL: trunc_and_const_v4i64_v4i32:
35483152 ; SSE: # BB#0:
3549 ; SSE-NEXT: movl $1, %eax
3550 ; SSE-NEXT: movd %rax, %xmm2
3551 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
3552 ; SSE-NEXT: pand %xmm2, %xmm0
3553 ; SSE-NEXT: andps {{.*}}(%rip), %xmm1
35543153 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3154 ; SSE-NEXT: andps {{.*}}(%rip), %xmm0
35553155 ; SSE-NEXT: retq
35563156 ;
35573157 ; AVX1-LABEL: trunc_and_const_v4i64_v4i32:
35583158 ; AVX1: # BB#0:
3559 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
35603159 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
35613160 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3161 ; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
35623162 ; AVX1-NEXT: vzeroupper
35633163 ; AVX1-NEXT: retq
35643164 ;
35653165 ; AVX2-LABEL: trunc_and_const_v4i64_v4i32:
35663166 ; AVX2: # BB#0:
3567 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
35683167 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
35693168 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3570 ; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
3169 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
35713170 ; AVX2-NEXT: vzeroupper
35723171 ; AVX2-NEXT: retq
35733172 ;
35743173 ; AVX512-LABEL: trunc_and_const_v4i64_v4i32:
35753174 ; AVX512: # BB#0:
3576 ; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
3175 ; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0
35773176 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
3578 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0
3177 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
35793178 ; AVX512-NEXT: retq
35803179 %1 = and <4 x i64> %a0,
35813180 %2 = trunc <4 x i64> %1 to <4 x i32>
35853184 define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
35863185 ; SSE-LABEL: trunc_and_const_v8i64_v8i16:
35873186 ; SSE: # BB#0:
3588 ; SSE-NEXT: movl $1, %eax
3589 ; SSE-NEXT: movd %rax, %xmm4
3590 ; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
3591 ; SSE-NEXT: pand %xmm0, %xmm4
3592 ; SSE-NEXT: pand {{.*}}(%rip), %xmm1
3593 ; SSE-NEXT: pand {{.*}}(%rip), %xmm2
3594 ; SSE-NEXT: pand {{.*}}(%rip), %xmm3
3595 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
3596 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7]
3597 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
3598 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
3599 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
3600 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
3601 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
3187 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
3188 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
3189 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
3190 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
3191 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
36023192 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
36033193 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
3604 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
3605 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
3194 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3195 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3196 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3197 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
3198 ; SSE-NEXT: andpd {{.*}}(%rip), %xmm2
3199 ; SSE-NEXT: movapd %xmm2, %xmm0
36063200 ; SSE-NEXT: retq
36073201 ;
36083202 ; AVX1-LABEL: trunc_and_const_v8i64_v8i16:
36093203 ; AVX1: # BB#0:
3610 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
3611 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
36123204 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
36133205 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
36143206 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
36193211 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
36203212 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
36213213 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3214 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
36223215 ; AVX1-NEXT: vzeroupper
36233216 ; AVX1-NEXT: retq
36243217 ;
36253218 ; AVX2-LABEL: trunc_and_const_v8i64_v8i16:
36263219 ; AVX2: # BB#0:
3627 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
3628 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
36293220 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
36303221 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
36313222 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
36333224 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
36343225 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
36353226 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3636 ; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
3227 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
36373228 ; AVX2-NEXT: vzeroupper
36383229 ; AVX2-NEXT: retq
36393230 ;
36403231 ; AVX512-LABEL: trunc_and_const_v8i64_v8i16:
36413232 ; AVX512: # BB#0:
3642 ; AVX512-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
36433233 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
3234 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
36443235 ; AVX512-NEXT: retq
36453236 %1 = and <8 x i64> %a0,
36463237 %2 = trunc <8 x i64> %1 to <8 x i16>
36503241 define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
36513242 ; SSE-LABEL: trunc_and_const_v8i32_v8i16:
36523243 ; SSE: # BB#0:
3653 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0
3654 ; SSE-NEXT: pand {{.*}}(%rip), %xmm1
36553244 ; SSE-NEXT: pslld $16, %xmm1
36563245 ; SSE-NEXT: psrad $16, %xmm1
36573246 ; SSE-NEXT: pslld $16, %xmm0
36583247 ; SSE-NEXT: psrad $16, %xmm0
36593248 ; SSE-NEXT: packssdw %xmm1, %xmm0
3249 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0
36603250 ; SSE-NEXT: retq
36613251 ;
36623252 ; AVX1-LABEL: trunc_and_const_v8i32_v8i16:
36633253 ; AVX1: # BB#0:
3664 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
36653254 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
36663255 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
36673256 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
36683257 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
36693258 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3259 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
36703260 ; AVX1-NEXT: vzeroupper
36713261 ; AVX1-NEXT: retq
36723262 ;
36733263 ; AVX2-LABEL: trunc_and_const_v8i32_v8i16:
36743264 ; AVX2: # BB#0:
3675 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
36763265 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
36773266 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
3678 ; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
3267 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
36793268 ; AVX2-NEXT: vzeroupper
36803269 ; AVX2-NEXT: retq
36813270 ;
36823271 ; AVX512-LABEL: trunc_and_const_v8i32_v8i16:
36833272 ; AVX512: # BB#0:
3684 ; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
3273 ; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0
36853274 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
3686 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0
3275 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
36873276 ; AVX512-NEXT: retq
36883277 %1 = and <8 x i32> %a0,
36893278 %2 = trunc <8 x i32> %1 to <8 x i16>
36933282 define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
36943283 ; SSE-LABEL: trunc_and_const_v16i64_v16i8:
36953284 ; SSE: # BB#0:
3696 ; SSE-NEXT: movl $1, %eax
3697 ; SSE-NEXT: movd %rax, %xmm8
3698 ; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
3699 ; SSE-NEXT: pand {{.*}}(%rip), %xmm1
3700 ; SSE-NEXT: pand {{.*}}(%rip), %xmm2
3701 ; SSE-NEXT: pand {{.*}}(%rip), %xmm3
3702 ; SSE-NEXT: pand {{.*}}(%rip), %xmm4
3703 ; SSE-NEXT: pand {{.*}}(%rip), %xmm5
3704 ; SSE-NEXT: pand {{.*}}(%rip), %xmm6
3705 ; SSE-NEXT: pand {{.*}}(%rip), %xmm7
3706 ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3707 ; SSE-NEXT: pand %xmm9, %xmm7
3708 ; SSE-NEXT: pand %xmm9, %xmm6
3285 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
3286 ; SSE-NEXT: pand %xmm8, %xmm7
3287 ; SSE-NEXT: pand %xmm8, %xmm6
37093288 ; SSE-NEXT: packuswb %xmm7, %xmm6
3710 ; SSE-NEXT: pand %xmm9, %xmm5
3711 ; SSE-NEXT: pand %xmm9, %xmm4
3289 ; SSE-NEXT: pand %xmm8, %xmm5
3290 ; SSE-NEXT: pand %xmm8, %xmm4
37123291 ; SSE-NEXT: packuswb %xmm5, %xmm4
37133292 ; SSE-NEXT: packuswb %xmm6, %xmm4
3714 ; SSE-NEXT: pand %xmm9, %xmm3
3715 ; SSE-NEXT: pand %xmm9, %xmm2
3293 ; SSE-NEXT: pand %xmm8, %xmm3
3294 ; SSE-NEXT: pand %xmm8, %xmm2
37163295 ; SSE-NEXT: packuswb %xmm3, %xmm2
3717 ; SSE-NEXT: pand %xmm9, %xmm1
3718 ; SSE-NEXT: pand %xmm9, %xmm8
3296 ; SSE-NEXT: pand %xmm8, %xmm1
37193297 ; SSE-NEXT: pand %xmm8, %xmm0
37203298 ; SSE-NEXT: packuswb %xmm1, %xmm0
37213299 ; SSE-NEXT: packuswb %xmm2, %xmm0
37223300 ; SSE-NEXT: packuswb %xmm4, %xmm0
3301 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0
37233302 ; SSE-NEXT: retq
37243303 ;
37253304 ; AVX1-LABEL: trunc_and_const_v16i64_v16i8:
37263305 ; AVX1: # BB#0:
3727 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
3728 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
3729 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2
3730 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm3, %ymm3
37313306 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
37323307 ; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
37333308 ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
37483323 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
37493324 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
37503325 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
3326 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
37513327 ; AVX1-NEXT: vzeroupper
37523328 ; AVX1-NEXT: retq
37533329 ;
37543330 ; AVX2-LABEL: trunc_and_const_v16i64_v16i8:
37553331 ; AVX2: # BB#0:
3756 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
3757 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
3758 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
3759 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
37603332 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
37613333 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
37623334 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
37763348 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
37773349 ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
37783350 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
3351 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
37793352 ; AVX2-NEXT: vzeroupper
37803353 ; AVX2-NEXT: retq
37813354 ;
37823355 ; AVX512F-LABEL: trunc_and_const_v16i64_v16i8:
37833356 ; AVX512F: # BB#0:
3784 ; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
3785 ; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
37863357 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
37873358 ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
37883359 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
37893360 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
3361 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
37903362 ; AVX512F-NEXT: retq
37913363 ;
37923364 ; AVX512BW-LABEL: trunc_and_const_v16i64_v16i8:
37933365 ; AVX512BW: # BB#0:
3794 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
3795 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
37963366 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
37973367 ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
37983368 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
37993369 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
3370 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
38003371 ; AVX512BW-NEXT: retq
38013372 ;
38023373 ; AVX512DQ-LABEL: trunc_and_const_v16i64_v16i8:
38033374 ; AVX512DQ: # BB#0:
3804 ; AVX512DQ-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
3805 ; AVX512DQ-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
38063375 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
38073376 ; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
38083377 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
38093378 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
3379 ; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
38103380 ; AVX512DQ-NEXT: retq
38113381 %1 = and <16 x i64> %a0,
38123382 %2 = trunc <16 x i64> %1 to <16 x i8>
38163386 define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
38173387 ; SSE-LABEL: trunc_and_const_v16i32_v16i8:
38183388 ; SSE: # BB#0:
3819 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0
3820 ; SSE-NEXT: pand {{.*}}(%rip), %xmm1
3821 ; SSE-NEXT: pand {{.*}}(%rip), %xmm2
3822 ; SSE-NEXT: pand {{.*}}(%rip), %xmm3
38233389 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
38243390 ; SSE-NEXT: pand %xmm4, %xmm3
38253391 ; SSE-NEXT: pand %xmm4, %xmm2
38283394 ; SSE-NEXT: pand %xmm4, %xmm0
38293395 ; SSE-NEXT: packuswb %xmm1, %xmm0
38303396 ; SSE-NEXT: packuswb %xmm2, %xmm0
3397 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0
38313398 ; SSE-NEXT: retq
38323399 ;
38333400 ; AVX1-LABEL: trunc_and_const_v16i32_v16i8:
38343401 ; AVX1: # BB#0:
3835 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
3836 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
38373402 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
38383403 ; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
38393404 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
38443409 ; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
38453410 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
38463411 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
3412 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
38473413 ; AVX1-NEXT: vzeroupper
38483414 ; AVX1-NEXT: retq
38493415 ;
38503416 ; AVX2-LABEL: trunc_and_const_v16i32_v16i8:
38513417 ; AVX2: # BB#0:
3852 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
3853 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
38543418 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
38553419 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
38563420 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
38603424 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
38613425 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
38623426 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3427 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
38633428 ; AVX2-NEXT: vzeroupper
38643429 ; AVX2-NEXT: retq
38653430 ;
38663431 ; AVX512-LABEL: trunc_and_const_v16i32_v16i8:
38673432 ; AVX512: # BB#0:
3868 ; AVX512-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm0
38693433 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
3434 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
38703435 ; AVX512-NEXT: retq
38713436 %1 = and <16 x i32> %a0,
38723437 %2 = trunc <16 x i32> %1 to <16 x i8>
38763441 define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
38773442 ; SSE-LABEL: trunc_and_const_v16i16_v16i8:
38783443 ; SSE: # BB#0:
3879 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0
3880 ; SSE-NEXT: pand {{.*}}(%rip), %xmm1
38813444 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
38823445 ; SSE-NEXT: pand %xmm2, %xmm1
38833446 ; SSE-NEXT: pand %xmm2, %xmm0
38843447 ; SSE-NEXT: packuswb %xmm1, %xmm0
3448 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0
38853449 ; SSE-NEXT: retq
38863450 ;
38873451 ; AVX1-LABEL: trunc_and_const_v16i16_v16i8:
38883452 ; AVX1: # BB#0:
3889 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
38903453 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
38913454 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
38923455 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
38933456 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
38943457 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3458 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
38953459 ; AVX1-NEXT: vzeroupper
38963460 ; AVX1-NEXT: retq
38973461 ;
38983462 ; AVX2-LABEL: trunc_and_const_v16i16_v16i8:
38993463 ; AVX2: # BB#0:
3900 ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
39013464 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
39023465 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
39033466 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
39043467 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
39053468 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3469 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
39063470 ; AVX2-NEXT: vzeroupper
39073471 ; AVX2-NEXT: retq
39083472 ;
39093473 ; AVX512F-LABEL: trunc_and_const_v16i16_v16i8:
39103474 ; AVX512F: # BB#0:
3911 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
39123475 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
39133476 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
3477 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
39143478 ; AVX512F-NEXT: retq
39153479 ;
39163480 ; AVX512BW-LABEL: trunc_and_const_v16i16_v16i8:
39173481 ; AVX512BW: # BB#0:
3918 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
3482 ; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0
39193483 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
3920 ; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0
3484 ; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
39213485 ; AVX512BW-NEXT: retq
39223486 ;
39233487 ; AVX512DQ-LABEL: trunc_and_const_v16i16_v16i8:
39243488 ; AVX512DQ: # BB#0:
3925 ; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
39263489 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
39273490 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
3491 ; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
39283492 ; AVX512DQ-NEXT: retq
39293493 %1 = and <16 x i16> %a0,
39303494 %2 = trunc <16 x i16> %1 to <16 x i8>
43223886 define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
43233887 ; SSE-LABEL: trunc_xor_const_v4i64_v4i32:
43243888 ; SSE: # BB#0:
4325 ; SSE-NEXT: movl $1, %eax
4326 ; SSE-NEXT: movd %rax, %xmm2
4327 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
4328 ; SSE-NEXT: pxor %xmm2, %xmm0
4329 ; SSE-NEXT: xorps {{.*}}(%rip), %xmm1
43303889 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3890 ; SSE-NEXT: xorps {{.*}}(%rip), %xmm0
43313891 ; SSE-NEXT: retq
43323892 ;
43333893 ; AVX1-LABEL: trunc_xor_const_v4i64_v4i32:
43343894 ; AVX1: # BB#0:
4335 ; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
43363895 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
43373896 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
3897 ; AVX1-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
43383898 ; AVX1-NEXT: vzeroupper
43393899 ; AVX1-NEXT: retq
43403900 ;
43413901 ; AVX2-LABEL: trunc_xor_const_v4i64_v4i32:
43423902 ; AVX2: # BB#0:
4343 ; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
43443903 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
43453904 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4346 ; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
3905 ; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
43473906 ; AVX2-NEXT: vzeroupper
43483907 ; AVX2-NEXT: retq
43493908 ;
43503909 ; AVX512-LABEL: trunc_xor_const_v4i64_v4i32:
43513910 ; AVX512: # BB#0:
4352 ; AVX512-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
3911 ; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0
43533912 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
4354 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0
3913 ; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
43553914 ; AVX512-NEXT: retq
43563915 %1 = xor <4 x i64> %a0,
43573916 %2 = trunc <4 x i64> %1 to <4 x i32>
43613920 define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
43623921 ; SSE-LABEL: trunc_xor_const_v8i64_v8i16:
43633922 ; SSE: # BB#0:
4364 ; SSE-NEXT: movl $1, %eax
4365 ; SSE-NEXT: movd %rax, %xmm4
4366 ; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
4367 ; SSE-NEXT: pxor %xmm0, %xmm4
4368 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm1
4369 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm2
4370 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm3
4371 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
4372 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7]
4373 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
4374 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
4375 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
4376 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
4377 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
3923 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
3924 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
3925 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
3926 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
3927 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
43783928 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
43793929 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
4380 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4381 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
3930 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3931 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3932 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3933 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
3934 ; SSE-NEXT: xorpd {{.*}}(%rip), %xmm2
3935 ; SSE-NEXT: movapd %xmm2, %xmm0
43823936 ; SSE-NEXT: retq
43833937 ;
43843938 ; AVX1-LABEL: trunc_xor_const_v8i64_v8i16:
43853939 ; AVX1: # BB#0:
4386 ; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
4387 ; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm1, %ymm1
43883940 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
43893941 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
43903942 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
43953947 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
43963948 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
43973949 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
3950 ; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
43983951 ; AVX1-NEXT: vzeroupper
43993952 ; AVX1-NEXT: retq
44003953 ;
44013954 ; AVX2-LABEL: trunc_xor_const_v8i64_v8i16:
44023955 ; AVX2: # BB#0:
4403 ; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm1, %ymm1
4404 ; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
44053956 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
44063957 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
44073958 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
44093960 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
44103961 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
44113962 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4412 ; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
3963 ; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
44133964 ; AVX2-NEXT: vzeroupper
44143965 ; AVX2-NEXT: retq
44153966 ;
44163967 ; AVX512-LABEL: trunc_xor_const_v8i64_v8i16:
44173968 ; AVX512: # BB#0:
4418 ; AVX512-NEXT: vpxorq {{.*}}(%rip), %zmm0, %zmm0
44193969 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
3970 ; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
44203971 ; AVX512-NEXT: retq
44213972 %1 = xor <8 x i64> %a0,
44223973 %2 = trunc <8 x i64> %1 to <8 x i16>
44263977 define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
44273978 ; SSE-LABEL: trunc_xor_const_v8i32_v8i16:
44283979 ; SSE: # BB#0:
4429 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
4430 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm1
44313980 ; SSE-NEXT: pslld $16, %xmm1
44323981 ; SSE-NEXT: psrad $16, %xmm1
44333982 ; SSE-NEXT: pslld $16, %xmm0
44343983 ; SSE-NEXT: psrad $16, %xmm0
44353984 ; SSE-NEXT: packssdw %xmm1, %xmm0
3985 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
44363986 ; SSE-NEXT: retq
44373987 ;
44383988 ; AVX1-LABEL: trunc_xor_const_v8i32_v8i16:
44393989 ; AVX1: # BB#0:
4440 ; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
44413990 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
44423991 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
44433992 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
44443993 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
44453994 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3995 ; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
44463996 ; AVX1-NEXT: vzeroupper
44473997 ; AVX1-NEXT: retq
44483998 ;
44493999 ; AVX2-LABEL: trunc_xor_const_v8i32_v8i16:
44504000 ; AVX2: # BB#0:
4451 ; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
44524001 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
44534002 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
4454 ; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
4003 ; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
44554004 ; AVX2-NEXT: vzeroupper
44564005 ; AVX2-NEXT: retq
44574006 ;
44584007 ; AVX512-LABEL: trunc_xor_const_v8i32_v8i16:
44594008 ; AVX512: # BB#0:
4460 ; AVX512-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
4009 ; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0
44614010 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
4462 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0
4011 ; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
44634012 ; AVX512-NEXT: retq
44644013 %1 = xor <8 x i32> %a0,
44654014 %2 = trunc <8 x i32> %1 to <8 x i16>
44694018 define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
44704019 ; SSE-LABEL: trunc_xor_const_v16i64_v16i8:
44714020 ; SSE: # BB#0:
4472 ; SSE-NEXT: movl $1, %eax
4473 ; SSE-NEXT: movd %rax, %xmm8
4474 ; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
4475 ; SSE-NEXT: pxor %xmm8, %xmm0
4476 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm1
4477 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm2
4478 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm3
4479 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm4
4480 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm5
4481 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm6
4482 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm7
44834021 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
44844022 ; SSE-NEXT: pand %xmm8, %xmm7
44854023 ; SSE-NEXT: pand %xmm8, %xmm6
44964034 ; SSE-NEXT: packuswb %xmm1, %xmm0
44974035 ; SSE-NEXT: packuswb %xmm2, %xmm0
44984036 ; SSE-NEXT: packuswb %xmm4, %xmm0
4037 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
44994038 ; SSE-NEXT: retq
45004039 ;
45014040 ; AVX1-LABEL: trunc_xor_const_v16i64_v16i8:
45024041 ; AVX1: # BB#0:
4503 ; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
4504 ; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm1, %ymm1
4505 ; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm2, %ymm2
4506 ; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm3, %ymm3
45074042 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
45084043 ; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
45094044 ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
45244059 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
45254060 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
45264061 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
4062 ; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
45274063 ; AVX1-NEXT: vzeroupper
45284064 ; AVX1-NEXT: retq
45294065 ;
45304066 ; AVX2-LABEL: trunc_xor_const_v16i64_v16i8:
45314067 ; AVX2: # BB#0:
4532 ; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm1, %ymm1
4533 ; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
4534 ; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm3, %ymm3
4535 ; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm2, %ymm2
45364068 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
45374069 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
45384070 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
45524084 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
45534085 ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
45544086 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
4087 ; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
45554088 ; AVX2-NEXT: vzeroupper
45564089 ; AVX2-NEXT: retq
45574090 ;
45584091 ; AVX512F-LABEL: trunc_xor_const_v16i64_v16i8:
45594092 ; AVX512F: # BB#0:
4560 ; AVX512F-NEXT: vpxorq {{.*}}(%rip), %zmm1, %zmm1
4561 ; AVX512F-NEXT: vpxorq {{.*}}(%rip), %zmm0, %zmm0
45624093 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
45634094 ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
45644095 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
45654096 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
4097 ; AVX512F-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
45664098 ; AVX512F-NEXT: retq
45674099 ;
45684100 ; AVX512BW-LABEL: trunc_xor_const_v16i64_v16i8:
45694101 ; AVX512BW: # BB#0:
4570 ; AVX512BW-NEXT: vpxorq {{.*}}(%rip), %zmm1, %zmm1
4571 ; AVX512BW-NEXT: vpxorq {{.*}}(%rip), %zmm0, %zmm0
45724102 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
45734103 ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
45744104 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
45754105 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
4106 ; AVX512BW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
45764107 ; AVX512BW-NEXT: retq
45774108 ;
45784109 ; AVX512DQ-LABEL: trunc_xor_const_v16i64_v16i8:
45794110 ; AVX512DQ: # BB#0:
4580 ; AVX512DQ-NEXT: vpxorq {{.*}}(%rip), %zmm1, %zmm1
4581 ; AVX512DQ-NEXT: vpxorq {{.*}}(%rip), %zmm0, %zmm0
45824111 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
45834112 ; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
45844113 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
45854114 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
4115 ; AVX512DQ-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
45864116 ; AVX512DQ-NEXT: retq
45874117 %1 = xor <16 x i64> %a0,
45884118 %2 = trunc <16 x i64> %1 to <16 x i8>
45924122 define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
45934123 ; SSE-LABEL: trunc_xor_const_v16i32_v16i8:
45944124 ; SSE: # BB#0:
4595 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
4596 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm1
4597 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm2
4598 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm3
45994125 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
46004126 ; SSE-NEXT: pand %xmm4, %xmm3
46014127 ; SSE-NEXT: pand %xmm4, %xmm2
46044130 ; SSE-NEXT: pand %xmm4, %xmm0
46054131 ; SSE-NEXT: packuswb %xmm1, %xmm0
46064132 ; SSE-NEXT: packuswb %xmm2, %xmm0
4133 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
46074134 ; SSE-NEXT: retq
46084135 ;
46094136 ; AVX1-LABEL: trunc_xor_const_v16i32_v16i8:
46104137 ; AVX1: # BB#0:
4611 ; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
4612 ; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm1, %ymm1
46134138 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
46144139 ; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
46154140 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
46204145 ; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
46214146 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
46224147 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4148 ; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
46234149 ; AVX1-NEXT: vzeroupper
46244150 ; AVX1-NEXT: retq
46254151 ;
46264152 ; AVX2-LABEL: trunc_xor_const_v16i32_v16i8:
46274153 ; AVX2: # BB#0:
4628 ; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
4629 ; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm1, %ymm1
46304154 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
46314155 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
46324156 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
46364160 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
46374161 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
46384162 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4163 ; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
46394164 ; AVX2-NEXT: vzeroupper
46404165 ; AVX2-NEXT: retq
46414166 ;
46424167 ; AVX512-LABEL: trunc_xor_const_v16i32_v16i8:
46434168 ; AVX512: # BB#0:
4644 ; AVX512-NEXT: vpxord {{.*}}(%rip), %zmm0, %zmm0
46454169 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
4170 ; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
46464171 ; AVX512-NEXT: retq
46474172 %1 = xor <16 x i32> %a0,
46484173 %2 = trunc <16 x i32> %1 to <16 x i8>
46524177 define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
46534178 ; SSE-LABEL: trunc_xor_const_v16i16_v16i8:
46544179 ; SSE: # BB#0:
4655 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
4656 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm1
46574180 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
46584181 ; SSE-NEXT: pand %xmm2, %xmm1
46594182 ; SSE-NEXT: pand %xmm2, %xmm0
46604183 ; SSE-NEXT: packuswb %xmm1, %xmm0
4184 ; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
46614185 ; SSE-NEXT: retq
46624186 ;
46634187 ; AVX1-LABEL: trunc_xor_const_v16i16_v16i8:
46644188 ; AVX1: # BB#0:
4665 ; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
46664189 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
46674190 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
46684191 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
46694192 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
46704193 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4194 ; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
46714195 ; AVX1-NEXT: vzeroupper
46724196 ; AVX1-NEXT: retq
46734197 ;
46744198 ; AVX2-LABEL: trunc_xor_const_v16i16_v16i8:
46754199 ; AVX2: # BB#0:
4676 ; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
46774200 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
46784201 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
46794202 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
46804203 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
46814204 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4205 ; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
46824206 ; AVX2-NEXT: vzeroupper
46834207 ; AVX2-NEXT: retq
46844208 ;
46854209 ; AVX512F-LABEL: trunc_xor_const_v16i16_v16i8:
46864210 ; AVX512F: # BB#0:
4687 ; AVX512F-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
46884211 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
46894212 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
4213 ; AVX512F-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
46904214 ; AVX512F-NEXT: retq
46914215 ;
46924216 ; AVX512BW-LABEL: trunc_xor_const_v16i16_v16i8:
46934217 ; AVX512BW: # BB#0:
4694 ; AVX512BW-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
4218 ; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0
46954219 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
4696 ; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0
4220 ; AVX512BW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
46974221 ; AVX512BW-NEXT: retq
46984222 ;
46994223 ; AVX512DQ-LABEL: trunc_xor_const_v16i16_v16i8:
47004224 ; AVX512DQ: # BB#0:
4701 ; AVX512DQ-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
47024225 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
47034226 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
4227 ; AVX512DQ-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
47044228 ; AVX512DQ-NEXT: retq
47054229 %1 = xor <16 x i16> %a0,
47064230 %2 = trunc <16 x i16> %1 to <16 x i8>
50984622 define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
50994623 ; SSE-LABEL: trunc_or_const_v4i64_v4i32:
51004624 ; SSE: # BB#0:
5101 ; SSE-NEXT: movl $1, %eax
5102 ; SSE-NEXT: movd %rax, %xmm2
5103 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
5104 ; SSE-NEXT: por %xmm2, %xmm0
5105 ; SSE-NEXT: orps {{.*}}(%rip), %xmm1
51064625 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4626 ; SSE-NEXT: orps {{.*}}(%rip), %xmm0
51074627 ; SSE-NEXT: retq
51084628 ;
51094629 ; AVX1-LABEL: trunc_or_const_v4i64_v4i32:
51104630 ; AVX1: # BB#0:
5111 ; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
51124631 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
51134632 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
4633 ; AVX1-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0
51144634 ; AVX1-NEXT: vzeroupper
51154635 ; AVX1-NEXT: retq
51164636 ;
51174637 ; AVX2-LABEL: trunc_or_const_v4i64_v4i32:
51184638 ; AVX2: # BB#0:
5119 ; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
51204639 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
51214640 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
5122 ; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
4641 ; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
51234642 ; AVX2-NEXT: vzeroupper
51244643 ; AVX2-NEXT: retq
51254644 ;
51264645 ; AVX512-LABEL: trunc_or_const_v4i64_v4i32:
51274646 ; AVX512: # BB#0:
5128 ; AVX512-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
4647 ; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0
51294648 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
5130 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0
4649 ; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
51314650 ; AVX512-NEXT: retq
51324651 %1 = or <4 x i64> %a0,
51334652 %2 = trunc <4 x i64> %1 to <4 x i32>
51374656 define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
51384657 ; SSE-LABEL: trunc_or_const_v8i64_v8i16:
51394658 ; SSE: # BB#0:
5140 ; SSE-NEXT: movl $1, %eax
5141 ; SSE-NEXT: movd %rax, %xmm4
5142 ; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
5143 ; SSE-NEXT: por %xmm0, %xmm4
5144 ; SSE-NEXT: por {{.*}}(%rip), %xmm1
5145 ; SSE-NEXT: por {{.*}}(%rip), %xmm2
5146 ; SSE-NEXT: por {{.*}}(%rip), %xmm3
5147 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
5148 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7]
5149 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
5150 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
5151 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
5152 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
5153 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
4659 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
4660 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
4661 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
4662 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
4663 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
51544664 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
51554665 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
5156 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
5157 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
4666 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
4667 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
4668 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4669 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
4670 ; SSE-NEXT: orpd {{.*}}(%rip), %xmm2
4671 ; SSE-NEXT: movapd %xmm2, %xmm0
51584672 ; SSE-NEXT: retq
51594673 ;
51604674 ; AVX1-LABEL: trunc_or_const_v8i64_v8i16:
51614675 ; AVX1: # BB#0:
5162 ; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
5163 ; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1
51644676 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
51654677 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
51664678 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
51714683 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
51724684 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
51734685 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
4686 ; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
51744687 ; AVX1-NEXT: vzeroupper
51754688 ; AVX1-NEXT: retq
51764689 ;
51774690 ; AVX2-LABEL: trunc_or_const_v8i64_v8i16:
51784691 ; AVX2: # BB#0:
5179 ; AVX2-NEXT: vpor {{.*}}(%rip), %ymm1, %ymm1
5180 ; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
51814692 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
51824693 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
51834694 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
51854696 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
51864697 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
51874698 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
5188 ; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
4699 ; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
51894700 ; AVX2-NEXT: vzeroupper
51904701 ; AVX2-NEXT: retq
51914702 ;
51924703 ; AVX512-LABEL: trunc_or_const_v8i64_v8i16:
51934704 ; AVX512: # BB#0:
5194 ; AVX512-NEXT: vporq {{.*}}(%rip), %zmm0, %zmm0
51954705 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
4706 ; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
51964707 ; AVX512-NEXT: retq
51974708 %1 = or <8 x i64> %a0,
51984709 %2 = trunc <8 x i64> %1 to <8 x i16>
52024713 define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
52034714 ; SSE-LABEL: trunc_or_const_v8i32_v8i16:
52044715 ; SSE: # BB#0:
5205 ; SSE-NEXT: por {{.*}}(%rip), %xmm0
5206 ; SSE-NEXT: por {{.*}}(%rip), %xmm1
52074716 ; SSE-NEXT: pslld $16, %xmm1
52084717 ; SSE-NEXT: psrad $16, %xmm1
52094718 ; SSE-NEXT: pslld $16, %xmm0
52104719 ; SSE-NEXT: psrad $16, %xmm0
52114720 ; SSE-NEXT: packssdw %xmm1, %xmm0
4721 ; SSE-NEXT: por {{.*}}(%rip), %xmm0
52124722 ; SSE-NEXT: retq
52134723 ;
52144724 ; AVX1-LABEL: trunc_or_const_v8i32_v8i16:
52154725 ; AVX1: # BB#0:
5216 ; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
52174726 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
52184727 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
52194728 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
52204729 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
52214730 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4731 ; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
52224732 ; AVX1-NEXT: vzeroupper
52234733 ; AVX1-NEXT: retq
52244734 ;
52254735 ; AVX2-LABEL: trunc_or_const_v8i32_v8i16:
52264736 ; AVX2: # BB#0:
5227 ; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
52284737 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
52294738 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
5230 ; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
4739 ; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
52314740 ; AVX2-NEXT: vzeroupper
52324741 ; AVX2-NEXT: retq
52334742 ;
52344743 ; AVX512-LABEL: trunc_or_const_v8i32_v8i16:
52354744 ; AVX512: # BB#0:
5236 ; AVX512-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
4745 ; AVX512-NEXT: # kill: %YMM0 %YMM0 %ZMM0
52374746 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0
5238 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0
4747 ; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
52394748 ; AVX512-NEXT: retq
52404749 %1 = or <8 x i32> %a0,
52414750 %2 = trunc <8 x i32> %1 to <8 x i16>
52454754 define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
52464755 ; SSE-LABEL: trunc_or_const_v16i64_v16i8:
52474756 ; SSE: # BB#0:
5248 ; SSE-NEXT: movl $1, %eax
5249 ; SSE-NEXT: movd %rax, %xmm8
5250 ; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
5251 ; SSE-NEXT: por %xmm8, %xmm0
5252 ; SSE-NEXT: por {{.*}}(%rip), %xmm1
5253 ; SSE-NEXT: por {{.*}}(%rip), %xmm2
5254 ; SSE-NEXT: por {{.*}}(%rip), %xmm3
5255 ; SSE-NEXT: por {{.*}}(%rip), %xmm4
5256 ; SSE-NEXT: por {{.*}}(%rip), %xmm5
5257 ; SSE-NEXT: por {{.*}}(%rip), %xmm6
5258 ; SSE-NEXT: por {{.*}}(%rip), %xmm7
52594757 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
52604758 ; SSE-NEXT: pand %xmm8, %xmm7
52614759 ; SSE-NEXT: pand %xmm8, %xmm6
52724770 ; SSE-NEXT: packuswb %xmm1, %xmm0
52734771 ; SSE-NEXT: packuswb %xmm2, %xmm0
52744772 ; SSE-NEXT: packuswb %xmm4, %xmm0
4773 ; SSE-NEXT: por {{.*}}(%rip), %xmm0
52754774 ; SSE-NEXT: retq
52764775 ;
52774776 ; AVX1-LABEL: trunc_or_const_v16i64_v16i8:
52784777 ; AVX1: # BB#0:
5279 ; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
5280 ; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1
5281 ; AVX1-NEXT: vorps {{.*}}(%rip), %ymm2, %ymm2
5282 ; AVX1-NEXT: vorps {{.*}}(%rip), %ymm3, %ymm3
52834778 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
52844779 ; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
52854780 ; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
53004795 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
53014796 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
53024797 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
4798 ; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
53034799 ; AVX1-NEXT: vzeroupper
53044800 ; AVX1-NEXT: retq
53054801 ;
53064802 ; AVX2-LABEL: trunc_or_const_v16i64_v16i8:
53074803 ; AVX2: # BB#0:
5308 ; AVX2-NEXT: vpor {{.*}}(%rip), %ymm1, %ymm1
5309 ; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
5310 ; AVX2-NEXT: vpor {{.*}}(%rip), %ymm3, %ymm3
5311 ; AVX2-NEXT: vpor {{.*}}(%rip), %ymm2, %ymm2
53124804 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
53134805 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
53144806 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
53284820 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
53294821 ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
53304822 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
4823 ; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
53314824 ; AVX2-NEXT: vzeroupper
53324825 ; AVX2-NEXT: retq
53334826 ;
53344827 ; AVX512F-LABEL: trunc_or_const_v16i64_v16i8:
53354828 ; AVX512F: # BB#0:
5336 ; AVX512F-NEXT: vporq {{.*}}(%rip), %zmm1, %zmm1
5337 ; AVX512F-NEXT: vporq {{.*}}(%rip), %zmm0, %zmm0
53384829 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
53394830 ; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
53404831 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
53414832 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
4833 ; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
53424834 ; AVX512F-NEXT: retq
53434835 ;
53444836 ; AVX512BW-LABEL: trunc_or_const_v16i64_v16i8:
53454837 ; AVX512BW: # BB#0:
5346 ; AVX512BW-NEXT: vporq {{.*}}(%rip), %zmm1, %zmm1
5347 ; AVX512BW-NEXT: vporq {{.*}}(%rip), %zmm0, %zmm0
53484838 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
53494839 ; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
53504840 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
53514841 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
4842 ; AVX512BW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
53524843 ; AVX512BW-NEXT: retq
53534844 ;
53544845 ; AVX512DQ-LABEL: trunc_or_const_v16i64_v16i8:
53554846 ; AVX512DQ: # BB#0:
5356 ; AVX512DQ-NEXT: vporq {{.*}}(%rip), %zmm1, %zmm1
5357 ; AVX512DQ-NEXT: vporq {{.*}}(%rip), %zmm0, %zmm0
53584847 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
53594848 ; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
53604849 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
53614850 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
4851 ; AVX512DQ-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
53624852 ; AVX512DQ-NEXT: retq
53634853 %1 = or <16 x i64> %a0,
53644854 %2 = trunc <16 x i64> %1 to <16 x i8>
53684858 define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
53694859 ; SSE-LABEL: trunc_or_const_v16i32_v16i8:
53704860 ; SSE: # BB#0:
5371 ; SSE-NEXT: por {{.*}}(%rip), %xmm0
5372 ; SSE-NEXT: por {{.*}}(%rip), %xmm1
5373 ; SSE-NEXT: por {{.*}}(%rip), %xmm2
5374 ; SSE-NEXT: por {{.*}}(%rip), %xmm3
53754861 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
53764862 ; SSE-NEXT: pand %xmm4, %xmm3
53774863 ; SSE-NEXT: pand %xmm4, %xmm2
53804866 ; SSE-NEXT: pand %xmm4, %xmm0
53814867 ; SSE-NEXT: packuswb %xmm1, %xmm0
53824868 ; SSE-NEXT: packuswb %xmm2, %xmm0
4869 ; SSE-NEXT: por {{.*}}(%rip), %xmm0
53834870 ; SSE-NEXT: retq
53844871 ;
53854872 ; AVX1-LABEL: trunc_or_const_v16i32_v16i8:
53864873 ; AVX1: # BB#0:
5387 ; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
5388 ; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1
53894874 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
53904875 ; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
53914876 ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
53964881 ; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
53974882 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
53984883 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
4884 ; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
53994885 ; AVX1-NEXT: vzeroupper
54004886 ; AVX1-NEXT: retq
54014887 ;
54024888 ; AVX2-LABEL: trunc_or_const_v16i32_v16i8:
54034889 ; AVX2: # BB#0:
5404 ; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
5405 ; AVX2-NEXT: vpor {{.*}}(%rip), %ymm1, %ymm1
54064890 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
54074891 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
54084892 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
54124896 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
54134897 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
54144898 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4899 ; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
54154900 ; AVX2-NEXT: vzeroupper
54164901 ; AVX2-NEXT: retq
54174902 ;
54184903 ; AVX512-LABEL: trunc_or_const_v16i32_v16i8:
54194904 ; AVX512: # BB#0:
5420 ; AVX512-NEXT: vpord {{.*}}(%rip), %zmm0, %zmm0
54214905 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0
4906 ; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
54224907 ; AVX512-NEXT: retq
54234908 %1 = or <16 x i32> %a0,
54244909 %2 = trunc <16 x i32> %1 to <16 x i8>
54284913 define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
54294914 ; SSE-LABEL: trunc_or_const_v16i16_v16i8:
54304915 ; SSE: # BB#0:
5431 ; SSE-NEXT: por {{.*}}(%rip), %xmm0
5432 ; SSE-NEXT: por {{.*}}(%rip), %xmm1
54334916 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
54344917 ; SSE-NEXT: pand %xmm2, %xmm1
54354918 ; SSE-NEXT: pand %xmm2, %xmm0
54364919 ; SSE-NEXT: packuswb %xmm1, %xmm0
4920 ; SSE-NEXT: por {{.*}}(%rip), %xmm0
54374921 ; SSE-NEXT: retq
54384922 ;
54394923 ; AVX1-LABEL: trunc_or_const_v16i16_v16i8:
54404924 ; AVX1: # BB#0:
5441 ; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
54424925 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
54434926 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
54444927 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
54454928 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
54464929 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4930 ; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
54474931 ; AVX1-NEXT: vzeroupper
54484932 ; AVX1-NEXT: retq
54494933 ;
54504934 ; AVX2-LABEL: trunc_or_const_v16i16_v16i8:
54514935 ; AVX2: # BB#0:
5452 ; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
54534936 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
54544937 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
54554938 ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
54564939 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
54574940 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4941 ; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
54584942 ; AVX2-NEXT: vzeroupper
54594943 ; AVX2-NEXT: retq
54604944 ;
54614945 ; AVX512F-LABEL: trunc_or_const_v16i16_v16i8:
54624946 ; AVX512F: # BB#0:
5463 ; AVX512F-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
54644947 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
54654948 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
4949 ; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
54664950 ; AVX512F-NEXT: retq
54674951 ;
54684952 ; AVX512BW-LABEL: trunc_or_const_v16i16_v16i8:
54694953 ; AVX512BW: # BB#0:
5470 ; AVX512BW-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
4954 ; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0
54714955 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
5472 ; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0
4956 ; AVX512BW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
54734957 ; AVX512BW-NEXT: retq
54744958 ;
54754959 ; AVX512DQ-LABEL: trunc_or_const_v16i16_v16i8:
54764960 ; AVX512DQ: # BB#0:
5477 ; AVX512DQ-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
54784961 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
54794962 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
4963 ; AVX512DQ-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
54804964 ; AVX512DQ-NEXT: retq
54814965 %1 = or <16 x i16> %a0,
54824966 %2 = trunc <16 x i16> %1 to <16 x i8>
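All of the or-by-constant tests above now finish with a single por/vpor on %xmm0, i.e. the OR is applied after the truncate. Below is a hedged sketch of the equivalent narrowed IR for the v16i16 case, with placeholder byte values since the real constant is elided in this view.

define <16 x i8> @trunc_or_const_v16i16_v16i8_narrowed_sketch(<16 x i16> %a0) nounwind {
  ; Truncate first; the constant operand is simply rebuilt at the narrow type,
  ; so only the variable input still needs a truncation at runtime.
  ; Placeholder constant lanes.
  %1 = trunc <16 x i16> %a0 to <16 x i8>
  %2 = or <16 x i8> %1, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>
  ret <16 x i8> %2
}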
54914975 ; SSE-LABEL: mul_add_const_v4i64_v4i32:
54924976 ; SSE: # BB#0:
54934977 ; SSE-NEXT: movdqa %xmm0, %xmm2
5494 ; SSE-NEXT: psrad $31, %xmm2
5495 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
5496 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
5497 ; SSE-NEXT: movdqa %xmm3, %xmm2
5498 ; SSE-NEXT: psrad $31, %xmm2
5499 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
5500 ; SSE-NEXT: movdqa %xmm1, %xmm4
5501 ; SSE-NEXT: psrad $31, %xmm4
5502 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
5503 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
4978 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3]
4979 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3]
4980 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,1,3]
4981 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
55044982 ; SSE-NEXT: movdqa %xmm2, %xmm4
5505 ; SSE-NEXT: psrad $31, %xmm4
5506 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
4983 ; SSE-NEXT: psrlq $32, %xmm4
4984 ; SSE-NEXT: pmuludq %xmm1, %xmm4
4985 ; SSE-NEXT: movdqa %xmm1, %xmm5
4986 ; SSE-NEXT: psrlq $32, %xmm5
4987 ; SSE-NEXT: pmuludq %xmm2, %xmm5
4988 ; SSE-NEXT: paddq %xmm4, %xmm5
4989 ; SSE-NEXT: psllq $32, %xmm5
4990 ; SSE-NEXT: pmuludq %xmm1, %xmm2
4991 ; SSE-NEXT: paddq %xmm5, %xmm2
4992 ; SSE-NEXT: movdqa %xmm0, %xmm1
4993 ; SSE-NEXT: psrlq $32, %xmm1
4994 ; SSE-NEXT: pmuludq %xmm3, %xmm1
55074995 ; SSE-NEXT: movdqa %xmm3, %xmm4
55084996 ; SSE-NEXT: psrlq $32, %xmm4
5509 ; SSE-NEXT: pmuludq %xmm2, %xmm4
5510 ; SSE-NEXT: movdqa %xmm2, %xmm5
5511 ; SSE-NEXT: psrlq $32, %xmm5
5512 ; SSE-NEXT: pmuludq %xmm3, %xmm5
5513 ; SSE-NEXT: paddq %xmm4, %xmm5
5514 ; SSE-NEXT: psllq $32, %xmm5
5515 ; SSE-NEXT: pmuludq %xmm3, %xmm2
5516 ; SSE-NEXT: paddq %xmm5, %xmm2
5517 ; SSE-NEXT: movdqa %xmm0, %xmm3
5518 ; SSE-NEXT: psrlq $32, %xmm3
5519 ; SSE-NEXT: pmuludq %xmm1, %xmm3
5520 ; SSE-NEXT: movdqa %xmm1, %xmm4
5521 ; SSE-NEXT: psrlq $32, %xmm4
55224997 ; SSE-NEXT: pmuludq %xmm0, %xmm4
5523 ; SSE-NEXT: paddq %xmm3, %xmm4
4998 ; SSE-NEXT: paddq %xmm1, %xmm4
55244999 ; SSE-NEXT: psllq $32, %xmm4
5525 ; SSE-NEXT: pmuludq %xmm1, %xmm0
5000 ; SSE-NEXT: pmuludq %xmm3, %xmm0
55265001 ; SSE-NEXT: paddq %xmm4, %xmm0
5527 ; SSE-NEXT: paddq {{.*}}(%rip), %xmm0
5528 ; SSE-NEXT: paddq {{.*}}(%rip), %xmm2
55295002 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
5003 ; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
55305004 ; SSE-NEXT: retq
55315005 ;
55325006 ; AVX1-LABEL: mul_add_const_v4i64_v4i32:
55335007 ; AVX1: # BB#0:
5534 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm2
5535 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
5536 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
5537 ; AVX1-NEXT: vpmovsxdq %xmm1, %xmm3
5538 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
5539 ; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
5540 ; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm0
5541 ; AVX1-NEXT: vpmuldq %xmm3, %xmm2, %xmm1
5542 ; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1
5543 ; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
5544 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[0,2]
5008 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
5009 ; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
55455010 ; AVX1-NEXT: retq
55465011 ;
55475012 ; AVX2-LABEL: mul_add_const_v4i64_v4i32:
55485013 ; AVX2: # BB#0:
5549 ; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
5550 ; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
5551 ; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
5552 ; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
5553 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
5554 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
5555 ; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
5556 ; AVX2-NEXT: vzeroupper
5014 ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
5015 ; AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
55575016 ; AVX2-NEXT: retq
55585017 ;
55595018 ; AVX512F-LABEL: mul_add_const_v4i64_v4i32:
55605019 ; AVX512F: # BB#0:
5561 ; AVX512F-NEXT: vpmovsxdq %xmm0, %ymm0
5562 ; AVX512F-NEXT: vpmovsxdq %xmm1, %ymm1
5563 ; AVX512F-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
5564 ; AVX512F-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
5565 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
5566 ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %YMM0
5020 ; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0
5021 ; AVX512F-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
55675022 ; AVX512F-NEXT: retq
55685023 ;
55695024 ; AVX512BW-LABEL: mul_add_const_v4i64_v4i32:
55705025 ; AVX512BW: # BB#0:
5571 ; AVX512BW-NEXT: vpmovsxdq %xmm0, %ymm0
5572 ; AVX512BW-NEXT: vpmovsxdq %xmm1, %ymm1
5573 ; AVX512BW-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
5574 ; AVX512BW-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
5575 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
5576 ; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0
5026 ; AVX512BW-NEXT: vpmulld %xmm1, %xmm0, %xmm0
5027 ; AVX512BW-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
55775028 ; AVX512BW-NEXT: retq
55785029 ;
55795030 ; AVX512DQ-LABEL: mul_add_const_v4i64_v4i32:
55805031 ; AVX512DQ: # BB#0:
5581 ; AVX512DQ-NEXT: vpmovsxdq %xmm0, %ymm0
5582 ; AVX512DQ-NEXT: vpmovsxdq %xmm1, %ymm1
5032 ; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
5033 ; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
55835034 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
5584 ; AVX512DQ-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
55855035 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
5586 ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0
5036 ; AVX512DQ-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
55875037 ; AVX512DQ-NEXT: retq
55885038 %1 = sext <4 x i32> %a0 to <4 x i64>
55895039 %2 = sext <4 x i32> %a1 to <4 x i64>
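For the mul_add tests, a standalone sketch of the source pattern in the same .ll style; the add constant is a placeholder because the real operand is elided in this view. Since the final trunc keeps only the low 32 bits of each lane, the multiply and add can legally be performed at i32 width, which is why the sign-extension scaffolding drops out of the SSE path above and the AVX1/AVX2/AVX512F/AVX512BW paths collapse to vpmulld followed by vpaddd.

define <4 x i32> @mul_add_const_v4i64_v4i32_sketch(<4 x i32> %a0, <4 x i32> %a1) nounwind {
  ; Widen, multiply, add a constant, then truncate back to <4 x i32>.
  ; Placeholder add constant; the test's real values are not shown in this view.
  %1 = sext <4 x i32> %a0 to <4 x i64>
  %2 = sext <4 x i32> %a1 to <4 x i64>
  %3 = mul <4 x i64> %1, %2
  %4 = add <4 x i64> %3, <i64 -3, i64 -1, i64 1, i64 3>
  %5 = trunc <4 x i64> %4 to <4 x i32>
  ret <4 x i32> %5
}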
55965046 define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
55975047 ; SSE-LABEL: mul_add_self_v4i64_v4i32:
55985048 ; SSE: # BB#0:
5599 ; SSE-NEXT: movdqa %xmm0, %xmm2
5600 ; SSE-NEXT: psrad $31, %xmm2
5601 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
5602 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
5603 ; SSE-NEXT: movdqa %xmm3, %xmm2
5604 ; SSE-NEXT: psrad $31, %xmm2
5605 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
5049 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
5050 ; SSE-NEXT: movdqa %xmm2, %xmm3
5051 ; SSE-NEXT: psrad $31, %xmm3
5052 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
5053 ; SSE-NEXT: movdqa %xmm0, %xmm3
5054 ; SSE-NEXT: psrad $31, %xmm3
5055 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
5056 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
5057 ; SSE-NEXT: movdqa %xmm3, %xmm4
5058 ; SSE-NEXT: psrad $31, %xmm4
5059 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
56065060 ; SSE-NEXT: movdqa %xmm1, %xmm4
56075061 ; SSE-NEXT: psrad $31, %xmm4
5608 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
56095062 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
5610 ; SSE-NEXT: movdqa %xmm2, %xmm4
5611 ; SSE-NEXT: psrad $31, %xmm4
5612 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
5063 ; SSE-NEXT: movdqa %xmm0, %xmm4
5064 ; SSE-NEXT: psrlq $32, %xmm4
5065 ; SSE-NEXT: pmuludq %xmm1, %xmm4
5066 ; SSE-NEXT: movdqa %xmm1, %xmm5
5067 ; SSE-NEXT: psrlq $32, %xmm5
5068 ; SSE-NEXT: pmuludq %xmm0, %xmm5
5069 ; SSE-NEXT: paddq %xmm4, %xmm5
5070 ; SSE-NEXT: psllq $32, %xmm5
5071 ; SSE-NEXT: pmuludq %xmm0, %xmm1
5072 ; SSE-NEXT: paddq %xmm5, %xmm1
5073 ; SSE-NEXT: movdqa %xmm2, %xmm0
5074 ; SSE-NEXT: psrlq $32, %xmm0
5075 ; SSE-NEXT: pmuludq %xmm3, %xmm0
56135076 ; SSE-NEXT: movdqa %xmm3, %xmm4
56145077 ; SSE-NEXT: psrlq $32, %xmm4
56155078 ; SSE-NEXT: pmuludq %xmm2, %xmm4
5616 ; SSE-NEXT: movdqa %xmm2, %xmm5
5617 ; SSE-NEXT: psrlq $32, %xmm5
5618 ; SSE-NEXT: pmuludq %xmm3, %xmm5
5619 ; SSE-NEXT: paddq %xmm4, %xmm5
5620 ; SSE-NEXT: psllq $32, %xmm5
5621 ; SSE-NEXT: pmuludq %xmm3, %xmm2
5622 ; SSE-NEXT: paddq %xmm5, %xmm2
5623 ; SSE-NEXT: movdqa %xmm0, %xmm3
5624 ; SSE-NEXT: psrlq $32, %xmm3
5625 ; SSE-NEXT: pmuludq %xmm1, %xmm3
5626 ; SSE-NEXT: movdqa %xmm1, %xmm4
5627 ; SSE-NEXT: psrlq $32, %xmm4
5628 ; SSE-NEXT: pmuludq %xmm0, %xmm4
5629 ; SSE-NEXT: paddq %xmm3, %xmm4
5079 ; SSE-NEXT: paddq %xmm0, %xmm4
56305080 ; SSE-NEXT: psllq $32, %xmm4
5631 ; SSE-NEXT: pmuludq %xmm0, %xmm1
5632 ; SSE-NEXT: paddq %xmm4, %xmm1
5633 ; SSE-NEXT: paddq %xmm1, %xmm1
5634 ; SSE-NEXT: paddq %xmm2, %xmm2
5635 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
5636 ; SSE-NEXT: movaps %xmm1, %xmm0
5081 ; SSE-NEXT: pmuludq %xmm2, %xmm3
5082 ; SSE-NEXT: paddq %xmm4, %xmm3
5083 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
5084 ; SSE-NEXT: paddd %xmm1, %xmm1
5085 ; SSE-NEXT: movdqa %xmm1, %xmm0
56375086 ; SSE-NEXT: retq
56385087 ;
56395088 ; AVX1-LABEL: mul_add_self_v4i64_v4i32:
56405089 ; AVX1: # BB#0:
5641 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm2
5642 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
5643 ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
5644 ; AVX1-NEXT: vpmovsxdq %xmm1, %xmm3
5645 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
5646 ; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
5647 ; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm0
5648 ; AVX1-NEXT: vpmuldq %xmm3, %xmm2, %xmm1
5649 ; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1
5650 ; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0
5651 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[0,2]
5090 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
5091 ; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0
56525092 ; AVX1-NEXT: retq
56535093 ;
56545094 ; AVX2-LABEL: mul_add_self_v4i64_v4i32:
56555095 ; AVX2: # BB#0:
5656 ; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
5657 ; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
5658 ; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
5659 ; AVX2-NEXT: vpaddq %ymm0, %ymm0, %ymm0
5660 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
5661 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
5662 ; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0
5663 ; AVX2-NEXT: vzeroupper
5096 ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
5097 ; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0
56645098 ; AVX2-NEXT: retq
56655099 ;
56665100 ; AVX512F-LABEL: mul_add_self_v4i64_v4i32:
56675101 ; AVX512F: # BB#0:
5668 ; AVX512F-NEXT: vpmovsxdq %xmm0, %ymm0
5669 ; AVX512F-NEXT: vpmovsxdq %xmm1, %ymm1
5670 ; AVX512F-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
5671 ; AVX512F-NEXT: vpaddq %ymm0, %ymm0, %ymm0
5672 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
5673 ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %YMM0
5102 ; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0
5103 ; AVX512F-NEXT: vpaddd %xmm0, %xmm0, %xmm0
56745104 ; AVX512F-NEXT: retq
56755105 ;
56765106 ; AVX512BW-LABEL: mul_add_self_v4i64_v4i32:
56775107 ; AVX512BW: # BB#0:
5678 ; AVX512BW-NEXT: vpmovsxdq %xmm0, %ymm0
5679 ; AVX512BW-NEXT: vpmovsxdq %xmm1, %ymm1
5680 ; AVX512BW-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
5681 ; AVX512BW-NEXT: vpaddq %ymm0, %ymm0, %ymm0
5682 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
5683 ; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0
5108 ; AVX512BW-NEXT: vpmulld %xmm1, %xmm0, %xmm0
5109 ; AVX512BW-NEXT: vpaddd %xmm0, %xmm0, %xmm0
56845110 ; AVX512BW-NEXT: retq
56855111 ;
56865112 ; AVX512DQ-LABEL: mul_add_self_v4i64_v4i32:
56885114 ; AVX512DQ-NEXT: vpmovsxdq %xmm0, %ymm0
56895115 ; AVX512DQ-NEXT: vpmovsxdq %xmm1, %ymm1
56905116 ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
5691 ; AVX512DQ-NEXT: vpaddq %ymm0, %ymm0, %ymm0
56925117 ; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
5693 ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0
5118 ; AVX512DQ-NEXT: vpaddd %xmm0, %xmm0, %xmm0
56945119 ; AVX512DQ-NEXT: retq
56955120 %1 = sext <4 x i32> %a0 to <4 x i64>
56965121 %2 = sext <4 x i32> %a1 to <4 x i64>