llvm.org GIT mirror: llvm / 2d0f206

[SelectionDAG][X86] Use stack load/store in PromoteIntRes_BITCAST when the input needs to be split and the output type is a vector.

We had special case handling here, but it used a scalar any_extend for the promotion and then a bitcast to the final type. That won't split the input data up into multiple promoted elements the way we need, so this patch falls back to doing the conversion through memory.

Fixes PR41594, which I believe was reflected in the bitcast-vector-bool.ll changes. The changes to vector-half-conversions.ll fix a previously unknown miscompile from this issue.

Differential Revision: https://reviews.llvm.org/D61114

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@359219 91177308-0d34-0410-b5e6-96231b3b80d8

Craig Topper, 1 year, 5 months ago
3 changed file(s) with 1006 addition(s) and 1089 deletion(s).
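The pattern at issue is a BITCAST whose result is an illegal vector type that gets promoted while its input vector has to be split. A sketch of the trigger, reconstructed from the bitcast_v64i8_to_v2i32 test in the first test hunk below (the body is inferred from the checked assembly, so the exact test IR may differ slightly):

    define i32 @bitcast_v64i8_to_v2i32(<64 x i8> %a0) nounwind {
      ; The <64 x i1> compare result must be split during type
      ; legalization, while the illegal <2 x i32> result is promoted.
      ; The old any_extend path packed all 64 mask bits into the low
      ; promoted element instead of spreading them across both elements.
      %1 = icmp slt <64 x i8> %a0, zeroinitializer
      %2 = bitcast <64 x i1> %1 to <2 x i32>
      %3 = extractelement <2 x i32> %2, i32 0
      %4 = extractelement <2 x i32> %2, i32 1
      %5 = add i32 %3, %4
      ret i32 %5
    }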
lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp (DAGTypeLegalizer::PromoteIntRes_BITCAST):

                          BitConvertToInteger(GetScalarizedVector(InOp)));
     break;
   case TargetLowering::TypeSplitVector: {
-    // For example, i32 = BITCAST v2i16 on alpha. Convert the split
-    // pieces of the input into integers and reassemble in the final type.
-    SDValue Lo, Hi;
-    GetSplitVector(N->getOperand(0), Lo, Hi);
-    Lo = BitConvertToInteger(Lo);
-    Hi = BitConvertToInteger(Hi);
-
-    if (DAG.getDataLayout().isBigEndian())
-      std::swap(Lo, Hi);
-
-    InOp = DAG.getNode(ISD::ANY_EXTEND, dl,
-                       EVT::getIntegerVT(*DAG.getContext(),
-                                         NOutVT.getSizeInBits()),
-                       JoinIntegers(Lo, Hi));
-    return DAG.getNode(ISD::BITCAST, dl, NOutVT, InOp);
+    if (!NOutVT.isVector()) {
+      // For example, i32 = BITCAST v2i16 on alpha. Convert the split
+      // pieces of the input into integers and reassemble in the final type.
+      SDValue Lo, Hi;
+      GetSplitVector(N->getOperand(0), Lo, Hi);
+      Lo = BitConvertToInteger(Lo);
+      Hi = BitConvertToInteger(Hi);
+
+      if (DAG.getDataLayout().isBigEndian())
+        std::swap(Lo, Hi);
+
+      InOp = DAG.getNode(ISD::ANY_EXTEND, dl,
+                         EVT::getIntegerVT(*DAG.getContext(),
+                                           NOutVT.getSizeInBits()),
+                         JoinIntegers(Lo, Hi));
+      return DAG.getNode(ISD::BITCAST, dl, NOutVT, InOp);
+    }
+    break;
   }
   case TargetLowering::TypeWidenVector:
     // The input is widened to the same size. Convert to the widened value.
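When NOutVT is a vector, the new break falls through to the generic tail of PromoteIntRes_BITCAST, which lowers the bitcast through a stack temporary. A simplified sketch of that fallback, assuming the shape of the existing DAGTypeLegalizer::CreateStackStoreLoad helper (the real helper also threads frame-index pointer info and alignment, so treat this as an outline rather than the committed code):

    // Generic fallback: reinterpret the bits through memory, then
    // bitcast the reloaded value to the promoted type.
    return DAG.getNode(ISD::BITCAST, dl, NOutVT,
                       CreateStackStoreLoad(InOp, OutVT));

    // The helper stores Op to a stack slot sized for both types and
    // loads it back as DestVT, so the in-memory byte layout of every
    // split piece of the input is preserved exactly.
    SDValue DAGTypeLegalizer::CreateStackStoreLoad(SDValue Op, EVT DestVT) {
      SDLoc dl(Op);
      SDValue StackPtr = DAG.CreateStackTemporary(Op.getValueType(), DestVT);
      SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op, StackPtr,
                                   MachinePointerInfo());
      return DAG.getLoad(DestVT, dl, Store, StackPtr, MachinePointerInfo());
    }

This is why the test diffs below trade the shll/orl bit-packing sequences for movw/movl stores into a stack slot followed by a single vector load.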
test/CodeGen/X86/bitcast-vector-bool.ll:

749749 define i32 @bitcast_v64i8_to_v2i32(<64 x i8> %a0) nounwind {
750750 ; SSE2-SSSE3-LABEL: bitcast_v64i8_to_v2i32:
751751 ; SSE2-SSSE3: # %bb.0:
752 ; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm4
753 ; SSE2-SSSE3-NEXT: pxor %xmm5, %xmm5
754 ; SSE2-SSSE3-NEXT: pcmpgtb %xmm3, %xmm5
755 ; SSE2-SSSE3-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp)
756 ; SSE2-SSSE3-NEXT: pxor %xmm3, %xmm3
757 ; SSE2-SSSE3-NEXT: pcmpgtb %xmm2, %xmm3
758 ; SSE2-SSSE3-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp)
759 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm2
760 ; SSE2-SSSE3-NEXT: pcmpgtb %xmm1, %xmm2
761 ; SSE2-SSSE3-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp)
762 ; SSE2-SSSE3-NEXT: pcmpgtb %xmm0, %xmm4
763 ; SSE2-SSSE3-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp)
764 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
765 ; SSE2-SSSE3-NEXT: andl $1, %eax
766 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
767 ; SSE2-SSSE3-NEXT: andl $1, %ecx
768 ; SSE2-SSSE3-NEXT: leal (%rcx,%rax,2), %eax
769 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
770 ; SSE2-SSSE3-NEXT: andl $1, %ecx
771 ; SSE2-SSSE3-NEXT: leal (%rax,%rcx,4), %eax
772 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
773 ; SSE2-SSSE3-NEXT: andl $1, %ecx
774 ; SSE2-SSSE3-NEXT: leal (%rax,%rcx,8), %eax
775 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
776 ; SSE2-SSSE3-NEXT: andl $1, %ecx
777 ; SSE2-SSSE3-NEXT: shll $4, %ecx
778 ; SSE2-SSSE3-NEXT: orl %eax, %ecx
779 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
780 ; SSE2-SSSE3-NEXT: andl $1, %eax
781 ; SSE2-SSSE3-NEXT: shll $5, %eax
782 ; SSE2-SSSE3-NEXT: orl %ecx, %eax
783 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
784 ; SSE2-SSSE3-NEXT: andl $1, %ecx
785 ; SSE2-SSSE3-NEXT: shll $6, %ecx
786 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
787 ; SSE2-SSSE3-NEXT: andl $1, %edx
788 ; SSE2-SSSE3-NEXT: shll $7, %edx
789 ; SSE2-SSSE3-NEXT: orl %ecx, %edx
790 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
791 ; SSE2-SSSE3-NEXT: andl $1, %ecx
792 ; SSE2-SSSE3-NEXT: shll $8, %ecx
793 ; SSE2-SSSE3-NEXT: orl %edx, %ecx
794 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
795 ; SSE2-SSSE3-NEXT: andl $1, %edx
796 ; SSE2-SSSE3-NEXT: shll $9, %edx
797 ; SSE2-SSSE3-NEXT: orl %ecx, %edx
798 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
799 ; SSE2-SSSE3-NEXT: andl $1, %ecx
800 ; SSE2-SSSE3-NEXT: shll $10, %ecx
801 ; SSE2-SSSE3-NEXT: orl %edx, %ecx
802 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
803 ; SSE2-SSSE3-NEXT: andl $1, %edx
804 ; SSE2-SSSE3-NEXT: shll $11, %edx
805 ; SSE2-SSSE3-NEXT: orl %ecx, %edx
806 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
807 ; SSE2-SSSE3-NEXT: andl $1, %ecx
808 ; SSE2-SSSE3-NEXT: shll $12, %ecx
809 ; SSE2-SSSE3-NEXT: orl %edx, %ecx
810 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
811 ; SSE2-SSSE3-NEXT: andl $1, %edx
812 ; SSE2-SSSE3-NEXT: shll $13, %edx
813 ; SSE2-SSSE3-NEXT: orl %ecx, %edx
814 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
815 ; SSE2-SSSE3-NEXT: andl $1, %ecx
816 ; SSE2-SSSE3-NEXT: shll $14, %ecx
817 ; SSE2-SSSE3-NEXT: orl %edx, %ecx
818 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
819 ; SSE2-SSSE3-NEXT: shll $15, %edx
820 ; SSE2-SSSE3-NEXT: orl %ecx, %edx
821 ; SSE2-SSSE3-NEXT: orl %eax, %edx
822 ; SSE2-SSSE3-NEXT: movw %dx, -{{[0-9]+}}(%rsp)
823 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
824 ; SSE2-SSSE3-NEXT: andl $1, %eax
825 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
826 ; SSE2-SSSE3-NEXT: andl $1, %ecx
827 ; SSE2-SSSE3-NEXT: leal (%rcx,%rax,2), %eax
828 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
829 ; SSE2-SSSE3-NEXT: andl $1, %ecx
830 ; SSE2-SSSE3-NEXT: leal (%rax,%rcx,4), %eax
831 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
832 ; SSE2-SSSE3-NEXT: andl $1, %ecx
833 ; SSE2-SSSE3-NEXT: leal (%rax,%rcx,8), %eax
834 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
835 ; SSE2-SSSE3-NEXT: andl $1, %ecx
836 ; SSE2-SSSE3-NEXT: shll $4, %ecx
837 ; SSE2-SSSE3-NEXT: orl %eax, %ecx
838 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
839 ; SSE2-SSSE3-NEXT: andl $1, %eax
840 ; SSE2-SSSE3-NEXT: shll $5, %eax
841 ; SSE2-SSSE3-NEXT: orl %ecx, %eax
842 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
843 ; SSE2-SSSE3-NEXT: andl $1, %ecx
844 ; SSE2-SSSE3-NEXT: shll $6, %ecx
845 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
846 ; SSE2-SSSE3-NEXT: andl $1, %edx
847 ; SSE2-SSSE3-NEXT: shll $7, %edx
848 ; SSE2-SSSE3-NEXT: orl %ecx, %edx
849 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
850 ; SSE2-SSSE3-NEXT: andl $1, %ecx
851 ; SSE2-SSSE3-NEXT: shll $8, %ecx
852 ; SSE2-SSSE3-NEXT: orl %edx, %ecx
853 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
854 ; SSE2-SSSE3-NEXT: andl $1, %edx
855 ; SSE2-SSSE3-NEXT: shll $9, %edx
856 ; SSE2-SSSE3-NEXT: orl %ecx, %edx
857 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
858 ; SSE2-SSSE3-NEXT: andl $1, %ecx
859 ; SSE2-SSSE3-NEXT: shll $10, %ecx
860 ; SSE2-SSSE3-NEXT: orl %edx, %ecx
861 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
862 ; SSE2-SSSE3-NEXT: andl $1, %edx
863 ; SSE2-SSSE3-NEXT: shll $11, %edx
864 ; SSE2-SSSE3-NEXT: orl %ecx, %edx
865 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
866 ; SSE2-SSSE3-NEXT: andl $1, %ecx
867 ; SSE2-SSSE3-NEXT: shll $12, %ecx
868 ; SSE2-SSSE3-NEXT: orl %edx, %ecx
869 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
870 ; SSE2-SSSE3-NEXT: andl $1, %edx
871 ; SSE2-SSSE3-NEXT: shll $13, %edx
872 ; SSE2-SSSE3-NEXT: orl %ecx, %edx
873 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
874 ; SSE2-SSSE3-NEXT: andl $1, %ecx
875 ; SSE2-SSSE3-NEXT: shll $14, %ecx
876 ; SSE2-SSSE3-NEXT: orl %edx, %ecx
877 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
878 ; SSE2-SSSE3-NEXT: shll $15, %edx
879 ; SSE2-SSSE3-NEXT: orl %ecx, %edx
880 ; SSE2-SSSE3-NEXT: orl %eax, %edx
881 ; SSE2-SSSE3-NEXT: movw %dx, -{{[0-9]+}}(%rsp)
882 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
883 ; SSE2-SSSE3-NEXT: andl $1, %eax
884 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
885 ; SSE2-SSSE3-NEXT: andl $1, %ecx
886 ; SSE2-SSSE3-NEXT: leal (%rcx,%rax,2), %eax
887 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
888 ; SSE2-SSSE3-NEXT: andl $1, %ecx
889 ; SSE2-SSSE3-NEXT: leal (%rax,%rcx,4), %eax
890 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
891 ; SSE2-SSSE3-NEXT: andl $1, %ecx
892 ; SSE2-SSSE3-NEXT: leal (%rax,%rcx,8), %eax
893 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
894 ; SSE2-SSSE3-NEXT: andl $1, %ecx
895 ; SSE2-SSSE3-NEXT: shll $4, %ecx
896 ; SSE2-SSSE3-NEXT: orl %eax, %ecx
897 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
898 ; SSE2-SSSE3-NEXT: andl $1, %eax
899 ; SSE2-SSSE3-NEXT: shll $5, %eax
900 ; SSE2-SSSE3-NEXT: orl %ecx, %eax
901 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
902 ; SSE2-SSSE3-NEXT: andl $1, %ecx
903 ; SSE2-SSSE3-NEXT: shll $6, %ecx
904 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
905 ; SSE2-SSSE3-NEXT: andl $1, %edx
906 ; SSE2-SSSE3-NEXT: shll $7, %edx
907 ; SSE2-SSSE3-NEXT: orl %ecx, %edx
908 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
909 ; SSE2-SSSE3-NEXT: andl $1, %ecx
910 ; SSE2-SSSE3-NEXT: shll $8, %ecx
911 ; SSE2-SSSE3-NEXT: orl %edx, %ecx
912 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
913 ; SSE2-SSSE3-NEXT: andl $1, %edx
914 ; SSE2-SSSE3-NEXT: shll $9, %edx
915 ; SSE2-SSSE3-NEXT: orl %ecx, %edx
916 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
917 ; SSE2-SSSE3-NEXT: andl $1, %ecx
918 ; SSE2-SSSE3-NEXT: shll $10, %ecx
919 ; SSE2-SSSE3-NEXT: orl %edx, %ecx
920 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
921 ; SSE2-SSSE3-NEXT: andl $1, %edx
922 ; SSE2-SSSE3-NEXT: shll $11, %edx
923 ; SSE2-SSSE3-NEXT: orl %ecx, %edx
924 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
925 ; SSE2-SSSE3-NEXT: andl $1, %ecx
926 ; SSE2-SSSE3-NEXT: shll $12, %ecx
927 ; SSE2-SSSE3-NEXT: orl %edx, %ecx
928 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
929 ; SSE2-SSSE3-NEXT: andl $1, %edx
930 ; SSE2-SSSE3-NEXT: shll $13, %edx
931 ; SSE2-SSSE3-NEXT: orl %ecx, %edx
932 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
933 ; SSE2-SSSE3-NEXT: andl $1, %ecx
934 ; SSE2-SSSE3-NEXT: shll $14, %ecx
935 ; SSE2-SSSE3-NEXT: orl %edx, %ecx
936 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
937 ; SSE2-SSSE3-NEXT: shll $15, %edx
938 ; SSE2-SSSE3-NEXT: orl %ecx, %edx
939 ; SSE2-SSSE3-NEXT: orl %eax, %edx
940 ; SSE2-SSSE3-NEXT: movw %dx, -{{[0-9]+}}(%rsp)
941 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
942 ; SSE2-SSSE3-NEXT: andl $1, %eax
943 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
944 ; SSE2-SSSE3-NEXT: andl $1, %ecx
945 ; SSE2-SSSE3-NEXT: leal (%rcx,%rax,2), %eax
946 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
947 ; SSE2-SSSE3-NEXT: andl $1, %ecx
948 ; SSE2-SSSE3-NEXT: leal (%rax,%rcx,4), %eax
949 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
950 ; SSE2-SSSE3-NEXT: andl $1, %ecx
951 ; SSE2-SSSE3-NEXT: leal (%rax,%rcx,8), %eax
952 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
953 ; SSE2-SSSE3-NEXT: andl $1, %ecx
954 ; SSE2-SSSE3-NEXT: shll $4, %ecx
955 ; SSE2-SSSE3-NEXT: orl %eax, %ecx
956 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
957 ; SSE2-SSSE3-NEXT: andl $1, %eax
958 ; SSE2-SSSE3-NEXT: shll $5, %eax
959 ; SSE2-SSSE3-NEXT: orl %ecx, %eax
960 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
961 ; SSE2-SSSE3-NEXT: andl $1, %ecx
962 ; SSE2-SSSE3-NEXT: shll $6, %ecx
963 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
964 ; SSE2-SSSE3-NEXT: andl $1, %edx
965 ; SSE2-SSSE3-NEXT: shll $7, %edx
966 ; SSE2-SSSE3-NEXT: orl %ecx, %edx
967 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
968 ; SSE2-SSSE3-NEXT: andl $1, %ecx
969 ; SSE2-SSSE3-NEXT: shll $8, %ecx
970 ; SSE2-SSSE3-NEXT: orl %edx, %ecx
971 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
972 ; SSE2-SSSE3-NEXT: andl $1, %edx
973 ; SSE2-SSSE3-NEXT: shll $9, %edx
974 ; SSE2-SSSE3-NEXT: orl %ecx, %edx
975 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
976 ; SSE2-SSSE3-NEXT: andl $1, %ecx
977 ; SSE2-SSSE3-NEXT: shll $10, %ecx
978 ; SSE2-SSSE3-NEXT: orl %edx, %ecx
979 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
980 ; SSE2-SSSE3-NEXT: andl $1, %edx
981 ; SSE2-SSSE3-NEXT: shll $11, %edx
982 ; SSE2-SSSE3-NEXT: orl %ecx, %edx
983 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
984 ; SSE2-SSSE3-NEXT: andl $1, %ecx
985 ; SSE2-SSSE3-NEXT: shll $12, %ecx
986 ; SSE2-SSSE3-NEXT: orl %edx, %ecx
987 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
988 ; SSE2-SSSE3-NEXT: andl $1, %edx
989 ; SSE2-SSSE3-NEXT: shll $13, %edx
990 ; SSE2-SSSE3-NEXT: orl %ecx, %edx
991 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
992 ; SSE2-SSSE3-NEXT: andl $1, %ecx
993 ; SSE2-SSSE3-NEXT: shll $14, %ecx
994 ; SSE2-SSSE3-NEXT: orl %edx, %ecx
995 ; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
996 ; SSE2-SSSE3-NEXT: shll $15, %edx
997 ; SSE2-SSSE3-NEXT: orl %ecx, %edx
998 ; SSE2-SSSE3-NEXT: orl %eax, %edx
999 ; SSE2-SSSE3-NEXT: movw %dx, -{{[0-9]+}}(%rsp)
1000 ; SSE2-SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
1001 ; SSE2-SSSE3-NEXT: movd %xmm0, %ecx
1002 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,0,1]
1003 ; SSE2-SSSE3-NEXT: movd %xmm0, %eax
1004 ; SSE2-SSSE3-NEXT: addl %ecx, %eax
7521005 ; SSE2-SSSE3-NEXT: retq
7531006 ;
754 ; AVX12-LABEL: bitcast_v64i8_to_v2i32:
755 ; AVX12: # %bb.0:
756 ; AVX12-NEXT: retq
1007 ; AVX1-LABEL: bitcast_v64i8_to_v2i32:
1008 ; AVX1: # %bb.0:
1009 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
1010 ; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm3
1011 ; AVX1-NEXT: vpextrb $1, %xmm3, %eax
1012 ; AVX1-NEXT: andl $1, %eax
1013 ; AVX1-NEXT: vpextrb $0, %xmm3, %ecx
1014 ; AVX1-NEXT: andl $1, %ecx
1015 ; AVX1-NEXT: leal (%rcx,%rax,2), %eax
1016 ; AVX1-NEXT: vpextrb $2, %xmm3, %ecx
1017 ; AVX1-NEXT: andl $1, %ecx
1018 ; AVX1-NEXT: leal (%rax,%rcx,4), %eax
1019 ; AVX1-NEXT: vpextrb $3, %xmm3, %ecx
1020 ; AVX1-NEXT: andl $1, %ecx
1021 ; AVX1-NEXT: leal (%rax,%rcx,8), %eax
1022 ; AVX1-NEXT: vpextrb $4, %xmm3, %ecx
1023 ; AVX1-NEXT: andl $1, %ecx
1024 ; AVX1-NEXT: shll $4, %ecx
1025 ; AVX1-NEXT: orl %eax, %ecx
1026 ; AVX1-NEXT: vpextrb $5, %xmm3, %eax
1027 ; AVX1-NEXT: andl $1, %eax
1028 ; AVX1-NEXT: shll $5, %eax
1029 ; AVX1-NEXT: orl %ecx, %eax
1030 ; AVX1-NEXT: vpextrb $6, %xmm3, %ecx
1031 ; AVX1-NEXT: andl $1, %ecx
1032 ; AVX1-NEXT: shll $6, %ecx
1033 ; AVX1-NEXT: vpextrb $7, %xmm3, %edx
1034 ; AVX1-NEXT: andl $1, %edx
1035 ; AVX1-NEXT: shll $7, %edx
1036 ; AVX1-NEXT: orl %ecx, %edx
1037 ; AVX1-NEXT: vpextrb $8, %xmm3, %ecx
1038 ; AVX1-NEXT: andl $1, %ecx
1039 ; AVX1-NEXT: shll $8, %ecx
1040 ; AVX1-NEXT: orl %edx, %ecx
1041 ; AVX1-NEXT: vpextrb $9, %xmm3, %edx
1042 ; AVX1-NEXT: andl $1, %edx
1043 ; AVX1-NEXT: shll $9, %edx
1044 ; AVX1-NEXT: orl %ecx, %edx
1045 ; AVX1-NEXT: vpextrb $10, %xmm3, %ecx
1046 ; AVX1-NEXT: andl $1, %ecx
1047 ; AVX1-NEXT: shll $10, %ecx
1048 ; AVX1-NEXT: orl %edx, %ecx
1049 ; AVX1-NEXT: vpextrb $11, %xmm3, %edx
1050 ; AVX1-NEXT: andl $1, %edx
1051 ; AVX1-NEXT: shll $11, %edx
1052 ; AVX1-NEXT: orl %ecx, %edx
1053 ; AVX1-NEXT: vpextrb $12, %xmm3, %ecx
1054 ; AVX1-NEXT: andl $1, %ecx
1055 ; AVX1-NEXT: shll $12, %ecx
1056 ; AVX1-NEXT: orl %edx, %ecx
1057 ; AVX1-NEXT: vpextrb $13, %xmm3, %edx
1058 ; AVX1-NEXT: andl $1, %edx
1059 ; AVX1-NEXT: shll $13, %edx
1060 ; AVX1-NEXT: orl %ecx, %edx
1061 ; AVX1-NEXT: vpextrb $14, %xmm3, %ecx
1062 ; AVX1-NEXT: andl $1, %ecx
1063 ; AVX1-NEXT: shll $14, %ecx
1064 ; AVX1-NEXT: orl %edx, %ecx
1065 ; AVX1-NEXT: vpextrb $15, %xmm3, %edx
1066 ; AVX1-NEXT: andl $1, %edx
1067 ; AVX1-NEXT: shll $15, %edx
1068 ; AVX1-NEXT: orl %ecx, %edx
1069 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
1070 ; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1
1071 ; AVX1-NEXT: vpextrb $0, %xmm1, %ecx
1072 ; AVX1-NEXT: andl $1, %ecx
1073 ; AVX1-NEXT: shll $16, %ecx
1074 ; AVX1-NEXT: orl %edx, %ecx
1075 ; AVX1-NEXT: vpextrb $1, %xmm1, %edx
1076 ; AVX1-NEXT: andl $1, %edx
1077 ; AVX1-NEXT: shll $17, %edx
1078 ; AVX1-NEXT: orl %ecx, %edx
1079 ; AVX1-NEXT: vpextrb $2, %xmm1, %ecx
1080 ; AVX1-NEXT: andl $1, %ecx
1081 ; AVX1-NEXT: shll $18, %ecx
1082 ; AVX1-NEXT: orl %edx, %ecx
1083 ; AVX1-NEXT: vpextrb $3, %xmm1, %edx
1084 ; AVX1-NEXT: andl $1, %edx
1085 ; AVX1-NEXT: shll $19, %edx
1086 ; AVX1-NEXT: orl %ecx, %edx
1087 ; AVX1-NEXT: vpextrb $4, %xmm1, %ecx
1088 ; AVX1-NEXT: andl $1, %ecx
1089 ; AVX1-NEXT: shll $20, %ecx
1090 ; AVX1-NEXT: orl %edx, %ecx
1091 ; AVX1-NEXT: vpextrb $5, %xmm1, %edx
1092 ; AVX1-NEXT: andl $1, %edx
1093 ; AVX1-NEXT: shll $21, %edx
1094 ; AVX1-NEXT: orl %ecx, %edx
1095 ; AVX1-NEXT: vpextrb $6, %xmm1, %ecx
1096 ; AVX1-NEXT: andl $1, %ecx
1097 ; AVX1-NEXT: shll $22, %ecx
1098 ; AVX1-NEXT: orl %edx, %ecx
1099 ; AVX1-NEXT: vpextrb $7, %xmm1, %edx
1100 ; AVX1-NEXT: andl $1, %edx
1101 ; AVX1-NEXT: shll $23, %edx
1102 ; AVX1-NEXT: orl %ecx, %edx
1103 ; AVX1-NEXT: vpextrb $8, %xmm1, %ecx
1104 ; AVX1-NEXT: andl $1, %ecx
1105 ; AVX1-NEXT: shll $24, %ecx
1106 ; AVX1-NEXT: orl %edx, %ecx
1107 ; AVX1-NEXT: vpextrb $9, %xmm1, %edx
1108 ; AVX1-NEXT: andl $1, %edx
1109 ; AVX1-NEXT: shll $25, %edx
1110 ; AVX1-NEXT: orl %ecx, %edx
1111 ; AVX1-NEXT: vpextrb $10, %xmm1, %ecx
1112 ; AVX1-NEXT: andl $1, %ecx
1113 ; AVX1-NEXT: shll $26, %ecx
1114 ; AVX1-NEXT: orl %edx, %ecx
1115 ; AVX1-NEXT: vpextrb $11, %xmm1, %edx
1116 ; AVX1-NEXT: andl $1, %edx
1117 ; AVX1-NEXT: shll $27, %edx
1118 ; AVX1-NEXT: orl %ecx, %edx
1119 ; AVX1-NEXT: vpextrb $12, %xmm1, %ecx
1120 ; AVX1-NEXT: andl $1, %ecx
1121 ; AVX1-NEXT: shll $28, %ecx
1122 ; AVX1-NEXT: orl %edx, %ecx
1123 ; AVX1-NEXT: vpextrb $13, %xmm1, %edx
1124 ; AVX1-NEXT: andl $1, %edx
1125 ; AVX1-NEXT: shll $29, %edx
1126 ; AVX1-NEXT: orl %ecx, %edx
1127 ; AVX1-NEXT: vpextrb $14, %xmm1, %ecx
1128 ; AVX1-NEXT: andl $1, %ecx
1129 ; AVX1-NEXT: shll $30, %ecx
1130 ; AVX1-NEXT: orl %edx, %ecx
1131 ; AVX1-NEXT: vpextrb $15, %xmm1, %edx
1132 ; AVX1-NEXT: shll $31, %edx
1133 ; AVX1-NEXT: orl %ecx, %edx
1134 ; AVX1-NEXT: orl %eax, %edx
1135 ; AVX1-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
1136 ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm1
1137 ; AVX1-NEXT: vpextrb $1, %xmm1, %eax
1138 ; AVX1-NEXT: andl $1, %eax
1139 ; AVX1-NEXT: vpextrb $0, %xmm1, %ecx
1140 ; AVX1-NEXT: andl $1, %ecx
1141 ; AVX1-NEXT: leal (%rcx,%rax,2), %eax
1142 ; AVX1-NEXT: vpextrb $2, %xmm1, %ecx
1143 ; AVX1-NEXT: andl $1, %ecx
1144 ; AVX1-NEXT: leal (%rax,%rcx,4), %eax
1145 ; AVX1-NEXT: vpextrb $3, %xmm1, %ecx
1146 ; AVX1-NEXT: andl $1, %ecx
1147 ; AVX1-NEXT: leal (%rax,%rcx,8), %eax
1148 ; AVX1-NEXT: vpextrb $4, %xmm1, %ecx
1149 ; AVX1-NEXT: andl $1, %ecx
1150 ; AVX1-NEXT: shll $4, %ecx
1151 ; AVX1-NEXT: orl %eax, %ecx
1152 ; AVX1-NEXT: vpextrb $5, %xmm1, %eax
1153 ; AVX1-NEXT: andl $1, %eax
1154 ; AVX1-NEXT: shll $5, %eax
1155 ; AVX1-NEXT: orl %ecx, %eax
1156 ; AVX1-NEXT: vpextrb $6, %xmm1, %ecx
1157 ; AVX1-NEXT: andl $1, %ecx
1158 ; AVX1-NEXT: shll $6, %ecx
1159 ; AVX1-NEXT: vpextrb $7, %xmm1, %edx
1160 ; AVX1-NEXT: andl $1, %edx
1161 ; AVX1-NEXT: shll $7, %edx
1162 ; AVX1-NEXT: orl %ecx, %edx
1163 ; AVX1-NEXT: vpextrb $8, %xmm1, %ecx
1164 ; AVX1-NEXT: andl $1, %ecx
1165 ; AVX1-NEXT: shll $8, %ecx
1166 ; AVX1-NEXT: orl %edx, %ecx
1167 ; AVX1-NEXT: vpextrb $9, %xmm1, %edx
1168 ; AVX1-NEXT: andl $1, %edx
1169 ; AVX1-NEXT: shll $9, %edx
1170 ; AVX1-NEXT: orl %ecx, %edx
1171 ; AVX1-NEXT: vpextrb $10, %xmm1, %ecx
1172 ; AVX1-NEXT: andl $1, %ecx
1173 ; AVX1-NEXT: shll $10, %ecx
1174 ; AVX1-NEXT: orl %edx, %ecx
1175 ; AVX1-NEXT: vpextrb $11, %xmm1, %edx
1176 ; AVX1-NEXT: andl $1, %edx
1177 ; AVX1-NEXT: shll $11, %edx
1178 ; AVX1-NEXT: orl %ecx, %edx
1179 ; AVX1-NEXT: vpextrb $12, %xmm1, %ecx
1180 ; AVX1-NEXT: andl $1, %ecx
1181 ; AVX1-NEXT: shll $12, %ecx
1182 ; AVX1-NEXT: orl %edx, %ecx
1183 ; AVX1-NEXT: vpextrb $13, %xmm1, %edx
1184 ; AVX1-NEXT: andl $1, %edx
1185 ; AVX1-NEXT: shll $13, %edx
1186 ; AVX1-NEXT: orl %ecx, %edx
1187 ; AVX1-NEXT: vpextrb $14, %xmm1, %ecx
1188 ; AVX1-NEXT: andl $1, %ecx
1189 ; AVX1-NEXT: shll $14, %ecx
1190 ; AVX1-NEXT: orl %edx, %ecx
1191 ; AVX1-NEXT: vpextrb $15, %xmm1, %edx
1192 ; AVX1-NEXT: andl $1, %edx
1193 ; AVX1-NEXT: shll $15, %edx
1194 ; AVX1-NEXT: orl %ecx, %edx
1195 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1196 ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0
1197 ; AVX1-NEXT: vpextrb $0, %xmm0, %ecx
1198 ; AVX1-NEXT: andl $1, %ecx
1199 ; AVX1-NEXT: shll $16, %ecx
1200 ; AVX1-NEXT: orl %edx, %ecx
1201 ; AVX1-NEXT: vpextrb $1, %xmm0, %edx
1202 ; AVX1-NEXT: andl $1, %edx
1203 ; AVX1-NEXT: shll $17, %edx
1204 ; AVX1-NEXT: orl %ecx, %edx
1205 ; AVX1-NEXT: vpextrb $2, %xmm0, %ecx
1206 ; AVX1-NEXT: andl $1, %ecx
1207 ; AVX1-NEXT: shll $18, %ecx
1208 ; AVX1-NEXT: orl %edx, %ecx
1209 ; AVX1-NEXT: vpextrb $3, %xmm0, %edx
1210 ; AVX1-NEXT: andl $1, %edx
1211 ; AVX1-NEXT: shll $19, %edx
1212 ; AVX1-NEXT: orl %ecx, %edx
1213 ; AVX1-NEXT: vpextrb $4, %xmm0, %ecx
1214 ; AVX1-NEXT: andl $1, %ecx
1215 ; AVX1-NEXT: shll $20, %ecx
1216 ; AVX1-NEXT: orl %edx, %ecx
1217 ; AVX1-NEXT: vpextrb $5, %xmm0, %edx
1218 ; AVX1-NEXT: andl $1, %edx
1219 ; AVX1-NEXT: shll $21, %edx
1220 ; AVX1-NEXT: orl %ecx, %edx
1221 ; AVX1-NEXT: vpextrb $6, %xmm0, %ecx
1222 ; AVX1-NEXT: andl $1, %ecx
1223 ; AVX1-NEXT: shll $22, %ecx
1224 ; AVX1-NEXT: orl %edx, %ecx
1225 ; AVX1-NEXT: vpextrb $7, %xmm0, %edx
1226 ; AVX1-NEXT: andl $1, %edx
1227 ; AVX1-NEXT: shll $23, %edx
1228 ; AVX1-NEXT: orl %ecx, %edx
1229 ; AVX1-NEXT: vpextrb $8, %xmm0, %ecx
1230 ; AVX1-NEXT: andl $1, %ecx
1231 ; AVX1-NEXT: shll $24, %ecx
1232 ; AVX1-NEXT: orl %edx, %ecx
1233 ; AVX1-NEXT: vpextrb $9, %xmm0, %edx
1234 ; AVX1-NEXT: andl $1, %edx
1235 ; AVX1-NEXT: shll $25, %edx
1236 ; AVX1-NEXT: orl %ecx, %edx
1237 ; AVX1-NEXT: vpextrb $10, %xmm0, %ecx
1238 ; AVX1-NEXT: andl $1, %ecx
1239 ; AVX1-NEXT: shll $26, %ecx
1240 ; AVX1-NEXT: orl %edx, %ecx
1241 ; AVX1-NEXT: vpextrb $11, %xmm0, %edx
1242 ; AVX1-NEXT: andl $1, %edx
1243 ; AVX1-NEXT: shll $27, %edx
1244 ; AVX1-NEXT: orl %ecx, %edx
1245 ; AVX1-NEXT: vpextrb $12, %xmm0, %ecx
1246 ; AVX1-NEXT: andl $1, %ecx
1247 ; AVX1-NEXT: shll $28, %ecx
1248 ; AVX1-NEXT: orl %edx, %ecx
1249 ; AVX1-NEXT: vpextrb $13, %xmm0, %edx
1250 ; AVX1-NEXT: andl $1, %edx
1251 ; AVX1-NEXT: shll $29, %edx
1252 ; AVX1-NEXT: orl %ecx, %edx
1253 ; AVX1-NEXT: vpextrb $14, %xmm0, %ecx
1254 ; AVX1-NEXT: andl $1, %ecx
1255 ; AVX1-NEXT: shll $30, %ecx
1256 ; AVX1-NEXT: orl %edx, %ecx
1257 ; AVX1-NEXT: vpextrb $15, %xmm0, %edx
1258 ; AVX1-NEXT: shll $31, %edx
1259 ; AVX1-NEXT: orl %ecx, %edx
1260 ; AVX1-NEXT: orl %eax, %edx
1261 ; AVX1-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
1262 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1263 ; AVX1-NEXT: vmovd %xmm0, %ecx
1264 ; AVX1-NEXT: vpextrd $1, %xmm0, %eax
1265 ; AVX1-NEXT: addl %ecx, %eax
1266 ; AVX1-NEXT: vzeroupper
1267 ; AVX1-NEXT: retq
1268 ;
1269 ; AVX2-LABEL: bitcast_v64i8_to_v2i32:
1270 ; AVX2: # %bb.0:
1271 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
1272 ; AVX2-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm1
1273 ; AVX2-NEXT: vpextrb $1, %xmm1, %eax
1274 ; AVX2-NEXT: andl $1, %eax
1275 ; AVX2-NEXT: vpextrb $0, %xmm1, %ecx
1276 ; AVX2-NEXT: andl $1, %ecx
1277 ; AVX2-NEXT: leal (%rcx,%rax,2), %eax
1278 ; AVX2-NEXT: vpextrb $2, %xmm1, %ecx
1279 ; AVX2-NEXT: andl $1, %ecx
1280 ; AVX2-NEXT: leal (%rax,%rcx,4), %eax
1281 ; AVX2-NEXT: vpextrb $3, %xmm1, %ecx
1282 ; AVX2-NEXT: andl $1, %ecx
1283 ; AVX2-NEXT: leal (%rax,%rcx,8), %eax
1284 ; AVX2-NEXT: vpextrb $4, %xmm1, %ecx
1285 ; AVX2-NEXT: andl $1, %ecx
1286 ; AVX2-NEXT: shll $4, %ecx
1287 ; AVX2-NEXT: orl %eax, %ecx
1288 ; AVX2-NEXT: vpextrb $5, %xmm1, %eax
1289 ; AVX2-NEXT: andl $1, %eax
1290 ; AVX2-NEXT: shll $5, %eax
1291 ; AVX2-NEXT: orl %ecx, %eax
1292 ; AVX2-NEXT: vpextrb $6, %xmm1, %ecx
1293 ; AVX2-NEXT: andl $1, %ecx
1294 ; AVX2-NEXT: shll $6, %ecx
1295 ; AVX2-NEXT: vpextrb $7, %xmm1, %edx
1296 ; AVX2-NEXT: andl $1, %edx
1297 ; AVX2-NEXT: shll $7, %edx
1298 ; AVX2-NEXT: orl %ecx, %edx
1299 ; AVX2-NEXT: vpextrb $8, %xmm1, %ecx
1300 ; AVX2-NEXT: andl $1, %ecx
1301 ; AVX2-NEXT: shll $8, %ecx
1302 ; AVX2-NEXT: orl %edx, %ecx
1303 ; AVX2-NEXT: vpextrb $9, %xmm1, %edx
1304 ; AVX2-NEXT: andl $1, %edx
1305 ; AVX2-NEXT: shll $9, %edx
1306 ; AVX2-NEXT: orl %ecx, %edx
1307 ; AVX2-NEXT: vpextrb $10, %xmm1, %ecx
1308 ; AVX2-NEXT: andl $1, %ecx
1309 ; AVX2-NEXT: shll $10, %ecx
1310 ; AVX2-NEXT: orl %edx, %ecx
1311 ; AVX2-NEXT: vpextrb $11, %xmm1, %edx
1312 ; AVX2-NEXT: andl $1, %edx
1313 ; AVX2-NEXT: shll $11, %edx
1314 ; AVX2-NEXT: orl %ecx, %edx
1315 ; AVX2-NEXT: vpextrb $12, %xmm1, %ecx
1316 ; AVX2-NEXT: andl $1, %ecx
1317 ; AVX2-NEXT: shll $12, %ecx
1318 ; AVX2-NEXT: orl %edx, %ecx
1319 ; AVX2-NEXT: vpextrb $13, %xmm1, %edx
1320 ; AVX2-NEXT: andl $1, %edx
1321 ; AVX2-NEXT: shll $13, %edx
1322 ; AVX2-NEXT: orl %ecx, %edx
1323 ; AVX2-NEXT: vpextrb $14, %xmm1, %ecx
1324 ; AVX2-NEXT: andl $1, %ecx
1325 ; AVX2-NEXT: shll $14, %ecx
1326 ; AVX2-NEXT: orl %edx, %ecx
1327 ; AVX2-NEXT: vpextrb $15, %xmm1, %edx
1328 ; AVX2-NEXT: andl $1, %edx
1329 ; AVX2-NEXT: shll $15, %edx
1330 ; AVX2-NEXT: orl %ecx, %edx
1331 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
1332 ; AVX2-NEXT: vpextrb $0, %xmm1, %ecx
1333 ; AVX2-NEXT: andl $1, %ecx
1334 ; AVX2-NEXT: shll $16, %ecx
1335 ; AVX2-NEXT: orl %edx, %ecx
1336 ; AVX2-NEXT: vpextrb $1, %xmm1, %edx
1337 ; AVX2-NEXT: andl $1, %edx
1338 ; AVX2-NEXT: shll $17, %edx
1339 ; AVX2-NEXT: orl %ecx, %edx
1340 ; AVX2-NEXT: vpextrb $2, %xmm1, %ecx
1341 ; AVX2-NEXT: andl $1, %ecx
1342 ; AVX2-NEXT: shll $18, %ecx
1343 ; AVX2-NEXT: orl %edx, %ecx
1344 ; AVX2-NEXT: vpextrb $3, %xmm1, %edx
1345 ; AVX2-NEXT: andl $1, %edx
1346 ; AVX2-NEXT: shll $19, %edx
1347 ; AVX2-NEXT: orl %ecx, %edx
1348 ; AVX2-NEXT: vpextrb $4, %xmm1, %ecx
1349 ; AVX2-NEXT: andl $1, %ecx
1350 ; AVX2-NEXT: shll $20, %ecx
1351 ; AVX2-NEXT: orl %edx, %ecx
1352 ; AVX2-NEXT: vpextrb $5, %xmm1, %edx
1353 ; AVX2-NEXT: andl $1, %edx
1354 ; AVX2-NEXT: shll $21, %edx
1355 ; AVX2-NEXT: orl %ecx, %edx
1356 ; AVX2-NEXT: vpextrb $6, %xmm1, %ecx
1357 ; AVX2-NEXT: andl $1, %ecx
1358 ; AVX2-NEXT: shll $22, %ecx
1359 ; AVX2-NEXT: orl %edx, %ecx
1360 ; AVX2-NEXT: vpextrb $7, %xmm1, %edx
1361 ; AVX2-NEXT: andl $1, %edx
1362 ; AVX2-NEXT: shll $23, %edx
1363 ; AVX2-NEXT: orl %ecx, %edx
1364 ; AVX2-NEXT: vpextrb $8, %xmm1, %ecx
1365 ; AVX2-NEXT: andl $1, %ecx
1366 ; AVX2-NEXT: shll $24, %ecx
1367 ; AVX2-NEXT: orl %edx, %ecx
1368 ; AVX2-NEXT: vpextrb $9, %xmm1, %edx
1369 ; AVX2-NEXT: andl $1, %edx
1370 ; AVX2-NEXT: shll $25, %edx
1371 ; AVX2-NEXT: orl %ecx, %edx
1372 ; AVX2-NEXT: vpextrb $10, %xmm1, %ecx
1373 ; AVX2-NEXT: andl $1, %ecx
1374 ; AVX2-NEXT: shll $26, %ecx
1375 ; AVX2-NEXT: orl %edx, %ecx
1376 ; AVX2-NEXT: vpextrb $11, %xmm1, %edx
1377 ; AVX2-NEXT: andl $1, %edx
1378 ; AVX2-NEXT: shll $27, %edx
1379 ; AVX2-NEXT: orl %ecx, %edx
1380 ; AVX2-NEXT: vpextrb $12, %xmm1, %ecx
1381 ; AVX2-NEXT: andl $1, %ecx
1382 ; AVX2-NEXT: shll $28, %ecx
1383 ; AVX2-NEXT: orl %edx, %ecx
1384 ; AVX2-NEXT: vpextrb $13, %xmm1, %edx
1385 ; AVX2-NEXT: andl $1, %edx
1386 ; AVX2-NEXT: shll $29, %edx
1387 ; AVX2-NEXT: orl %ecx, %edx
1388 ; AVX2-NEXT: vpextrb $14, %xmm1, %ecx
1389 ; AVX2-NEXT: andl $1, %ecx
1390 ; AVX2-NEXT: shll $30, %ecx
1391 ; AVX2-NEXT: orl %edx, %ecx
1392 ; AVX2-NEXT: vpextrb $15, %xmm1, %edx
1393 ; AVX2-NEXT: shll $31, %edx
1394 ; AVX2-NEXT: orl %ecx, %edx
1395 ; AVX2-NEXT: orl %eax, %edx
1396 ; AVX2-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
1397 ; AVX2-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0
1398 ; AVX2-NEXT: vpextrb $1, %xmm0, %eax
1399 ; AVX2-NEXT: andl $1, %eax
1400 ; AVX2-NEXT: vpextrb $0, %xmm0, %ecx
1401 ; AVX2-NEXT: andl $1, %ecx
1402 ; AVX2-NEXT: leal (%rcx,%rax,2), %eax
1403 ; AVX2-NEXT: vpextrb $2, %xmm0, %ecx
1404 ; AVX2-NEXT: andl $1, %ecx
1405 ; AVX2-NEXT: leal (%rax,%rcx,4), %eax
1406 ; AVX2-NEXT: vpextrb $3, %xmm0, %ecx
1407 ; AVX2-NEXT: andl $1, %ecx
1408 ; AVX2-NEXT: leal (%rax,%rcx,8), %eax
1409 ; AVX2-NEXT: vpextrb $4, %xmm0, %ecx
1410 ; AVX2-NEXT: andl $1, %ecx
1411 ; AVX2-NEXT: shll $4, %ecx
1412 ; AVX2-NEXT: orl %eax, %ecx
1413 ; AVX2-NEXT: vpextrb $5, %xmm0, %eax
1414 ; AVX2-NEXT: andl $1, %eax
1415 ; AVX2-NEXT: shll $5, %eax
1416 ; AVX2-NEXT: orl %ecx, %eax
1417 ; AVX2-NEXT: vpextrb $6, %xmm0, %ecx
1418 ; AVX2-NEXT: andl $1, %ecx
1419 ; AVX2-NEXT: shll $6, %ecx
1420 ; AVX2-NEXT: vpextrb $7, %xmm0, %edx
1421 ; AVX2-NEXT: andl $1, %edx
1422 ; AVX2-NEXT: shll $7, %edx
1423 ; AVX2-NEXT: orl %ecx, %edx
1424 ; AVX2-NEXT: vpextrb $8, %xmm0, %ecx
1425 ; AVX2-NEXT: andl $1, %ecx
1426 ; AVX2-NEXT: shll $8, %ecx
1427 ; AVX2-NEXT: orl %edx, %ecx
1428 ; AVX2-NEXT: vpextrb $9, %xmm0, %edx
1429 ; AVX2-NEXT: andl $1, %edx
1430 ; AVX2-NEXT: shll $9, %edx
1431 ; AVX2-NEXT: orl %ecx, %edx
1432 ; AVX2-NEXT: vpextrb $10, %xmm0, %ecx
1433 ; AVX2-NEXT: andl $1, %ecx
1434 ; AVX2-NEXT: shll $10, %ecx
1435 ; AVX2-NEXT: orl %edx, %ecx
1436 ; AVX2-NEXT: vpextrb $11, %xmm0, %edx
1437 ; AVX2-NEXT: andl $1, %edx
1438 ; AVX2-NEXT: shll $11, %edx
1439 ; AVX2-NEXT: orl %ecx, %edx
1440 ; AVX2-NEXT: vpextrb $12, %xmm0, %ecx
1441 ; AVX2-NEXT: andl $1, %ecx
1442 ; AVX2-NEXT: shll $12, %ecx
1443 ; AVX2-NEXT: orl %edx, %ecx
1444 ; AVX2-NEXT: vpextrb $13, %xmm0, %edx
1445 ; AVX2-NEXT: andl $1, %edx
1446 ; AVX2-NEXT: shll $13, %edx
1447 ; AVX2-NEXT: orl %ecx, %edx
1448 ; AVX2-NEXT: vpextrb $14, %xmm0, %ecx
1449 ; AVX2-NEXT: andl $1, %ecx
1450 ; AVX2-NEXT: shll $14, %ecx
1451 ; AVX2-NEXT: orl %edx, %ecx
1452 ; AVX2-NEXT: vpextrb $15, %xmm0, %edx
1453 ; AVX2-NEXT: andl $1, %edx
1454 ; AVX2-NEXT: shll $15, %edx
1455 ; AVX2-NEXT: orl %ecx, %edx
1456 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1457 ; AVX2-NEXT: vpextrb $0, %xmm0, %ecx
1458 ; AVX2-NEXT: andl $1, %ecx
1459 ; AVX2-NEXT: shll $16, %ecx
1460 ; AVX2-NEXT: orl %edx, %ecx
1461 ; AVX2-NEXT: vpextrb $1, %xmm0, %edx
1462 ; AVX2-NEXT: andl $1, %edx
1463 ; AVX2-NEXT: shll $17, %edx
1464 ; AVX2-NEXT: orl %ecx, %edx
1465 ; AVX2-NEXT: vpextrb $2, %xmm0, %ecx
1466 ; AVX2-NEXT: andl $1, %ecx
1467 ; AVX2-NEXT: shll $18, %ecx
1468 ; AVX2-NEXT: orl %edx, %ecx
1469 ; AVX2-NEXT: vpextrb $3, %xmm0, %edx
1470 ; AVX2-NEXT: andl $1, %edx
1471 ; AVX2-NEXT: shll $19, %edx
1472 ; AVX2-NEXT: orl %ecx, %edx
1473 ; AVX2-NEXT: vpextrb $4, %xmm0, %ecx
1474 ; AVX2-NEXT: andl $1, %ecx
1475 ; AVX2-NEXT: shll $20, %ecx
1476 ; AVX2-NEXT: orl %edx, %ecx
1477 ; AVX2-NEXT: vpextrb $5, %xmm0, %edx
1478 ; AVX2-NEXT: andl $1, %edx
1479 ; AVX2-NEXT: shll $21, %edx
1480 ; AVX2-NEXT: orl %ecx, %edx
1481 ; AVX2-NEXT: vpextrb $6, %xmm0, %ecx
1482 ; AVX2-NEXT: andl $1, %ecx
1483 ; AVX2-NEXT: shll $22, %ecx
1484 ; AVX2-NEXT: orl %edx, %ecx
1485 ; AVX2-NEXT: vpextrb $7, %xmm0, %edx
1486 ; AVX2-NEXT: andl $1, %edx
1487 ; AVX2-NEXT: shll $23, %edx
1488 ; AVX2-NEXT: orl %ecx, %edx
1489 ; AVX2-NEXT: vpextrb $8, %xmm0, %ecx
1490 ; AVX2-NEXT: andl $1, %ecx
1491 ; AVX2-NEXT: shll $24, %ecx
1492 ; AVX2-NEXT: orl %edx, %ecx
1493 ; AVX2-NEXT: vpextrb $9, %xmm0, %edx
1494 ; AVX2-NEXT: andl $1, %edx
1495 ; AVX2-NEXT: shll $25, %edx
1496 ; AVX2-NEXT: orl %ecx, %edx
1497 ; AVX2-NEXT: vpextrb $10, %xmm0, %ecx
1498 ; AVX2-NEXT: andl $1, %ecx
1499 ; AVX2-NEXT: shll $26, %ecx
1500 ; AVX2-NEXT: orl %edx, %ecx
1501 ; AVX2-NEXT: vpextrb $11, %xmm0, %edx
1502 ; AVX2-NEXT: andl $1, %edx
1503 ; AVX2-NEXT: shll $27, %edx
1504 ; AVX2-NEXT: orl %ecx, %edx
1505 ; AVX2-NEXT: vpextrb $12, %xmm0, %ecx
1506 ; AVX2-NEXT: andl $1, %ecx
1507 ; AVX2-NEXT: shll $28, %ecx
1508 ; AVX2-NEXT: orl %edx, %ecx
1509 ; AVX2-NEXT: vpextrb $13, %xmm0, %edx
1510 ; AVX2-NEXT: andl $1, %edx
1511 ; AVX2-NEXT: shll $29, %edx
1512 ; AVX2-NEXT: orl %ecx, %edx
1513 ; AVX2-NEXT: vpextrb $14, %xmm0, %ecx
1514 ; AVX2-NEXT: andl $1, %ecx
1515 ; AVX2-NEXT: shll $30, %ecx
1516 ; AVX2-NEXT: orl %edx, %ecx
1517 ; AVX2-NEXT: vpextrb $15, %xmm0, %edx
1518 ; AVX2-NEXT: shll $31, %edx
1519 ; AVX2-NEXT: orl %ecx, %edx
1520 ; AVX2-NEXT: orl %eax, %edx
1521 ; AVX2-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
1522 ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1523 ; AVX2-NEXT: vmovd %xmm0, %ecx
1524 ; AVX2-NEXT: vpextrd $1, %xmm0, %eax
1525 ; AVX2-NEXT: addl %ecx, %eax
1526 ; AVX2-NEXT: vzeroupper
1527 ; AVX2-NEXT: retq
7571528 ;
7581529 ; AVX512-LABEL: bitcast_v64i8_to_v2i32:
7591530 ; AVX512: # %bb.0:
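The remaining hunks are the vector-half-conversions.ll changes, i.e. the previously unknown miscompile the commit message mentions. A reconstruction from the IR lines visible in the hunk (abridged; the undef/zero shuffle and store variants follow the same shape):

    define <4 x i16> @cvt_4f32_to_4i16(<4 x float> %a0) nounwind {
      ; The <4 x half> value hits the same split-input path while the
      ; <4 x i16> result is promoted. The old lowering shll/or-packed
      ; the converted halves into one scalar; the new one stores each
      ; vcvtps2ph result word to the stack and reloads the vector with
      ; a single vpmovzxwd.
      %1 = fptrunc <4 x float> %a0 to <4 x half>
      %2 = bitcast <4 x half> %1 to <4 x i16>
      ret <4 x i16> %2
    }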
test/CodeGen/X86/vector-half-conversions.ll:

15781578 define <4 x i16> @cvt_4f32_to_4i16(<4 x float> %a0) nounwind {
15791579 ; ALL-LABEL: cvt_4f32_to_4i16:
15801580 ; ALL: # %bb.0:
1581 ; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1582 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1581 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
15831582 ; ALL-NEXT: vmovd %xmm1, %eax
1584 ; ALL-NEXT: shll $16, %eax
1585 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1586 ; ALL-NEXT: vmovd %xmm1, %ecx
1587 ; ALL-NEXT: movzwl %cx, %ecx
1588 ; ALL-NEXT: orl %eax, %ecx
1583 ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
15891584 ; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
15901585 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
15911586 ; ALL-NEXT: vmovd %xmm1, %eax
1592 ; ALL-NEXT: shll $16, %eax
1593 ; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1587 ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
1588 ; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1589 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1590 ; ALL-NEXT: vmovd %xmm1, %eax
1591 ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
1592 ; ALL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
15941593 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1595 ; ALL-NEXT: vmovd %xmm0, %edx
1596 ; ALL-NEXT: movzwl %dx, %edx
1597 ; ALL-NEXT: orl %eax, %edx
1598 ; ALL-NEXT: shlq $32, %rdx
1599 ; ALL-NEXT: orq %rcx, %rdx
1600 ; ALL-NEXT: vmovq %rdx, %xmm0
1594 ; ALL-NEXT: vmovd %xmm0, %eax
1595 ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
1596 ; ALL-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
16011597 ; ALL-NEXT: retq
16021598 %1 = fptrunc <4 x float> %a0 to <4 x half>
16031599 %2 = bitcast <4 x half> %1 to <4 x i16>
16071603 define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) nounwind {
16081604 ; ALL-LABEL: cvt_4f32_to_8i16_undef:
16091605 ; ALL: # %bb.0:
1610 ; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1611 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1606 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
16121607 ; ALL-NEXT: vmovd %xmm1, %eax
1613 ; ALL-NEXT: shll $16, %eax
1614 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1615 ; ALL-NEXT: vmovd %xmm1, %ecx
1616 ; ALL-NEXT: movzwl %cx, %ecx
1617 ; ALL-NEXT: orl %eax, %ecx
1608 ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
16181609 ; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
16191610 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
16201611 ; ALL-NEXT: vmovd %xmm1, %eax
1621 ; ALL-NEXT: shll $16, %eax
1622 ; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1612 ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
1613 ; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1614 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1615 ; ALL-NEXT: vmovd %xmm1, %eax
1616 ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
1617 ; ALL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
16231618 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1624 ; ALL-NEXT: vmovd %xmm0, %edx
1625 ; ALL-NEXT: movzwl %dx, %edx
1626 ; ALL-NEXT: orl %eax, %edx
1627 ; ALL-NEXT: shlq $32, %rdx
1628 ; ALL-NEXT: orq %rcx, %rdx
1629 ; ALL-NEXT: vmovq %rdx, %xmm0
1630 ; ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1619 ; ALL-NEXT: vmovd %xmm0, %eax
1620 ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
1621 ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
16311622 ; ALL-NEXT: retq
16321623 %1 = fptrunc <4 x float> %a0 to <4 x half>
16331624 %2 = bitcast <4 x half> %1 to <4 x i16>
16361627 }
16371628
16381629 define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) nounwind {
1639 ; AVX1-LABEL: cvt_4f32_to_8i16_zero:
1640 ; AVX1: # %bb.0:
1641 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1642 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1643 ; AVX1-NEXT: vmovd %xmm1, %eax
1644 ; AVX1-NEXT: shll $16, %eax
1645 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1646 ; AVX1-NEXT: vmovd %xmm1, %ecx
1647 ; AVX1-NEXT: movzwl %cx, %ecx
1648 ; AVX1-NEXT: orl %eax, %ecx
1649 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1650 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1651 ; AVX1-NEXT: vmovd %xmm1, %eax
1652 ; AVX1-NEXT: shll $16, %eax
1653 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1654 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1655 ; AVX1-NEXT: vmovd %xmm0, %edx
1656 ; AVX1-NEXT: movzwl %dx, %edx
1657 ; AVX1-NEXT: orl %eax, %edx
1658 ; AVX1-NEXT: shlq $32, %rdx
1659 ; AVX1-NEXT: orq %rcx, %rdx
1660 ; AVX1-NEXT: vmovq %rdx, %xmm0
1661 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1662 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
1663 ; AVX1-NEXT: retq
1664 ;
1665 ; AVX2-SLOW-LABEL: cvt_4f32_to_8i16_zero:
1666 ; AVX2-SLOW: # %bb.0:
1667 ; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1668 ; AVX2-SLOW-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1669 ; AVX2-SLOW-NEXT: vmovd %xmm1, %eax
1670 ; AVX2-SLOW-NEXT: shll $16, %eax
1671 ; AVX2-SLOW-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1672 ; AVX2-SLOW-NEXT: vmovd %xmm1, %ecx
1673 ; AVX2-SLOW-NEXT: movzwl %cx, %ecx
1674 ; AVX2-SLOW-NEXT: orl %eax, %ecx
1675 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1676 ; AVX2-SLOW-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1677 ; AVX2-SLOW-NEXT: vmovd %xmm1, %eax
1678 ; AVX2-SLOW-NEXT: shll $16, %eax
1679 ; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1680 ; AVX2-SLOW-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1681 ; AVX2-SLOW-NEXT: vmovd %xmm0, %edx
1682 ; AVX2-SLOW-NEXT: movzwl %dx, %edx
1683 ; AVX2-SLOW-NEXT: orl %eax, %edx
1684 ; AVX2-SLOW-NEXT: shlq $32, %rdx
1685 ; AVX2-SLOW-NEXT: orq %rcx, %rdx
1686 ; AVX2-SLOW-NEXT: vmovq %rdx, %xmm0
1687 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1688 ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
1689 ; AVX2-SLOW-NEXT: retq
1690 ;
1691 ; AVX2-FAST-LABEL: cvt_4f32_to_8i16_zero:
1692 ; AVX2-FAST: # %bb.0:
1693 ; AVX2-FAST-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1694 ; AVX2-FAST-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1695 ; AVX2-FAST-NEXT: vmovd %xmm1, %eax
1696 ; AVX2-FAST-NEXT: shll $16, %eax
1697 ; AVX2-FAST-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1698 ; AVX2-FAST-NEXT: vmovd %xmm1, %ecx
1699 ; AVX2-FAST-NEXT: movzwl %cx, %ecx
1700 ; AVX2-FAST-NEXT: orl %eax, %ecx
1701 ; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1702 ; AVX2-FAST-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1703 ; AVX2-FAST-NEXT: vmovd %xmm1, %eax
1704 ; AVX2-FAST-NEXT: shll $16, %eax
1705 ; AVX2-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1706 ; AVX2-FAST-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1707 ; AVX2-FAST-NEXT: vmovd %xmm0, %edx
1708 ; AVX2-FAST-NEXT: movzwl %dx, %edx
1709 ; AVX2-FAST-NEXT: orl %eax, %edx
1710 ; AVX2-FAST-NEXT: shlq $32, %rdx
1711 ; AVX2-FAST-NEXT: orq %rcx, %rdx
1712 ; AVX2-FAST-NEXT: vmovq %rdx, %xmm0
1713 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero
1714 ; AVX2-FAST-NEXT: retq
1715 ;
1716 ; AVX512F-LABEL: cvt_4f32_to_8i16_zero:
1717 ; AVX512F: # %bb.0:
1718 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1719 ; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1720 ; AVX512F-NEXT: vmovd %xmm1, %eax
1721 ; AVX512F-NEXT: shll $16, %eax
1722 ; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1723 ; AVX512F-NEXT: vmovd %xmm1, %ecx
1724 ; AVX512F-NEXT: movzwl %cx, %ecx
1725 ; AVX512F-NEXT: orl %eax, %ecx
1726 ; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1727 ; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1728 ; AVX512F-NEXT: vmovd %xmm1, %eax
1729 ; AVX512F-NEXT: shll $16, %eax
1730 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1731 ; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1732 ; AVX512F-NEXT: vmovd %xmm0, %edx
1733 ; AVX512F-NEXT: movzwl %dx, %edx
1734 ; AVX512F-NEXT: orl %eax, %edx
1735 ; AVX512F-NEXT: shlq $32, %rdx
1736 ; AVX512F-NEXT: orq %rcx, %rdx
1737 ; AVX512F-NEXT: vmovq %rdx, %xmm0
1738 ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1739 ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
1740 ; AVX512F-NEXT: retq
1741 ;
1742 ; AVX512VL-LABEL: cvt_4f32_to_8i16_zero:
1743 ; AVX512VL: # %bb.0:
1744 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1745 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1746 ; AVX512VL-NEXT: vmovd %xmm1, %eax
1747 ; AVX512VL-NEXT: shll $16, %eax
1748 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1749 ; AVX512VL-NEXT: vmovd %xmm1, %ecx
1750 ; AVX512VL-NEXT: movzwl %cx, %ecx
1751 ; AVX512VL-NEXT: orl %eax, %ecx
1752 ; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1753 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1754 ; AVX512VL-NEXT: vmovd %xmm1, %eax
1755 ; AVX512VL-NEXT: shll $16, %eax
1756 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1757 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1758 ; AVX512VL-NEXT: vmovd %xmm0, %edx
1759 ; AVX512VL-NEXT: movzwl %dx, %edx
1760 ; AVX512VL-NEXT: orl %eax, %edx
1761 ; AVX512VL-NEXT: shlq $32, %rdx
1762 ; AVX512VL-NEXT: orq %rcx, %rdx
1763 ; AVX512VL-NEXT: vmovq %rdx, %xmm0
1764 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero
1765 ; AVX512VL-NEXT: retq
1630 ; ALL-LABEL: cvt_4f32_to_8i16_zero:
1631 ; ALL: # %bb.0:
1632 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1633 ; ALL-NEXT: vmovd %xmm1, %eax
1634 ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
1635 ; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1636 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1637 ; ALL-NEXT: vmovd %xmm1, %eax
1638 ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
1639 ; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1640 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1641 ; ALL-NEXT: vmovd %xmm1, %eax
1642 ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
1643 ; ALL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
1644 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1645 ; ALL-NEXT: vmovd %xmm0, %eax
1646 ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
1647 ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
1648 ; ALL-NEXT: retq
17661649 %1 = fptrunc <4 x float> %a0 to <4 x half>
17671650 %2 = bitcast <4 x half> %1 to <4 x i16>
17681651 %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32>
20731956 define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) nounwind {
20741957 ; ALL-LABEL: store_cvt_4f32_to_8i16_undef:
20751958 ; ALL: # %bb.0:
2076 ; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2077 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1959 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
20781960 ; ALL-NEXT: vmovd %xmm1, %eax
2079 ; ALL-NEXT: shll $16, %eax
2080 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
2081 ; ALL-NEXT: vmovd %xmm1, %ecx
2082 ; ALL-NEXT: movzwl %cx, %ecx
2083 ; ALL-NEXT: orl %eax, %ecx
1961 ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
20841962 ; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
20851963 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
20861964 ; ALL-NEXT: vmovd %xmm1, %eax
2087 ; ALL-NEXT: shll $16, %eax
2088 ; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1965 ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
1966 ; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1967 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1968 ; ALL-NEXT: vmovd %xmm1, %eax
1969 ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
1970 ; ALL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
20891971 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2090 ; ALL-NEXT: vmovd %xmm0, %edx
2091 ; ALL-NEXT: movzwl %dx, %edx
2092 ; ALL-NEXT: orl %eax, %edx
2093 ; ALL-NEXT: shlq $32, %rdx
2094 ; ALL-NEXT: orq %rcx, %rdx
2095 ; ALL-NEXT: vmovq %rdx, %xmm0
2096 ; ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
2097 ; ALL-NEXT: vmovdqa %xmm0, (%rdi)
1972 ; ALL-NEXT: vmovd %xmm0, %eax
1973 ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
1974 ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
1975 ; ALL-NEXT: vmovaps %xmm0, (%rdi)
20981976 ; ALL-NEXT: retq
20991977 %1 = fptrunc <4 x float> %a0 to <4 x half>
21001978 %2 = bitcast <4 x half> %1 to <4 x i16>
21041982 }
21051983
21061984 define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) nounwind {
2107 ; AVX1-LABEL: store_cvt_4f32_to_8i16_zero:
2108 ; AVX1: # %bb.0:
2109 ; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2110 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2111 ; AVX1-NEXT: vmovd %xmm1, %eax
2112 ; AVX1-NEXT: shll $16, %eax
2113 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1
2114 ; AVX1-NEXT: vmovd %xmm1, %ecx
2115 ; AVX1-NEXT: movzwl %cx, %ecx
2116 ; AVX1-NEXT: orl %eax, %ecx
2117 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2118 ; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2119 ; AVX1-NEXT: vmovd %xmm1, %eax
2120 ; AVX1-NEXT: shll $16, %eax
2121 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2122 ; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2123 ; AVX1-NEXT: vmovd %xmm0, %edx
2124 ; AVX1-NEXT: movzwl %dx, %edx
2125 ; AVX1-NEXT: orl %eax, %edx
2126 ; AVX1-NEXT: shlq $32, %rdx
2127 ; AVX1-NEXT: orq %rcx, %rdx
2128 ; AVX1-NEXT: vmovq %rdx, %xmm0
2129 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
2130 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
2131 ; AVX1-NEXT: vmovdqa %xmm0, (%rdi)
2132 ; AVX1-NEXT: retq
2133 ;
2134 ; AVX2-SLOW-LABEL: store_cvt_4f32_to_8i16_zero:
2135 ; AVX2-SLOW: # %bb.0:
2136 ; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2137 ; AVX2-SLOW-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2138 ; AVX2-SLOW-NEXT: vmovd %xmm1, %eax
2139 ; AVX2-SLOW-NEXT: shll $16, %eax
2140 ; AVX2-SLOW-NEXT: vcvtps2ph $4, %xmm0, %xmm1
2141 ; AVX2-SLOW-NEXT: vmovd %xmm1, %ecx
2142 ; AVX2-SLOW-NEXT: movzwl %cx, %ecx
2143 ; AVX2-SLOW-NEXT: orl %eax, %ecx
2144 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2145 ; AVX2-SLOW-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2146 ; AVX2-SLOW-NEXT: vmovd %xmm1, %eax
2147 ; AVX2-SLOW-NEXT: shll $16, %eax
2148 ; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2149 ; AVX2-SLOW-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2150 ; AVX2-SLOW-NEXT: vmovd %xmm0, %edx
2151 ; AVX2-SLOW-NEXT: movzwl %dx, %edx
2152 ; AVX2-SLOW-NEXT: orl %eax, %edx
2153 ; AVX2-SLOW-NEXT: shlq $32, %rdx
2154 ; AVX2-SLOW-NEXT: orq %rcx, %rdx
2155 ; AVX2-SLOW-NEXT: vmovq %rdx, %xmm0
2156 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
2157 ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
2158 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rdi)
2159 ; AVX2-SLOW-NEXT: retq
2160 ;
2161 ; AVX2-FAST-LABEL: store_cvt_4f32_to_8i16_zero:
2162 ; AVX2-FAST: # %bb.0:
2163 ; AVX2-FAST-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2164 ; AVX2-FAST-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2165 ; AVX2-FAST-NEXT: vmovd %xmm1, %eax
2166 ; AVX2-FAST-NEXT: shll $16, %eax
2167 ; AVX2-FAST-NEXT: vcvtps2ph $4, %xmm0, %xmm1
2168 ; AVX2-FAST-NEXT: vmovd %xmm1, %ecx
2169 ; AVX2-FAST-NEXT: movzwl %cx, %ecx
2170 ; AVX2-FAST-NEXT: orl %eax, %ecx
2171 ; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2172 ; AVX2-FAST-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2173 ; AVX2-FAST-NEXT: vmovd %xmm1, %eax
2174 ; AVX2-FAST-NEXT: shll $16, %eax
2175 ; AVX2-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2176 ; AVX2-FAST-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2177 ; AVX2-FAST-NEXT: vmovd %xmm0, %edx
2178 ; AVX2-FAST-NEXT: movzwl %dx, %edx
2179 ; AVX2-FAST-NEXT: orl %eax, %edx
2180 ; AVX2-FAST-NEXT: shlq $32, %rdx
2181 ; AVX2-FAST-NEXT: orq %rcx, %rdx
2182 ; AVX2-FAST-NEXT: vmovq %rdx, %xmm0
2183 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero
2184 ; AVX2-FAST-NEXT: vmovdqa %xmm0, (%rdi)
2185 ; AVX2-FAST-NEXT: retq
2186 ;
2187 ; AVX512F-LABEL: store_cvt_4f32_to_8i16_zero:
2188 ; AVX512F: # %bb.0:
2189 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2190 ; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2191 ; AVX512F-NEXT: vmovd %xmm1, %eax
2192 ; AVX512F-NEXT: shll $16, %eax
2193 ; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm1
2194 ; AVX512F-NEXT: vmovd %xmm1, %ecx
2195 ; AVX512F-NEXT: movzwl %cx, %ecx
2196 ; AVX512F-NEXT: orl %eax, %ecx
2197 ; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2198 ; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2199 ; AVX512F-NEXT: vmovd %xmm1, %eax
2200 ; AVX512F-NEXT: shll $16, %eax
2201 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2202 ; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2203 ; AVX512F-NEXT: vmovd %xmm0, %edx
2204 ; AVX512F-NEXT: movzwl %dx, %edx
2205 ; AVX512F-NEXT: orl %eax, %edx
2206 ; AVX512F-NEXT: shlq $32, %rdx
2207 ; AVX512F-NEXT: orq %rcx, %rdx
2208 ; AVX512F-NEXT: vmovq %rdx, %xmm0
2209 ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
2210 ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
2211 ; AVX512F-NEXT: vmovdqa %xmm0, (%rdi)
2212 ; AVX512F-NEXT: retq
2213 ;
2214 ; AVX512VL-LABEL: store_cvt_4f32_to_8i16_zero:
2215 ; AVX512VL: # %bb.0:
2216 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2217 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2218 ; AVX512VL-NEXT: vmovd %xmm1, %eax
2219 ; AVX512VL-NEXT: shll $16, %eax
2220 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
2221 ; AVX512VL-NEXT: vmovd %xmm1, %ecx
2222 ; AVX512VL-NEXT: movzwl %cx, %ecx
2223 ; AVX512VL-NEXT: orl %eax, %ecx
2224 ; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
2225 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
2226 ; AVX512VL-NEXT: vmovd %xmm1, %eax
2227 ; AVX512VL-NEXT: shll $16, %eax
2228 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2229 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2230 ; AVX512VL-NEXT: vmovd %xmm0, %edx
2231 ; AVX512VL-NEXT: movzwl %dx, %edx
2232 ; AVX512VL-NEXT: orl %eax, %edx
2233 ; AVX512VL-NEXT: shlq $32, %rdx
2234 ; AVX512VL-NEXT: orq %rcx, %rdx
2235 ; AVX512VL-NEXT: vmovq %rdx, %xmm0
2236 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero
2237 ; AVX512VL-NEXT: vmovdqa %xmm0, (%rdi)
2238 ; AVX512VL-NEXT: retq
1985 ; ALL-LABEL: store_cvt_4f32_to_8i16_zero:
1986 ; ALL: # %bb.0:
1987 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1988 ; ALL-NEXT: vmovd %xmm1, %eax
1989 ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
1990 ; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1991 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1992 ; ALL-NEXT: vmovd %xmm1, %eax
1993 ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
1994 ; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1995 ; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1996 ; ALL-NEXT: vmovd %xmm1, %eax
1997 ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
1998 ; ALL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
1999 ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
2000 ; ALL-NEXT: vmovd %xmm0, %eax
2001 ; ALL-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
2002 ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
2003 ; ALL-NEXT: vmovaps %xmm0, (%rdi)
2004 ; ALL-NEXT: retq
22392005 %1 = fptrunc <4 x float> %a0 to <4 x half>
22402006 %2 = bitcast <4 x half> %1 to <4 x i16>
22412007 %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32>
25092275 define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind {
25102276 ; ALL-LABEL: cvt_2f64_to_2i16:
25112277 ; ALL: # %bb.0:
2512 ; ALL-NEXT: pushq %rbx
2513 ; ALL-NEXT: subq $16, %rsp
2514 ; ALL-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
2515 ; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2278 ; ALL-NEXT: subq $40, %rsp
2279 ; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
25162280 ; ALL-NEXT: callq __truncdfhf2
2517 ; ALL-NEXT: movl %eax, %ebx
2518 ; ALL-NEXT: shll $16, %ebx
2519 ; ALL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
2281 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
2282 ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2283 ; ALL-NEXT: # xmm0 = mem[1,0]
25202284 ; ALL-NEXT: callq __truncdfhf2
2521 ; ALL-NEXT: movzwl %ax, %eax
2522 ; ALL-NEXT: orl %ebx, %eax
2523 ; ALL-NEXT: vmovd %eax, %xmm0
2524 ; ALL-NEXT: addq $16, %rsp
2525 ; ALL-NEXT: popq %rbx
2285 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
2286 ; ALL-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
2287 ; ALL-NEXT: addq $40, %rsp
25262288 ; ALL-NEXT: retq
25272289 %1 = fptrunc <2 x double> %a0 to <2 x half>
25282290 %2 = bitcast <2 x half> %1 to <2 x i16>
25302292 }
25312293
25322294 define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind {
2533 ; AVX1-LABEL: cvt_4f64_to_4i16:
2534 ; AVX1: # %bb.0:
2535 ; AVX1-NEXT: pushq %r14
2536 ; AVX1-NEXT: pushq %rbx
2537 ; AVX1-NEXT: subq $40, %rsp
2538 ; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
2539 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2540 ; AVX1-NEXT: vzeroupper
2541 ; AVX1-NEXT: callq __truncdfhf2
2542 ; AVX1-NEXT: movl %eax, %ebx
2543 ; AVX1-NEXT: shll $16, %ebx
2544 ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2545 ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2546 ; AVX1-NEXT: vzeroupper
2547 ; AVX1-NEXT: callq __truncdfhf2
2548 ; AVX1-NEXT: movzwl %ax, %r14d
2549 ; AVX1-NEXT: orl %ebx, %r14d
2550 ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2551 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2552 ; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
2553 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2554 ; AVX1-NEXT: vzeroupper
2555 ; AVX1-NEXT: callq __truncdfhf2
2556 ; AVX1-NEXT: movl %eax, %ebx
2557 ; AVX1-NEXT: shll $16, %ebx
2558 ; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
2559 ; AVX1-NEXT: callq __truncdfhf2
2560 ; AVX1-NEXT: movzwl %ax, %eax
2561 ; AVX1-NEXT: orl %ebx, %eax
2562 ; AVX1-NEXT: shlq $32, %rax
2563 ; AVX1-NEXT: orq %r14, %rax
2564 ; AVX1-NEXT: vmovq %rax, %xmm0
2565 ; AVX1-NEXT: addq $40, %rsp
2566 ; AVX1-NEXT: popq %rbx
2567 ; AVX1-NEXT: popq %r14
2568 ; AVX1-NEXT: retq
2569 ;
2570 ; AVX2-LABEL: cvt_4f64_to_4i16:
2571 ; AVX2: # %bb.0:
2572 ; AVX2-NEXT: pushq %r14
2573 ; AVX2-NEXT: pushq %rbx
2574 ; AVX2-NEXT: subq $40, %rsp
2575 ; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
2576 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2577 ; AVX2-NEXT: vzeroupper
2578 ; AVX2-NEXT: callq __truncdfhf2
2579 ; AVX2-NEXT: movl %eax, %ebx
2580 ; AVX2-NEXT: shll $16, %ebx
2581 ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2582 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2583 ; AVX2-NEXT: vzeroupper
2584 ; AVX2-NEXT: callq __truncdfhf2
2585 ; AVX2-NEXT: movzwl %ax, %r14d
2586 ; AVX2-NEXT: orl %ebx, %r14d
2587 ; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
2588 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
2589 ; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
2590 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2591 ; AVX2-NEXT: vzeroupper
2592 ; AVX2-NEXT: callq __truncdfhf2
2593 ; AVX2-NEXT: movl %eax, %ebx
2594 ; AVX2-NEXT: shll $16, %ebx
2595 ; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
2596 ; AVX2-NEXT: callq __truncdfhf2
2597 ; AVX2-NEXT: movzwl %ax, %eax
2598 ; AVX2-NEXT: orl %ebx, %eax
2599 ; AVX2-NEXT: shlq $32, %rax
2600 ; AVX2-NEXT: orq %r14, %rax
2601 ; AVX2-NEXT: vmovq %rax, %xmm0
2602 ; AVX2-NEXT: addq $40, %rsp
2603 ; AVX2-NEXT: popq %rbx
2604 ; AVX2-NEXT: popq %r14
2605 ; AVX2-NEXT: retq
2606 ;
2607 ; AVX512-LABEL: cvt_4f64_to_4i16:
2608 ; AVX512: # %bb.0:
2609 ; AVX512-NEXT: pushq %r14
2610 ; AVX512-NEXT: pushq %rbx
2611 ; AVX512-NEXT: subq $40, %rsp
2612 ; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
2613 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2614 ; AVX512-NEXT: vzeroupper
2615 ; AVX512-NEXT: callq __truncdfhf2
2616 ; AVX512-NEXT: movl %eax, %ebx
2617 ; AVX512-NEXT: shll $16, %ebx
2618 ; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2619 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2620 ; AVX512-NEXT: vzeroupper
2621 ; AVX512-NEXT: callq __truncdfhf2
2622 ; AVX512-NEXT: movzwl %ax, %r14d
2623 ; AVX512-NEXT: orl %ebx, %r14d
2624 ; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
2625 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
2626 ; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
2627 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2628 ; AVX512-NEXT: vzeroupper
2629 ; AVX512-NEXT: callq __truncdfhf2
2630 ; AVX512-NEXT: movl %eax, %ebx
2631 ; AVX512-NEXT: shll $16, %ebx
2632 ; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
2633 ; AVX512-NEXT: callq __truncdfhf2
2634 ; AVX512-NEXT: movzwl %ax, %eax
2635 ; AVX512-NEXT: orl %ebx, %eax
2636 ; AVX512-NEXT: shlq $32, %rax
2637 ; AVX512-NEXT: orq %r14, %rax
2638 ; AVX512-NEXT: vmovq %rax, %xmm0
2639 ; AVX512-NEXT: addq $40, %rsp
2640 ; AVX512-NEXT: popq %rbx
2641 ; AVX512-NEXT: popq %r14
2642 ; AVX512-NEXT: retq
2295 ; ALL-LABEL: cvt_4f64_to_4i16:
2296 ; ALL: # %bb.0:
2297 ; ALL-NEXT: subq $88, %rsp
2298 ; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2299 ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
2300 ; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2301 ; ALL-NEXT: vzeroupper
2302 ; ALL-NEXT: callq __truncdfhf2
2303 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
2304 ; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2305 ; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2306 ; ALL-NEXT: vzeroupper
2307 ; ALL-NEXT: callq __truncdfhf2
2308 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
2309 ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2310 ; ALL-NEXT: # xmm0 = mem[1,0]
2311 ; ALL-NEXT: callq __truncdfhf2
2312 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
2313 ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2314 ; ALL-NEXT: # xmm0 = mem[1,0]
2315 ; ALL-NEXT: callq __truncdfhf2
2316 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
2317 ; ALL-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
2318 ; ALL-NEXT: addq $88, %rsp
2319 ; ALL-NEXT: retq
26432320 %1 = fptrunc <4 x double> %a0 to <4 x half>
26442321 %2 = bitcast <4 x half> %1 to <4 x i16>
26452322 ret <4 x i16> %2
26462323 }
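The three deleted bodies above (AVX1, AVX2, AVX512) all packed the four __truncdfhf2 results through a GPR: shll $16 plus orl to pair neighbouring halves, then shlq $32 plus orq to join the pairs before a vmovq into xmm0. The new ALL body instead writes each 16-bit result to a fixed stack slot with movw and builds the return value with a single vector load (the vpmovzxwd from the stack). As a reference for what the test itself computes, here is a minimal C sketch; the _ref helper name is illustrative only, and it assumes a compiler-rt-style __truncdfhf2 that returns the IEEE binary16 bit pattern in a 16-bit integer (consistent with the movzwl %ax uses in the checks):

    #include <stdint.h>

    /* Assumed prototype: double -> half truncation helper; the half's
       bit pattern is returned in a 16-bit integer. */
    extern uint16_t __truncdfhf2(double);

    /* What cvt_4f64_to_4i16 computes: fptrunc each lane to half, then
       bitcast the resulting <4 x half> to <4 x i16>. */
    void cvt_4f64_to_4i16_ref(const double in[4], uint16_t out[4]) {
        for (int i = 0; i < 4; ++i)
            out[i] = __truncdfhf2(in[i]);
    }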
26472324
26482325 define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
2649 ; AVX1-LABEL: cvt_4f64_to_8i16_undef:
2650 ; AVX1: # %bb.0:
2651 ; AVX1-NEXT: pushq %r14
2652 ; AVX1-NEXT: pushq %rbx
2653 ; AVX1-NEXT: subq $40, %rsp
2654 ; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
2655 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2656 ; AVX1-NEXT: vzeroupper
2657 ; AVX1-NEXT: callq __truncdfhf2
2658 ; AVX1-NEXT: movl %eax, %ebx
2659 ; AVX1-NEXT: shll $16, %ebx
2660 ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2661 ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2662 ; AVX1-NEXT: vzeroupper
2663 ; AVX1-NEXT: callq __truncdfhf2
2664 ; AVX1-NEXT: movzwl %ax, %r14d
2665 ; AVX1-NEXT: orl %ebx, %r14d
2666 ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2667 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2668 ; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
2669 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2670 ; AVX1-NEXT: vzeroupper
2671 ; AVX1-NEXT: callq __truncdfhf2
2672 ; AVX1-NEXT: movl %eax, %ebx
2673 ; AVX1-NEXT: shll $16, %ebx
2674 ; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
2675 ; AVX1-NEXT: callq __truncdfhf2
2676 ; AVX1-NEXT: movzwl %ax, %eax
2677 ; AVX1-NEXT: orl %ebx, %eax
2678 ; AVX1-NEXT: shlq $32, %rax
2679 ; AVX1-NEXT: orq %r14, %rax
2680 ; AVX1-NEXT: vmovq %rax, %xmm0
2681 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
2682 ; AVX1-NEXT: addq $40, %rsp
2683 ; AVX1-NEXT: popq %rbx
2684 ; AVX1-NEXT: popq %r14
2685 ; AVX1-NEXT: retq
2686 ;
2687 ; AVX2-LABEL: cvt_4f64_to_8i16_undef:
2688 ; AVX2: # %bb.0:
2689 ; AVX2-NEXT: pushq %r14
2690 ; AVX2-NEXT: pushq %rbx
2691 ; AVX2-NEXT: subq $40, %rsp
2692 ; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
2693 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2694 ; AVX2-NEXT: vzeroupper
2695 ; AVX2-NEXT: callq __truncdfhf2
2696 ; AVX2-NEXT: movl %eax, %ebx
2697 ; AVX2-NEXT: shll $16, %ebx
2698 ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2699 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2700 ; AVX2-NEXT: vzeroupper
2701 ; AVX2-NEXT: callq __truncdfhf2
2702 ; AVX2-NEXT: movzwl %ax, %r14d
2703 ; AVX2-NEXT: orl %ebx, %r14d
2704 ; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
2705 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
2706 ; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
2707 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2708 ; AVX2-NEXT: vzeroupper
2709 ; AVX2-NEXT: callq __truncdfhf2
2710 ; AVX2-NEXT: movl %eax, %ebx
2711 ; AVX2-NEXT: shll $16, %ebx
2712 ; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
2713 ; AVX2-NEXT: callq __truncdfhf2
2714 ; AVX2-NEXT: movzwl %ax, %eax
2715 ; AVX2-NEXT: orl %ebx, %eax
2716 ; AVX2-NEXT: shlq $32, %rax
2717 ; AVX2-NEXT: orq %r14, %rax
2718 ; AVX2-NEXT: vmovq %rax, %xmm0
2719 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
2720 ; AVX2-NEXT: addq $40, %rsp
2721 ; AVX2-NEXT: popq %rbx
2722 ; AVX2-NEXT: popq %r14
2723 ; AVX2-NEXT: retq
2724 ;
2725 ; AVX512-LABEL: cvt_4f64_to_8i16_undef:
2726 ; AVX512: # %bb.0:
2727 ; AVX512-NEXT: pushq %r14
2728 ; AVX512-NEXT: pushq %rbx
2729 ; AVX512-NEXT: subq $40, %rsp
2730 ; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
2731 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2732 ; AVX512-NEXT: vzeroupper
2733 ; AVX512-NEXT: callq __truncdfhf2
2734 ; AVX512-NEXT: movl %eax, %ebx
2735 ; AVX512-NEXT: shll $16, %ebx
2736 ; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2737 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2738 ; AVX512-NEXT: vzeroupper
2739 ; AVX512-NEXT: callq __truncdfhf2
2740 ; AVX512-NEXT: movzwl %ax, %r14d
2741 ; AVX512-NEXT: orl %ebx, %r14d
2742 ; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
2743 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
2744 ; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
2745 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2746 ; AVX512-NEXT: vzeroupper
2747 ; AVX512-NEXT: callq __truncdfhf2
2748 ; AVX512-NEXT: movl %eax, %ebx
2749 ; AVX512-NEXT: shll $16, %ebx
2750 ; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
2751 ; AVX512-NEXT: callq __truncdfhf2
2752 ; AVX512-NEXT: movzwl %ax, %eax
2753 ; AVX512-NEXT: orl %ebx, %eax
2754 ; AVX512-NEXT: shlq $32, %rax
2755 ; AVX512-NEXT: orq %r14, %rax
2756 ; AVX512-NEXT: vmovq %rax, %xmm0
2757 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
2758 ; AVX512-NEXT: addq $40, %rsp
2759 ; AVX512-NEXT: popq %rbx
2760 ; AVX512-NEXT: popq %r14
2761 ; AVX512-NEXT: retq
2326 ; ALL-LABEL: cvt_4f64_to_8i16_undef:
2327 ; ALL: # %bb.0:
2328 ; ALL-NEXT: subq $88, %rsp
2329 ; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2330 ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
2331 ; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2332 ; ALL-NEXT: vzeroupper
2333 ; ALL-NEXT: callq __truncdfhf2
2334 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
2335 ; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2336 ; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2337 ; ALL-NEXT: vzeroupper
2338 ; ALL-NEXT: callq __truncdfhf2
2339 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
2340 ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2341 ; ALL-NEXT: # xmm0 = mem[1,0]
2342 ; ALL-NEXT: callq __truncdfhf2
2343 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
2344 ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2345 ; ALL-NEXT: # xmm0 = mem[1,0]
2346 ; ALL-NEXT: callq __truncdfhf2
2347 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
2348 ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
2349 ; ALL-NEXT: addq $88, %rsp
2350 ; ALL-NEXT: retq
27622351 %1 = fptrunc <4 x double> %a0 to <4 x half>
27632352 %2 = bitcast <4 x half> %1 to <4 x i16>
27642353 %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
27652354 ret <8 x i16> %3
27662355 }
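cvt_4f64_to_8i16_undef widens the same four halves to <8 x i16> with undef upper lanes, so any bits are acceptable in lanes 4-7; the deleted bodies still shuffled the packed value with vpshuflw, while the new ALL body simply reloads the four stored halves with vmovsd. A matching sketch under the same assumed __truncdfhf2 prototype (the _ref name again being illustrative):

    #include <stdint.h>

    extern uint16_t __truncdfhf2(double); /* assumed prototype, as above */

    void cvt_4f64_to_8i16_undef_ref(const double in[4], uint16_t out[8]) {
        for (int i = 0; i < 4; ++i)
            out[i] = __truncdfhf2(in[i]);
        /* out[4..7] correspond to the undef shuffle lanes and are
           deliberately left unwritten. */
    }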
27672356
27682357 define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
2769 ; AVX1-LABEL: cvt_4f64_to_8i16_zero:
2770 ; AVX1: # %bb.0:
2771 ; AVX1-NEXT: pushq %r14
2772 ; AVX1-NEXT: pushq %rbx
2773 ; AVX1-NEXT: subq $40, %rsp
2774 ; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
2775 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2776 ; AVX1-NEXT: vzeroupper
2777 ; AVX1-NEXT: callq __truncdfhf2
2778 ; AVX1-NEXT: movl %eax, %ebx
2779 ; AVX1-NEXT: shll $16, %ebx
2780 ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2781 ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2782 ; AVX1-NEXT: vzeroupper
2783 ; AVX1-NEXT: callq __truncdfhf2
2784 ; AVX1-NEXT: movzwl %ax, %r14d
2785 ; AVX1-NEXT: orl %ebx, %r14d
2786 ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2787 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2788 ; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
2789 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2790 ; AVX1-NEXT: vzeroupper
2791 ; AVX1-NEXT: callq __truncdfhf2
2792 ; AVX1-NEXT: movl %eax, %ebx
2793 ; AVX1-NEXT: shll $16, %ebx
2794 ; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
2795 ; AVX1-NEXT: callq __truncdfhf2
2796 ; AVX1-NEXT: movzwl %ax, %eax
2797 ; AVX1-NEXT: orl %ebx, %eax
2798 ; AVX1-NEXT: shlq $32, %rax
2799 ; AVX1-NEXT: orq %r14, %rax
2800 ; AVX1-NEXT: vmovq %rax, %xmm0
2801 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
2802 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
2803 ; AVX1-NEXT: addq $40, %rsp
2804 ; AVX1-NEXT: popq %rbx
2805 ; AVX1-NEXT: popq %r14
2806 ; AVX1-NEXT: retq
2807 ;
2808 ; AVX2-SLOW-LABEL: cvt_4f64_to_8i16_zero:
2809 ; AVX2-SLOW: # %bb.0:
2810 ; AVX2-SLOW-NEXT: pushq %r14
2811 ; AVX2-SLOW-NEXT: pushq %rbx
2812 ; AVX2-SLOW-NEXT: subq $40, %rsp
2813 ; AVX2-SLOW-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
2814 ; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2815 ; AVX2-SLOW-NEXT: vzeroupper
2816 ; AVX2-SLOW-NEXT: callq __truncdfhf2
2817 ; AVX2-SLOW-NEXT: movl %eax, %ebx
2818 ; AVX2-SLOW-NEXT: shll $16, %ebx
2819 ; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2820 ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2821 ; AVX2-SLOW-NEXT: vzeroupper
2822 ; AVX2-SLOW-NEXT: callq __truncdfhf2
2823 ; AVX2-SLOW-NEXT: movzwl %ax, %r14d
2824 ; AVX2-SLOW-NEXT: orl %ebx, %r14d
2825 ; AVX2-SLOW-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
2826 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
2827 ; AVX2-SLOW-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
2828 ; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2829 ; AVX2-SLOW-NEXT: vzeroupper
2830 ; AVX2-SLOW-NEXT: callq __truncdfhf2
2831 ; AVX2-SLOW-NEXT: movl %eax, %ebx
2832 ; AVX2-SLOW-NEXT: shll $16, %ebx
2833 ; AVX2-SLOW-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
2834 ; AVX2-SLOW-NEXT: callq __truncdfhf2
2835 ; AVX2-SLOW-NEXT: movzwl %ax, %eax
2836 ; AVX2-SLOW-NEXT: orl %ebx, %eax
2837 ; AVX2-SLOW-NEXT: shlq $32, %rax
2838 ; AVX2-SLOW-NEXT: orq %r14, %rax
2839 ; AVX2-SLOW-NEXT: vmovq %rax, %xmm0
2840 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
2841 ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
2842 ; AVX2-SLOW-NEXT: addq $40, %rsp
2843 ; AVX2-SLOW-NEXT: popq %rbx
2844 ; AVX2-SLOW-NEXT: popq %r14
2845 ; AVX2-SLOW-NEXT: retq
2846 ;
2847 ; AVX2-FAST-LABEL: cvt_4f64_to_8i16_zero:
2848 ; AVX2-FAST: # %bb.0:
2849 ; AVX2-FAST-NEXT: pushq %r14
2850 ; AVX2-FAST-NEXT: pushq %rbx
2851 ; AVX2-FAST-NEXT: subq $40, %rsp
2852 ; AVX2-FAST-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
2853 ; AVX2-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2854 ; AVX2-FAST-NEXT: vzeroupper
2855 ; AVX2-FAST-NEXT: callq __truncdfhf2
2856 ; AVX2-FAST-NEXT: movl %eax, %ebx
2857 ; AVX2-FAST-NEXT: shll $16, %ebx
2858 ; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2859 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2860 ; AVX2-FAST-NEXT: vzeroupper
2861 ; AVX2-FAST-NEXT: callq __truncdfhf2
2862 ; AVX2-FAST-NEXT: movzwl %ax, %r14d
2863 ; AVX2-FAST-NEXT: orl %ebx, %r14d
2864 ; AVX2-FAST-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
2865 ; AVX2-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
2866 ; AVX2-FAST-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
2867 ; AVX2-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2868 ; AVX2-FAST-NEXT: vzeroupper
2869 ; AVX2-FAST-NEXT: callq __truncdfhf2
2870 ; AVX2-FAST-NEXT: movl %eax, %ebx
2871 ; AVX2-FAST-NEXT: shll $16, %ebx
2872 ; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
2873 ; AVX2-FAST-NEXT: callq __truncdfhf2
2874 ; AVX2-FAST-NEXT: movzwl %ax, %eax
2875 ; AVX2-FAST-NEXT: orl %ebx, %eax
2876 ; AVX2-FAST-NEXT: shlq $32, %rax
2877 ; AVX2-FAST-NEXT: orq %r14, %rax
2878 ; AVX2-FAST-NEXT: vmovq %rax, %xmm0
2879 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero
2880 ; AVX2-FAST-NEXT: addq $40, %rsp
2881 ; AVX2-FAST-NEXT: popq %rbx
2882 ; AVX2-FAST-NEXT: popq %r14
2883 ; AVX2-FAST-NEXT: retq
2884 ;
2885 ; AVX512F-LABEL: cvt_4f64_to_8i16_zero:
2886 ; AVX512F: # %bb.0:
2887 ; AVX512F-NEXT: pushq %r14
2888 ; AVX512F-NEXT: pushq %rbx
2889 ; AVX512F-NEXT: subq $40, %rsp
2890 ; AVX512F-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
2891 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2892 ; AVX512F-NEXT: vzeroupper
2893 ; AVX512F-NEXT: callq __truncdfhf2
2894 ; AVX512F-NEXT: movl %eax, %ebx
2895 ; AVX512F-NEXT: shll $16, %ebx
2896 ; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2897 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2898 ; AVX512F-NEXT: vzeroupper
2899 ; AVX512F-NEXT: callq __truncdfhf2
2900 ; AVX512F-NEXT: movzwl %ax, %r14d
2901 ; AVX512F-NEXT: orl %ebx, %r14d
2902 ; AVX512F-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
2903 ; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
2904 ; AVX512F-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
2905 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2906 ; AVX512F-NEXT: vzeroupper
2907 ; AVX512F-NEXT: callq __truncdfhf2
2908 ; AVX512F-NEXT: movl %eax, %ebx
2909 ; AVX512F-NEXT: shll $16, %ebx
2910 ; AVX512F-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
2911 ; AVX512F-NEXT: callq __truncdfhf2
2912 ; AVX512F-NEXT: movzwl %ax, %eax
2913 ; AVX512F-NEXT: orl %ebx, %eax
2914 ; AVX512F-NEXT: shlq $32, %rax
2915 ; AVX512F-NEXT: orq %r14, %rax
2916 ; AVX512F-NEXT: vmovq %rax, %xmm0
2917 ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
2918 ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
2919 ; AVX512F-NEXT: addq $40, %rsp
2920 ; AVX512F-NEXT: popq %rbx
2921 ; AVX512F-NEXT: popq %r14
2922 ; AVX512F-NEXT: retq
2923 ;
2924 ; AVX512VL-LABEL: cvt_4f64_to_8i16_zero:
2925 ; AVX512VL: # %bb.0:
2926 ; AVX512VL-NEXT: pushq %r14
2927 ; AVX512VL-NEXT: pushq %rbx
2928 ; AVX512VL-NEXT: subq $40, %rsp
2929 ; AVX512VL-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
2930 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2931 ; AVX512VL-NEXT: vzeroupper
2932 ; AVX512VL-NEXT: callq __truncdfhf2
2933 ; AVX512VL-NEXT: movl %eax, %ebx
2934 ; AVX512VL-NEXT: shll $16, %ebx
2935 ; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2936 ; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2937 ; AVX512VL-NEXT: vzeroupper
2938 ; AVX512VL-NEXT: callq __truncdfhf2
2939 ; AVX512VL-NEXT: movzwl %ax, %r14d
2940 ; AVX512VL-NEXT: orl %ebx, %r14d
2941 ; AVX512VL-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
2942 ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
2943 ; AVX512VL-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
2944 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
2945 ; AVX512VL-NEXT: vzeroupper
2946 ; AVX512VL-NEXT: callq __truncdfhf2
2947 ; AVX512VL-NEXT: movl %eax, %ebx
2948 ; AVX512VL-NEXT: shll $16, %ebx
2949 ; AVX512VL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
2950 ; AVX512VL-NEXT: callq __truncdfhf2
2951 ; AVX512VL-NEXT: movzwl %ax, %eax
2952 ; AVX512VL-NEXT: orl %ebx, %eax
2953 ; AVX512VL-NEXT: shlq $32, %rax
2954 ; AVX512VL-NEXT: orq %r14, %rax
2955 ; AVX512VL-NEXT: vmovq %rax, %xmm0
2956 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero
2957 ; AVX512VL-NEXT: addq $40, %rsp
2958 ; AVX512VL-NEXT: popq %rbx
2959 ; AVX512VL-NEXT: popq %r14
2960 ; AVX512VL-NEXT: retq
2358 ; ALL-LABEL: cvt_4f64_to_8i16_zero:
2359 ; ALL: # %bb.0:
2360 ; ALL-NEXT: subq $88, %rsp
2361 ; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2362 ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
2363 ; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2364 ; ALL-NEXT: vzeroupper
2365 ; ALL-NEXT: callq __truncdfhf2
2366 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
2367 ; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2368 ; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2369 ; ALL-NEXT: vzeroupper
2370 ; ALL-NEXT: callq __truncdfhf2
2371 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
2372 ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2373 ; ALL-NEXT: # xmm0 = mem[1,0]
2374 ; ALL-NEXT: callq __truncdfhf2
2375 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
2376 ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2377 ; ALL-NEXT: # xmm0 = mem[1,0]
2378 ; ALL-NEXT: callq __truncdfhf2
2379 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
2380 ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
2381 ; ALL-NEXT: addq $88, %rsp
2382 ; ALL-NEXT: retq
29612383 %1 = fptrunc <4 x double> %a0 to <4 x half>
29622384 %2 = bitcast <4 x half> %1 to <4 x i16>
29632385 %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
33372759 }
33382760
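cvt_4f64_to_8i16_zero differs from the _undef variant only in that lanes 4-7 come from zeroinitializer and must therefore be exactly zero. The vmovsd reload at the end of its new ALL body provides that for free, since vmovsd from memory zeroes bits 127:64 of xmm0; the deleted bodies needed an extra vmovq or vpshufb to clear those lanes. Sketch, same assumptions as before:

    #include <stdint.h>

    extern uint16_t __truncdfhf2(double); /* assumed prototype, as above */

    void cvt_4f64_to_8i16_zero_ref(const double in[4], uint16_t out[8]) {
        for (int i = 0; i < 4; ++i)
            out[i] = __truncdfhf2(in[i]);
        for (int i = 4; i < 8; ++i)
            out[i] = 0; /* zeroinitializer lanes */
    }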
33392761 define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) nounwind {
3340 ; AVX1-LABEL: store_cvt_4f64_to_8i16_undef:
3341 ; AVX1: # %bb.0:
3342 ; AVX1-NEXT: pushq %rbp
3343 ; AVX1-NEXT: pushq %r14
3344 ; AVX1-NEXT: pushq %rbx
3345 ; AVX1-NEXT: subq $32, %rsp
3346 ; AVX1-NEXT: movq %rdi, %r14
3347 ; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
3348 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3349 ; AVX1-NEXT: vzeroupper
3350 ; AVX1-NEXT: callq __truncdfhf2
3351 ; AVX1-NEXT: movl %eax, %ebp
3352 ; AVX1-NEXT: shll $16, %ebp
3353 ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
3354 ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3355 ; AVX1-NEXT: vzeroupper
3356 ; AVX1-NEXT: callq __truncdfhf2
3357 ; AVX1-NEXT: movzwl %ax, %ebx
3358 ; AVX1-NEXT: orl %ebp, %ebx
3359 ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
3360 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
3361 ; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
3362 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3363 ; AVX1-NEXT: vzeroupper
3364 ; AVX1-NEXT: callq __truncdfhf2
3365 ; AVX1-NEXT: movl %eax, %ebp
3366 ; AVX1-NEXT: shll $16, %ebp
3367 ; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
3368 ; AVX1-NEXT: callq __truncdfhf2
3369 ; AVX1-NEXT: movzwl %ax, %eax
3370 ; AVX1-NEXT: orl %ebp, %eax
3371 ; AVX1-NEXT: shlq $32, %rax
3372 ; AVX1-NEXT: orq %rbx, %rax
3373 ; AVX1-NEXT: vmovq %rax, %xmm0
3374 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3375 ; AVX1-NEXT: vmovdqa %xmm0, (%r14)
3376 ; AVX1-NEXT: addq $32, %rsp
3377 ; AVX1-NEXT: popq %rbx
3378 ; AVX1-NEXT: popq %r14
3379 ; AVX1-NEXT: popq %rbp
3380 ; AVX1-NEXT: retq
3381 ;
3382 ; AVX2-LABEL: store_cvt_4f64_to_8i16_undef:
3383 ; AVX2: # %bb.0:
3384 ; AVX2-NEXT: pushq %rbp
3385 ; AVX2-NEXT: pushq %r14
3386 ; AVX2-NEXT: pushq %rbx
3387 ; AVX2-NEXT: subq $32, %rsp
3388 ; AVX2-NEXT: movq %rdi, %r14
3389 ; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
3390 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3391 ; AVX2-NEXT: vzeroupper
3392 ; AVX2-NEXT: callq __truncdfhf2
3393 ; AVX2-NEXT: movl %eax, %ebp
3394 ; AVX2-NEXT: shll $16, %ebp
3395 ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
3396 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3397 ; AVX2-NEXT: vzeroupper
3398 ; AVX2-NEXT: callq __truncdfhf2
3399 ; AVX2-NEXT: movzwl %ax, %ebx
3400 ; AVX2-NEXT: orl %ebp, %ebx
3401 ; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
3402 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
3403 ; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
3404 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3405 ; AVX2-NEXT: vzeroupper
3406 ; AVX2-NEXT: callq __truncdfhf2
3407 ; AVX2-NEXT: movl %eax, %ebp
3408 ; AVX2-NEXT: shll $16, %ebp
3409 ; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
3410 ; AVX2-NEXT: callq __truncdfhf2
3411 ; AVX2-NEXT: movzwl %ax, %eax
3412 ; AVX2-NEXT: orl %ebp, %eax
3413 ; AVX2-NEXT: shlq $32, %rax
3414 ; AVX2-NEXT: orq %rbx, %rax
3415 ; AVX2-NEXT: vmovq %rax, %xmm0
3416 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3417 ; AVX2-NEXT: vmovdqa %xmm0, (%r14)
3418 ; AVX2-NEXT: addq $32, %rsp
3419 ; AVX2-NEXT: popq %rbx
3420 ; AVX2-NEXT: popq %r14
3421 ; AVX2-NEXT: popq %rbp
3422 ; AVX2-NEXT: retq
3423 ;
3424 ; AVX512-LABEL: store_cvt_4f64_to_8i16_undef:
3425 ; AVX512: # %bb.0:
3426 ; AVX512-NEXT: pushq %rbp
3427 ; AVX512-NEXT: pushq %r14
3428 ; AVX512-NEXT: pushq %rbx
3429 ; AVX512-NEXT: subq $32, %rsp
3430 ; AVX512-NEXT: movq %rdi, %r14
3431 ; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
3432 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3433 ; AVX512-NEXT: vzeroupper
3434 ; AVX512-NEXT: callq __truncdfhf2
3435 ; AVX512-NEXT: movl %eax, %ebp
3436 ; AVX512-NEXT: shll $16, %ebp
3437 ; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
3438 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3439 ; AVX512-NEXT: vzeroupper
3440 ; AVX512-NEXT: callq __truncdfhf2
3441 ; AVX512-NEXT: movzwl %ax, %ebx
3442 ; AVX512-NEXT: orl %ebp, %ebx
3443 ; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
3444 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
3445 ; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
3446 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3447 ; AVX512-NEXT: vzeroupper
3448 ; AVX512-NEXT: callq __truncdfhf2
3449 ; AVX512-NEXT: movl %eax, %ebp
3450 ; AVX512-NEXT: shll $16, %ebp
3451 ; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
3452 ; AVX512-NEXT: callq __truncdfhf2
3453 ; AVX512-NEXT: movzwl %ax, %eax
3454 ; AVX512-NEXT: orl %ebp, %eax
3455 ; AVX512-NEXT: shlq $32, %rax
3456 ; AVX512-NEXT: orq %rbx, %rax
3457 ; AVX512-NEXT: vmovq %rax, %xmm0
3458 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3459 ; AVX512-NEXT: vmovdqa %xmm0, (%r14)
3460 ; AVX512-NEXT: addq $32, %rsp
3461 ; AVX512-NEXT: popq %rbx
3462 ; AVX512-NEXT: popq %r14
3463 ; AVX512-NEXT: popq %rbp
3464 ; AVX512-NEXT: retq
2762 ; ALL-LABEL: store_cvt_4f64_to_8i16_undef:
2763 ; ALL: # %bb.0:
2764 ; ALL-NEXT: pushq %rbx
2765 ; ALL-NEXT: subq $80, %rsp
2766 ; ALL-NEXT: movq %rdi, %rbx
2767 ; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2768 ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
2769 ; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2770 ; ALL-NEXT: vzeroupper
2771 ; ALL-NEXT: callq __truncdfhf2
2772 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
2773 ; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2774 ; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2775 ; ALL-NEXT: vzeroupper
2776 ; ALL-NEXT: callq __truncdfhf2
2777 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
2778 ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2779 ; ALL-NEXT: # xmm0 = mem[1,0]
2780 ; ALL-NEXT: callq __truncdfhf2
2781 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
2782 ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2783 ; ALL-NEXT: # xmm0 = mem[1,0]
2784 ; ALL-NEXT: callq __truncdfhf2
2785 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
2786 ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
2787 ; ALL-NEXT: vmovaps %xmm0, (%rbx)
2788 ; ALL-NEXT: addq $80, %rsp
2789 ; ALL-NEXT: popq %rbx
2790 ; ALL-NEXT: retq
34652791 %1 = fptrunc <4 x double> %a0 to <4 x half>
34662792 %2 = bitcast <4 x half> %1 to <4 x i16>
34672793 %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
34682794 store <8 x i16> %3, <8 x i16>* %a1
34692795 ret void
34702796 }
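store_cvt_4f64_to_8i16_undef is the store-through-pointer form of the _undef test: the widened vector is written to the <8 x i16>* argument rather than returned, which is why the deleted bodies end in vmovdqa to (%r14) and the new ALL body in vmovaps to (%rbx). Sketch, same assumptions, with the undef lanes' stored bytes treated as unspecified:

    #include <stdint.h>

    extern uint16_t __truncdfhf2(double); /* assumed prototype, as above */

    void store_cvt_4f64_to_8i16_undef_ref(const double in[4], uint16_t a1[8]) {
        for (int i = 0; i < 4; ++i)
            a1[i] = __truncdfhf2(in[i]);
        /* a1[4..7] hold undef lanes; the real 16-byte vector store may
           write any bits there. */
    }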
34712797
34722798 define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounwind {
3473 ; AVX1-LABEL: store_cvt_4f64_to_8i16_zero:
3474 ; AVX1: # %bb.0:
3475 ; AVX1-NEXT: pushq %rbp
3476 ; AVX1-NEXT: pushq %r14
3477 ; AVX1-NEXT: pushq %rbx
3478 ; AVX1-NEXT: subq $32, %rsp
3479 ; AVX1-NEXT: movq %rdi, %r14
3480 ; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
3481 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3482 ; AVX1-NEXT: vzeroupper
3483 ; AVX1-NEXT: callq __truncdfhf2
3484 ; AVX1-NEXT: movl %eax, %ebp
3485 ; AVX1-NEXT: shll $16, %ebp
3486 ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
3487 ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3488 ; AVX1-NEXT: vzeroupper
3489 ; AVX1-NEXT: callq __truncdfhf2
3490 ; AVX1-NEXT: movzwl %ax, %ebx
3491 ; AVX1-NEXT: orl %ebp, %ebx
3492 ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
3493 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
3494 ; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
3495 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3496 ; AVX1-NEXT: vzeroupper
3497 ; AVX1-NEXT: callq __truncdfhf2
3498 ; AVX1-NEXT: movl %eax, %ebp
3499 ; AVX1-NEXT: shll $16, %ebp
3500 ; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
3501 ; AVX1-NEXT: callq __truncdfhf2
3502 ; AVX1-NEXT: movzwl %ax, %eax
3503 ; AVX1-NEXT: orl %ebp, %eax
3504 ; AVX1-NEXT: shlq $32, %rax
3505 ; AVX1-NEXT: orq %rbx, %rax
3506 ; AVX1-NEXT: vmovq %rax, %xmm0
3507 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3508 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
3509 ; AVX1-NEXT: vmovdqa %xmm0, (%r14)
3510 ; AVX1-NEXT: addq $32, %rsp
3511 ; AVX1-NEXT: popq %rbx
3512 ; AVX1-NEXT: popq %r14
3513 ; AVX1-NEXT: popq %rbp
3514 ; AVX1-NEXT: retq
3515 ;
3516 ; AVX2-SLOW-LABEL: store_cvt_4f64_to_8i16_zero:
3517 ; AVX2-SLOW: # %bb.0:
3518 ; AVX2-SLOW-NEXT: pushq %rbp
3519 ; AVX2-SLOW-NEXT: pushq %r14
3520 ; AVX2-SLOW-NEXT: pushq %rbx
3521 ; AVX2-SLOW-NEXT: subq $32, %rsp
3522 ; AVX2-SLOW-NEXT: movq %rdi, %r14
3523 ; AVX2-SLOW-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
3524 ; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3525 ; AVX2-SLOW-NEXT: vzeroupper
3526 ; AVX2-SLOW-NEXT: callq __truncdfhf2
3527 ; AVX2-SLOW-NEXT: movl %eax, %ebp
3528 ; AVX2-SLOW-NEXT: shll $16, %ebp
3529 ; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
3530 ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3531 ; AVX2-SLOW-NEXT: vzeroupper
3532 ; AVX2-SLOW-NEXT: callq __truncdfhf2
3533 ; AVX2-SLOW-NEXT: movzwl %ax, %ebx
3534 ; AVX2-SLOW-NEXT: orl %ebp, %ebx
3535 ; AVX2-SLOW-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
3536 ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
3537 ; AVX2-SLOW-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
3538 ; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3539 ; AVX2-SLOW-NEXT: vzeroupper
3540 ; AVX2-SLOW-NEXT: callq __truncdfhf2
3541 ; AVX2-SLOW-NEXT: movl %eax, %ebp
3542 ; AVX2-SLOW-NEXT: shll $16, %ebp
3543 ; AVX2-SLOW-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
3544 ; AVX2-SLOW-NEXT: callq __truncdfhf2
3545 ; AVX2-SLOW-NEXT: movzwl %ax, %eax
3546 ; AVX2-SLOW-NEXT: orl %ebp, %eax
3547 ; AVX2-SLOW-NEXT: shlq $32, %rax
3548 ; AVX2-SLOW-NEXT: orq %rbx, %rax
3549 ; AVX2-SLOW-NEXT: vmovq %rax, %xmm0
3550 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3551 ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
3552 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%r14)
3553 ; AVX2-SLOW-NEXT: addq $32, %rsp
3554 ; AVX2-SLOW-NEXT: popq %rbx
3555 ; AVX2-SLOW-NEXT: popq %r14
3556 ; AVX2-SLOW-NEXT: popq %rbp
3557 ; AVX2-SLOW-NEXT: retq
3558 ;
3559 ; AVX2-FAST-LABEL: store_cvt_4f64_to_8i16_zero:
3560 ; AVX2-FAST: # %bb.0:
3561 ; AVX2-FAST-NEXT: pushq %rbp
3562 ; AVX2-FAST-NEXT: pushq %r14
3563 ; AVX2-FAST-NEXT: pushq %rbx
3564 ; AVX2-FAST-NEXT: subq $32, %rsp
3565 ; AVX2-FAST-NEXT: movq %rdi, %r14
3566 ; AVX2-FAST-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
3567 ; AVX2-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3568 ; AVX2-FAST-NEXT: vzeroupper
3569 ; AVX2-FAST-NEXT: callq __truncdfhf2
3570 ; AVX2-FAST-NEXT: movl %eax, %ebp
3571 ; AVX2-FAST-NEXT: shll $16, %ebp
3572 ; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
3573 ; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3574 ; AVX2-FAST-NEXT: vzeroupper
3575 ; AVX2-FAST-NEXT: callq __truncdfhf2
3576 ; AVX2-FAST-NEXT: movzwl %ax, %ebx
3577 ; AVX2-FAST-NEXT: orl %ebp, %ebx
3578 ; AVX2-FAST-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
3579 ; AVX2-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
3580 ; AVX2-FAST-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
3581 ; AVX2-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3582 ; AVX2-FAST-NEXT: vzeroupper
3583 ; AVX2-FAST-NEXT: callq __truncdfhf2
3584 ; AVX2-FAST-NEXT: movl %eax, %ebp
3585 ; AVX2-FAST-NEXT: shll $16, %ebp
3586 ; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
3587 ; AVX2-FAST-NEXT: callq __truncdfhf2
3588 ; AVX2-FAST-NEXT: movzwl %ax, %eax
3589 ; AVX2-FAST-NEXT: orl %ebp, %eax
3590 ; AVX2-FAST-NEXT: shlq $32, %rax
3591 ; AVX2-FAST-NEXT: orq %rbx, %rax
3592 ; AVX2-FAST-NEXT: vmovq %rax, %xmm0
3593 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero
3594 ; AVX2-FAST-NEXT: vmovdqa %xmm0, (%r14)
3595 ; AVX2-FAST-NEXT: addq $32, %rsp
3596 ; AVX2-FAST-NEXT: popq %rbx
3597 ; AVX2-FAST-NEXT: popq %r14
3598 ; AVX2-FAST-NEXT: popq %rbp
3599 ; AVX2-FAST-NEXT: retq
3600 ;
3601 ; AVX512F-LABEL: store_cvt_4f64_to_8i16_zero:
3602 ; AVX512F: # %bb.0:
3603 ; AVX512F-NEXT: pushq %rbp
3604 ; AVX512F-NEXT: pushq %r14
3605 ; AVX512F-NEXT: pushq %rbx
3606 ; AVX512F-NEXT: subq $32, %rsp
3607 ; AVX512F-NEXT: movq %rdi, %r14
3608 ; AVX512F-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
3609 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3610 ; AVX512F-NEXT: vzeroupper
3611 ; AVX512F-NEXT: callq __truncdfhf2
3612 ; AVX512F-NEXT: movl %eax, %ebp
3613 ; AVX512F-NEXT: shll $16, %ebp
3614 ; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
3615 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3616 ; AVX512F-NEXT: vzeroupper
3617 ; AVX512F-NEXT: callq __truncdfhf2
3618 ; AVX512F-NEXT: movzwl %ax, %ebx
3619 ; AVX512F-NEXT: orl %ebp, %ebx
3620 ; AVX512F-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
3621 ; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
3622 ; AVX512F-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
3623 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3624 ; AVX512F-NEXT: vzeroupper
3625 ; AVX512F-NEXT: callq __truncdfhf2
3626 ; AVX512F-NEXT: movl %eax, %ebp
3627 ; AVX512F-NEXT: shll $16, %ebp
3628 ; AVX512F-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
3629 ; AVX512F-NEXT: callq __truncdfhf2
3630 ; AVX512F-NEXT: movzwl %ax, %eax
3631 ; AVX512F-NEXT: orl %ebp, %eax
3632 ; AVX512F-NEXT: shlq $32, %rax
3633 ; AVX512F-NEXT: orq %rbx, %rax
3634 ; AVX512F-NEXT: vmovq %rax, %xmm0
3635 ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3636 ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
3637 ; AVX512F-NEXT: vmovdqa %xmm0, (%r14)
3638 ; AVX512F-NEXT: addq $32, %rsp
3639 ; AVX512F-NEXT: popq %rbx
3640 ; AVX512F-NEXT: popq %r14
3641 ; AVX512F-NEXT: popq %rbp
3642 ; AVX512F-NEXT: retq
3643 ;
3644 ; AVX512VL-LABEL: store_cvt_4f64_to_8i16_zero:
3645 ; AVX512VL: # %bb.0:
3646 ; AVX512VL-NEXT: pushq %rbp
3647 ; AVX512VL-NEXT: pushq %r14
3648 ; AVX512VL-NEXT: pushq %rbx
3649 ; AVX512VL-NEXT: subq $32, %rsp
3650 ; AVX512VL-NEXT: movq %rdi, %r14
3651 ; AVX512VL-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
3652 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3653 ; AVX512VL-NEXT: vzeroupper
3654 ; AVX512VL-NEXT: callq __truncdfhf2
3655 ; AVX512VL-NEXT: movl %eax, %ebp
3656 ; AVX512VL-NEXT: shll $16, %ebp
3657 ; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
3658 ; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
3659 ; AVX512VL-NEXT: vzeroupper
3660 ; AVX512VL-NEXT: callq __truncdfhf2
3661 ; AVX512VL-NEXT: movzwl %ax, %ebx
3662 ; AVX512VL-NEXT: orl %ebp, %ebx
3663 ; AVX512VL-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
3664 ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
3665 ; AVX512VL-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
3666 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
3667 ; AVX512VL-NEXT: vzeroupper
3668 ; AVX512VL-NEXT: callq __truncdfhf2
3669 ; AVX512VL-NEXT: movl %eax, %ebp
3670 ; AVX512VL-NEXT: shll $16, %ebp
3671 ; AVX512VL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
3672 ; AVX512VL-NEXT: callq __truncdfhf2
3673 ; AVX512VL-NEXT: movzwl %ax, %eax
3674 ; AVX512VL-NEXT: orl %ebp, %eax
3675 ; AVX512VL-NEXT: shlq $32, %rax
3676 ; AVX512VL-NEXT: orq %rbx, %rax
3677 ; AVX512VL-NEXT: vmovq %rax, %xmm0
3678 ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero
3679 ; AVX512VL-NEXT: vmovdqa %xmm0, (%r14)
3680 ; AVX512VL-NEXT: addq $32, %rsp
3681 ; AVX512VL-NEXT: popq %rbx
3682 ; AVX512VL-NEXT: popq %r14
3683 ; AVX512VL-NEXT: popq %rbp
3684 ; AVX512VL-NEXT: retq
2799 ; ALL-LABEL: store_cvt_4f64_to_8i16_zero:
2800 ; ALL: # %bb.0:
2801 ; ALL-NEXT: pushq %rbx
2802 ; ALL-NEXT: subq $80, %rsp
2803 ; ALL-NEXT: movq %rdi, %rbx
2804 ; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2805 ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
2806 ; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2807 ; ALL-NEXT: vzeroupper
2808 ; ALL-NEXT: callq __truncdfhf2
2809 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
2810 ; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2811 ; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
2812 ; ALL-NEXT: vzeroupper
2813 ; ALL-NEXT: callq __truncdfhf2
2814 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
2815 ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2816 ; ALL-NEXT: # xmm0 = mem[1,0]
2817 ; ALL-NEXT: callq __truncdfhf2
2818 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
2819 ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2820 ; ALL-NEXT: # xmm0 = mem[1,0]
2821 ; ALL-NEXT: callq __truncdfhf2
2822 ; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp)
2823 ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
2824 ; ALL-NEXT: vmovaps %xmm0, (%rbx)
2825 ; ALL-NEXT: addq $80, %rsp
2826 ; ALL-NEXT: popq %rbx
2827 ; ALL-NEXT: retq
36852828 %1 = fptrunc <4 x double> %a0 to <4 x half>
36862829 %2 = bitcast <4 x half> %1 to <4 x i16>
36872830 %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
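store_cvt_4f64_to_8i16_zero combines both requirements: the four halves land in lanes 0-3 of the pointed-to <8 x i16>, and lanes 4-7 must be stored as zero, which the vmovsd before the final vmovaps in the new ALL body guarantees. A last sketch under the same assumed prototype and illustrative naming:

    #include <stdint.h>

    extern uint16_t __truncdfhf2(double); /* assumed prototype, as above */

    void store_cvt_4f64_to_8i16_zero_ref(const double in[4], uint16_t a1[8]) {
        int i;
        for (i = 0; i < 4; ++i)
            a1[i] = __truncdfhf2(in[i]);
        for (; i < 8; ++i)
            a1[i] = 0; /* zeroinitializer lanes must be stored as 0 */
    }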