llvm.org GIT mirror: llvm / 982005c
[X86][SSE] Shuffle mask decode support for zero extend, scalar float/double moves and integer load instructions

This patch adds shuffle mask decodes for integer zero extends (pmovzx** and movq xmm,xmm) and scalar float/double loads/moves (movss/movsd). Also adds shuffle mask decodes for integer loads (movd/movq).

Differential Revision: http://reviews.llvm.org/D7228

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@227688 91177308-0d34-0410-b5e6-96231b3b80d8

Simon Pilgrim, 5 years ago
10 changed files with 834 additions and 627 deletions.
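For illustration, two of the new shuffle comments as they appear in the updated tests below (both lines are quoted verbatim from the test diffs): a scalar double move and an integer zero extend.

; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero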
2020
2121 using namespace llvm;
2222
23 /// \brief Extracts the src/dst types for a given zero extension instruction.
24 /// \note While the number of elements in the DstVT type is correct, the
25 /// number in the SrcVT type is expanded to fill the src xmm register, so the
26 /// upper source elements may not be included in the dst xmm/ymm register.
27 static void getZeroExtensionTypes(const MCInst *MI, MVT &SrcVT, MVT &DstVT) {
28 switch (MI->getOpcode()) {
29 default:
30 llvm_unreachable("Unknown zero extension instruction");
31 // i8 zero extension
32 case X86::PMOVZXBWrm:
33 case X86::PMOVZXBWrr:
34 case X86::VPMOVZXBWrm:
35 case X86::VPMOVZXBWrr:
36 SrcVT = MVT::v16i8;
37 DstVT = MVT::v8i16;
38 break;
39 case X86::VPMOVZXBWYrm:
40 case X86::VPMOVZXBWYrr:
41 SrcVT = MVT::v16i8;
42 DstVT = MVT::v16i16;
43 break;
44 case X86::PMOVZXBDrm:
45 case X86::PMOVZXBDrr:
46 case X86::VPMOVZXBDrm:
47 case X86::VPMOVZXBDrr:
48 SrcVT = MVT::v16i8;
49 DstVT = MVT::v4i32;
50 break;
51 case X86::VPMOVZXBDYrm:
52 case X86::VPMOVZXBDYrr:
53 SrcVT = MVT::v16i8;
54 DstVT = MVT::v8i32;
55 break;
56 case X86::PMOVZXBQrm:
57 case X86::PMOVZXBQrr:
58 case X86::VPMOVZXBQrm:
59 case X86::VPMOVZXBQrr:
60 SrcVT = MVT::v16i8;
61 DstVT = MVT::v2i64;
62 break;
63 case X86::VPMOVZXBQYrm:
64 case X86::VPMOVZXBQYrr:
65 SrcVT = MVT::v16i8;
66 DstVT = MVT::v4i64;
67 break;
68 // i16 zero extension
69 case X86::PMOVZXWDrm:
70 case X86::PMOVZXWDrr:
71 case X86::VPMOVZXWDrm:
72 case X86::VPMOVZXWDrr:
73 SrcVT = MVT::v8i16;
74 DstVT = MVT::v4i32;
75 break;
76 case X86::VPMOVZXWDYrm:
77 case X86::VPMOVZXWDYrr:
78 SrcVT = MVT::v8i16;
79 DstVT = MVT::v8i32;
80 break;
81 case X86::PMOVZXWQrm:
82 case X86::PMOVZXWQrr:
83 case X86::VPMOVZXWQrm:
84 case X86::VPMOVZXWQrr:
85 SrcVT = MVT::v8i16;
86 DstVT = MVT::v2i64;
87 break;
88 case X86::VPMOVZXWQYrm:
89 case X86::VPMOVZXWQYrr:
90 SrcVT = MVT::v8i16;
91 DstVT = MVT::v4i64;
92 break;
93 // i32 zero extension
94 case X86::PMOVZXDQrm:
95 case X86::PMOVZXDQrr:
96 case X86::VPMOVZXDQrm:
97 case X86::VPMOVZXDQrr:
98 SrcVT = MVT::v4i32;
99 DstVT = MVT::v2i64;
100 break;
101 case X86::VPMOVZXDQYrm:
102 case X86::VPMOVZXDQYrr:
103 SrcVT = MVT::v4i32;
104 DstVT = MVT::v4i64;
105 break;
106 }
107 }
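// Illustrative note (not part of the patch): the SrcVT returned here always
// describes the full 128-bit source register. For PMOVZXBQrr this gives
// SrcVT = v16i8 and DstVT = v2i64, and DecodeZeroExtendMask (added in
// X86ShuffleDecode.cpp below) turns that into the 16-entry mask
//   {0, Z, Z, Z, Z, Z, Z, Z, 1, Z, Z, Z, Z, Z, Z, Z}   (Z = SM_SentinelZero)
// i.e. only the low two of the 16 source bytes are referenced, matching the
// "xmm0[0],zero,...,xmm0[1],zero,..." comments in the updated tests.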
108
23109 //===----------------------------------------------------------------------===//
24110 // Top Level Entrypoint
25111 //===----------------------------------------------------------------------===//
749835 ShuffleMask);
750836 DestName = getRegName(MI->getOperand(0).getReg());
751837 break;
838
839 case X86::MOVSDrr:
840 case X86::VMOVSDrr:
841 Src2Name = getRegName(MI->getOperand(2).getReg());
842 Src1Name = getRegName(MI->getOperand(1).getReg());
843 // FALL THROUGH.
844 case X86::MOVSDrm:
845 case X86::VMOVSDrm:
846 DecodeScalarMoveMask(MVT::v2f64, nullptr == Src2Name, ShuffleMask);
847 DestName = getRegName(MI->getOperand(0).getReg());
848 break;
849 case X86::MOVSSrr:
850 case X86::VMOVSSrr:
851 Src2Name = getRegName(MI->getOperand(2).getReg());
852 Src1Name = getRegName(MI->getOperand(1).getReg());
853 // FALL THROUGH.
854 case X86::MOVSSrm:
855 case X86::VMOVSSrm:
856 DecodeScalarMoveMask(MVT::v4f32, nullptr == Src2Name, ShuffleMask);
857 DestName = getRegName(MI->getOperand(0).getReg());
858 break;
859
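// Editorial note (not part of the patch): the "nullptr == Src2Name" argument
// doubles as the IsLoad flag. The rr forms above fall through after setting
// Src2Name, so IsLoad is false and the upper elements are kept from a register
// source; the rm forms reach the decode with Src2Name still null, so IsLoad is
// true and the upper elements decode to zero, matching the "= mem[0],zero"
// comments in the updated tests.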
860 case X86::MOVPQI2QIrr:
861 case X86::MOVZPQILo2PQIrr:
862 case X86::VMOVPQI2QIrr:
863 case X86::VMOVZPQILo2PQIrr:
864 Src1Name = getRegName(MI->getOperand(1).getReg());
865 // FALL THROUGH.
866 case X86::MOVQI2PQIrm:
867 case X86::MOVZQI2PQIrm:
868 case X86::MOVZPQILo2PQIrm:
869 case X86::VMOVQI2PQIrm:
870 case X86::VMOVZQI2PQIrm:
871 case X86::VMOVZPQILo2PQIrm:
872 DecodeZeroMoveLowMask(MVT::v2i64, ShuffleMask);
873 DestName = getRegName(MI->getOperand(0).getReg());
874 break;
875 case X86::MOVDI2PDIrm:
876 case X86::VMOVDI2PDIrm:
877 DecodeZeroMoveLowMask(MVT::v4i32, ShuffleMask);
878 DestName = getRegName(MI->getOperand(0).getReg());
879 break;
880
881 case X86::PMOVZXBWrr:
882 case X86::PMOVZXBDrr:
883 case X86::PMOVZXBQrr:
884 case X86::PMOVZXWDrr:
885 case X86::PMOVZXWQrr:
886 case X86::PMOVZXDQrr:
887 case X86::VPMOVZXBWrr:
888 case X86::VPMOVZXBDrr:
889 case X86::VPMOVZXBQrr:
890 case X86::VPMOVZXWDrr:
891 case X86::VPMOVZXWQrr:
892 case X86::VPMOVZXDQrr:
893 case X86::VPMOVZXBWYrr:
894 case X86::VPMOVZXBDYrr:
895 case X86::VPMOVZXBQYrr:
896 case X86::VPMOVZXWDYrr:
897 case X86::VPMOVZXWQYrr:
898 case X86::VPMOVZXDQYrr:
899 Src1Name = getRegName(MI->getOperand(1).getReg());
900 // FALL THROUGH.
901 case X86::PMOVZXBWrm:
902 case X86::PMOVZXBDrm:
903 case X86::PMOVZXBQrm:
904 case X86::PMOVZXWDrm:
905 case X86::PMOVZXWQrm:
906 case X86::PMOVZXDQrm:
907 case X86::VPMOVZXBWrm:
908 case X86::VPMOVZXBDrm:
909 case X86::VPMOVZXBQrm:
910 case X86::VPMOVZXWDrm:
911 case X86::VPMOVZXWQrm:
912 case X86::VPMOVZXDQrm:
913 case X86::VPMOVZXBWYrm:
914 case X86::VPMOVZXBDYrm:
915 case X86::VPMOVZXBQYrm:
916 case X86::VPMOVZXWDYrm:
917 case X86::VPMOVZXWQYrm:
918 case X86::VPMOVZXDQYrm: {
919 MVT SrcVT, DstVT;
920 getZeroExtensionTypes(MI, SrcVT, DstVT);
921 DecodeZeroExtendMask(SrcVT, DstVT, ShuffleMask);
922 DestName = getRegName(MI->getOperand(0).getReg());
923 } break;
752924 }
753925
754926 // The only comments we decode are shuffles, so give up if we were unable to
0 //===-- X86ShuffleDecode.cpp - X86 shuffle decode logic -------------------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Define several functions to decode x86 specific shuffle semantics into a
10 // generic vector mask.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "X86ShuffleDecode.h"
15 #include "llvm/IR/Constants.h"
16 #include "llvm/CodeGen/MachineValueType.h"
17
18 //===----------------------------------------------------------------------===//
19 // Vector Mask Decoding
20 //===----------------------------------------------------------------------===//
21
22 namespace llvm {
23
24 void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
25 // Default to copying the dest value.
26 ShuffleMask.push_back(0);
27 ShuffleMask.push_back(1);
28 ShuffleMask.push_back(2);
29 ShuffleMask.push_back(3);
30
31 // Decode the immediate.
32 unsigned ZMask = Imm & 15;
33 unsigned CountD = (Imm >> 4) & 3;
34 unsigned CountS = (Imm >> 6) & 3;
35
36 // CountS selects which input element to use.
37 unsigned InVal = 4+CountS;
38 // CountD specifies which element of destination to update.
39 ShuffleMask[CountD] = InVal;
40 // ZMask zaps values, potentially overriding the CountD elt.
41 if (ZMask & 1) ShuffleMask[0] = SM_SentinelZero;
42 if (ZMask & 2) ShuffleMask[1] = SM_SentinelZero;
43 if (ZMask & 4) ShuffleMask[2] = SM_SentinelZero;
44 if (ZMask & 8) ShuffleMask[3] = SM_SentinelZero;
45 }
46
47 // <3,1> or <6,7,2,3>
48 void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) {
49 for (unsigned i = NElts/2; i != NElts; ++i)
50 ShuffleMask.push_back(NElts+i);
51
52 for (unsigned i = NElts/2; i != NElts; ++i)
53 ShuffleMask.push_back(i);
54 }
55
56 // <0,2> or <0,1,4,5>
57 void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) {
58 for (unsigned i = 0; i != NElts/2; ++i)
59 ShuffleMask.push_back(i);
60
61 for (unsigned i = 0; i != NElts/2; ++i)
62 ShuffleMask.push_back(NElts+i);
63 }
64
65 void DecodeMOVSLDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
66 unsigned NumElts = VT.getVectorNumElements();
67 for (int i = 0, e = NumElts / 2; i < e; ++i) {
68 ShuffleMask.push_back(2 * i);
69 ShuffleMask.push_back(2 * i);
70 }
71 }
72
73 void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
74 unsigned NumElts = VT.getVectorNumElements();
75 for (int i = 0, e = NumElts / 2; i < e; ++i) {
76 ShuffleMask.push_back(2 * i + 1);
77 ShuffleMask.push_back(2 * i + 1);
7878 }
7979 }
8080
9595 void DecodePSLLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
9696 unsigned VectorSizeInBits = VT.getSizeInBits();
9797 unsigned NumElts = VectorSizeInBits / 8;
98 unsigned NumLanes = VectorSizeInBits / 128;
99 unsigned NumLaneElts = NumElts / NumLanes;
100
101 for (unsigned l = 0; l < NumElts; l += NumLaneElts)
102 for (unsigned i = 0; i < NumLaneElts; ++i) {
103 int M = SM_SentinelZero;
104 if (i >= Imm) M = i - Imm + l;
105 ShuffleMask.push_back(M);
106 }
107 }
108
109 void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
110 unsigned VectorSizeInBits = VT.getSizeInBits();
111 unsigned NumElts = VectorSizeInBits / 8;
112 unsigned NumLanes = VectorSizeInBits / 128;
113 unsigned NumLaneElts = NumElts / NumLanes;
114
115 for (unsigned l = 0; l < NumElts; l += NumLaneElts)
116 for (unsigned i = 0; i < NumLaneElts; ++i) {
117 unsigned Base = i + Imm;
118 int M = Base + l;
119 if (Base >= NumLaneElts) M = SM_SentinelZero;
120 ShuffleMask.push_back(M);
121 }
122 }
123
124 void DecodePALIGNRMask(MVT VT, unsigned Imm,
125 SmallVectorImpl<int> &ShuffleMask) {
126 unsigned NumElts = VT.getVectorNumElements();
127 unsigned Offset = Imm * (VT.getVectorElementType().getSizeInBits() / 8);
128
129 unsigned NumLanes = VT.getSizeInBits() / 128;
130 unsigned NumLaneElts = NumElts / NumLanes;
131
132 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
133 for (unsigned i = 0; i != NumLaneElts; ++i) {
134 unsigned Base = i + Offset;
135 // if i+offset is out of this lane then we actually need the other source
136 if (Base >= NumLaneElts) Base += NumElts - NumLaneElts;
137 ShuffleMask.push_back(Base + l);
138 }
139 }
140 }
141
142 /// DecodePSHUFMask - This decodes the shuffle masks for pshufd, and vpermilp*.
143 /// VT indicates the type of the vector allowing it to handle different
144 /// datatypes and vector widths.
145 void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
146 unsigned NumElts = VT.getVectorNumElements();
147
148 unsigned NumLanes = VT.getSizeInBits() / 128;
149 unsigned NumLaneElts = NumElts / NumLanes;
150
151 unsigned NewImm = Imm;
152 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
153 for (unsigned i = 0; i != NumLaneElts; ++i) {
154 ShuffleMask.push_back(NewImm % NumLaneElts + l);
155 NewImm /= NumLaneElts;
156 }
157 if (NumLaneElts == 4) NewImm = Imm; // reload imm
158 }
159 }
160
161 void DecodePSHUFHWMask(MVT VT, unsigned Imm,
162 SmallVectorImpl<int> &ShuffleMask) {
163 unsigned NumElts = VT.getVectorNumElements();
164
165 for (unsigned l = 0; l != NumElts; l += 8) {
166 unsigned NewImm = Imm;
167 for (unsigned i = 0, e = 4; i != e; ++i) {
168 ShuffleMask.push_back(l + i);
169 }
170 for (unsigned i = 4, e = 8; i != e; ++i) {
171 ShuffleMask.push_back(l + 4 + (NewImm & 3));
172 NewImm >>= 2;
173 }
174 }
175 }
176
177 void DecodePSHUFLWMask(MVT VT, unsigned Imm,
178 SmallVectorImpl<int> &ShuffleMask) {
179 unsigned NumElts = VT.getVectorNumElements();
180
181 for (unsigned l = 0; l != NumElts; l += 8) {
182 unsigned NewImm = Imm;
183 for (unsigned i = 0, e = 4; i != e; ++i) {
184 ShuffleMask.push_back(l + (NewImm & 3));
185 NewImm >>= 2;
186 }
187 for (unsigned i = 4, e = 8; i != e; ++i) {
188 ShuffleMask.push_back(l + i);
189 }
190 }
191 }
192
193 /// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates
194 /// the type of the vector allowing it to handle different datatypes and vector
195 /// widths.
196 void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
197 unsigned NumElts = VT.getVectorNumElements();
198
199 unsigned NumLanes = VT.getSizeInBits() / 128;
200 unsigned NumLaneElts = NumElts / NumLanes;
201
202 unsigned NewImm = Imm;
203 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
204 // each half of a lane comes from different source
205 for (unsigned s = 0; s != NumElts*2; s += NumElts) {
206 for (unsigned i = 0; i != NumLaneElts/2; ++i) {
207 ShuffleMask.push_back(NewImm % NumLaneElts + s + l);
208 NewImm /= NumLaneElts;
209 }
210 }
211 if (NumLaneElts == 4) NewImm = Imm; // reload imm
212 }
213 }
214
215 /// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd
216 /// and punpckh*. VT indicates the type of the vector allowing it to handle
217 /// different datatypes and vector widths.
218 void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
219 unsigned NumElts = VT.getVectorNumElements();
220
221 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
222 // independently on 128-bit lanes.
223 unsigned NumLanes = VT.getSizeInBits() / 128;
224 if (NumLanes == 0 ) NumLanes = 1; // Handle MMX
225 unsigned NumLaneElts = NumElts / NumLanes;
226
227 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
228 for (unsigned i = l + NumLaneElts/2, e = l + NumLaneElts; i != e; ++i) {
229 ShuffleMask.push_back(i); // Reads from dest/src1
230 ShuffleMask.push_back(i+NumElts); // Reads from src/src2
231 }
232 }
233 }
234
235 /// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd
236 /// and punpckl*. VT indicates the type of the vector allowing it to handle
237 /// different datatypes and vector widths.
238 void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
239 unsigned NumElts = VT.getVectorNumElements();
240
241 // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
242 // independently on 128-bit lanes.
243 unsigned NumLanes = VT.getSizeInBits() / 128;
244 if (NumLanes == 0 ) NumLanes = 1; // Handle MMX
245 unsigned NumLaneElts = NumElts / NumLanes;
246
247 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
248 for (unsigned i = l, e = l + NumLaneElts/2; i != e; ++i) {
249 ShuffleMask.push_back(i); // Reads from dest/src1
250 ShuffleMask.push_back(i+NumElts); // Reads from src/src2
251 }
252 }
253 }
254
255 void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
256 SmallVectorImpl<int> &ShuffleMask) {
257 if (Imm & 0x88)
258 return; // Not a shuffle
259
260 unsigned HalfSize = VT.getVectorNumElements()/2;
261
262 for (unsigned l = 0; l != 2; ++l) {
263 unsigned HalfBegin = ((Imm >> (l*4)) & 0x3) * HalfSize;
264 for (unsigned i = HalfBegin, e = HalfBegin+HalfSize; i != e; ++i)
265 ShuffleMask.push_back(i);
266 }
267 }
268
269 void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
270 Type *MaskTy = C->getType();
271 // It is not an error for the PSHUFB mask to not be a vector of i8 because the
272 // constant pool uniques constants by their bit representation.
273 // e.g. the following take up the same space in the constant pool:
274 //   i128 -170141183420855150465331762880109871104
275 //
276 //   <2 x i64> <i64 -9223372034707292160, i64 -9223372034707292160>
277 //
278 //   <4 x i32> <i32 -2147483648, i32 -2147483648,
279 //              i32 -2147483648, i32 -2147483648>
280
281 unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
282
283 if (MaskTySize != 128 && MaskTySize != 256) // FIXME: Add support for AVX-512.
284 return;
285
286 // This is a straightforward byte vector.
287 if (MaskTy->isVectorTy() && MaskTy->getVectorElementType()->isIntegerTy(8)) {
288 int NumElements = MaskTy->getVectorNumElements();
289 ShuffleMask.reserve(NumElements);
290
291 for (int i = 0; i < NumElements; ++i) {
292 // For AVX vectors with 32 bytes the base of the shuffle is the 16-byte
293 // lane of the vector we're inside.
294 int Base = i < 16 ? 0 : 16;
295 Constant *COp = C->getAggregateElement(i);
296 if (!COp) {
297 ShuffleMask.clear();
298 return;
299 } else if (isa<UndefValue>(COp)) {
300 ShuffleMask.push_back(SM_SentinelUndef);
301 continue;
302 }
303 uint64_t Element = cast<ConstantInt>(COp)->getZExtValue();
304 // If the high bit (7) of the byte is set, the element is zeroed.
305 if (Element & (1 << 7))
306 ShuffleMask.push_back(SM_SentinelZero);
307 else {
308 // Only the least significant 4 bits of the byte are used.
309 int Index = Base + (Element & 0xf);
310 ShuffleMask.push_back(Index);
311 }
312 }
313 }
314 // TODO: Handle funny-looking vectors too.
315 }
316
317 void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask,
318 SmallVectorImpl<int> &ShuffleMask) {
319 for (int i = 0, e = RawMask.size(); i < e; ++i) {
320 uint64_t M = RawMask[i];
321 if (M == (uint64_t)SM_SentinelUndef) {
322 ShuffleMask.push_back(M);
323 continue;
324 }
325 // For AVX vectors with 32 bytes the base of the shuffle is the half of
326 // the vector we're inside.
327 int Base = i < 16 ? 0 : 16;
328 // If the high bit (7) of the byte is set, the element is zeroed.
329 if (M & (1 << 7))
330 ShuffleMask.push_back(SM_SentinelZero);
331 else {
332 // Only the least significant 4 bits of the byte are used.
333 int Index = Base + (M & 0xf);
334 ShuffleMask.push_back(Index);
335 }
336 }
337 }
338
339 void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
340 int ElementBits = VT.getScalarSizeInBits();
341 int NumElements = VT.getVectorNumElements();
342 for (int i = 0; i < NumElements; ++i) {
343 // If there are more than 8 elements in the vector, then any immediate blend
344 // mask applies to each 128-bit lane. There can never be more than
345 // 8 elements in a 128-bit lane with an immediate blend.
346 int Bit = NumElements > 8 ? i % (128 / ElementBits) : i;
347 assert(Bit < 8 &&
348 "Immediate blends only operate over 8 elements at a time!");
349 ShuffleMask.push_back(((Imm >> Bit) & 1) ? NumElements + i : i);
350 }
351 }
352
353 /// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD.
354 /// No VT provided since it only works on 256-bit, 4 element vectors.
355 void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
356 for (unsigned i = 0; i != 4; ++i) {
357 ShuffleMask.push_back((Imm >> (2*i)) & 3);
358 }
359 }
360
361 void DecodeVPERMILPMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
362 Type *MaskTy = C->getType();
363 assert(MaskTy->isVectorTy() && "Expected a vector constant mask!");
364 assert(MaskTy->getVectorElementType()->isIntegerTy() &&
365 "Expected integer constant mask elements!");
366 int ElementBits = MaskTy->getScalarSizeInBits();
367 int NumElements = MaskTy->getVectorNumElements();
368 assert((NumElements == 2 || NumElements == 4 || NumElements == 8) &&
369 "Unexpected number of vector elements.");
370 ShuffleMask.reserve(NumElements);
371 if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) {
372 assert((unsigned)NumElements == CDS->getNumElements() &&
373 "Constant mask has a different number of elements!");
374
375 for (int i = 0; i < NumElements; ++i) {
376 int Base = (i * ElementBits / 128) * (128 / ElementBits);
377 uint64_t Element = CDS->getElementAsInteger(i);
378 // Only the least significant 2 bits of the integer are used.
379 int Index = Base + (Element & 0x3);
380 ShuffleMask.push_back(Index);
381 }
382 } else if (auto *CV = dyn_cast<ConstantVector>(C)) {
383 assert((unsigned)NumElements == C->getNumOperands() &&
384 "Constant mask has a different number of elements!");
385
386 for (int i = 0; i < NumElements; ++i) {
387 int Base = (i * ElementBits / 128) * (128 / ElementBits);
388 Constant *COp = CV->getOperand(i);
389 if (isa<UndefValue>(COp)) {
390 ShuffleMask.push_back(SM_SentinelUndef);
391 continue;
392 }
393 uint64_t Element = cast<ConstantInt>(COp)->getZExtValue();
394 // Only the least significant 2 bits of the integer are used.
395 int Index = Base + (Element & 0x3);
396 ShuffleMask.push_back(Index);
397 }
398 }
399 }
400
401 void DecodeZeroExtendMask(MVT SrcVT, MVT DstVT, SmallVectorImpl<int> &Mask) {
402 unsigned NumSrcElts = SrcVT.getVectorNumElements();
403 unsigned NumDstElts = DstVT.getVectorNumElements();
404 unsigned SrcScalarBits = SrcVT.getScalarSizeInBits();
405 unsigned DstScalarBits = DstVT.getScalarSizeInBits();
406 unsigned Scale = DstScalarBits / SrcScalarBits;
407 assert(SrcScalarBits < DstScalarBits &&
408 "Expected zero extension mask to increase scalar size");
409 assert(NumSrcElts >= NumDstElts && "Too many zero extension lanes");
410
411 for (unsigned i = 0; i != NumDstElts; i++) {
412 Mask.push_back(i);
413 for (unsigned j = 1; j != Scale; j++)
414 Mask.push_back(SM_SentinelZero);
415 }
416 }
417
418 void DecodeZeroMoveLowMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
419 unsigned NumElts = VT.getVectorNumElements();
420 ShuffleMask.push_back(0);
421 for (unsigned i = 1; i < NumElts; i++)
422 ShuffleMask.push_back(SM_SentinelZero);
423 }
424
425 void DecodeScalarMoveMask(MVT VT, bool IsLoad, SmallVectorImpl<int> &Mask) {
426 // The first element comes from the first element of the second source.
427 // Remaining elements: a load zero-extends them, a move copies them from the first source.
428 unsigned NumElts = VT.getVectorNumElements();
429 Mask.push_back(NumElts);
430 for (unsigned i = 1; i < NumElts; i++)
431 Mask.push_back(IsLoad ? SM_SentinelZero : i);
432 }
433 } // llvm namespace
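A minimal usage sketch of the three decoders added above (hedged illustration, not part of the patch; it assumes the llvm::SmallVector, MVT and SM_SentinelZero definitions already used in this file). Expected mask contents are shown as trailing comments:

  SmallVector<int, 16> Mask;
  DecodeZeroExtendMask(MVT::v4i32, MVT::v2i64, Mask);        // {0, SM_SentinelZero, 1, SM_SentinelZero}
  Mask.clear();
  DecodeZeroMoveLowMask(MVT::v2i64, Mask);                   // {0, SM_SentinelZero}
  Mask.clear();
  DecodeScalarMoveMask(MVT::v2f64, /*IsLoad=*/true, Mask);   // {2, SM_SentinelZero}  (movsd from memory)
  Mask.clear();
  DecodeScalarMoveMask(MVT::v2f64, /*IsLoad=*/false, Mask);  // {2, 1}                (movsd between registers)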
8686 /// No VT provided since it only works on 256-bit, 4 element vectors.
8787 void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
8888
89 /// \brief Decode a VPERMILP variable mask from an IR-level vector constant.
90 void DecodeVPERMILPMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask);
91
92 /// \brief Decode a zero extension instruction as a shuffle mask.
93 void DecodeZeroExtendMask(MVT SrcVT, MVT DstVT,
94 SmallVectorImpl<int> &ShuffleMask);
95
96 /// \brief Decode a move lower and zero upper instruction as a shuffle mask.
97 void DecodeZeroMoveLowMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
98
99 /// \brief Decode a scalar float move instruction as a shuffle mask.
100 void DecodeScalarMoveMask(MVT VT, bool IsLoad,
101 SmallVectorImpl<int> &ShuffleMask);
102 } // llvm namespace
103
104 #endif
54955495 IsUnary = true;
54965496 break;
54975497 case X86ISD::MOVSS:
5498 case X86ISD::MOVSD: {
5499 // The index 0 always comes from the first element of the second source,
5500 // this is why MOVSS and MOVSD are used in the first place. The other
5501 // elements come from the other positions of the first source vector
5502 Mask.push_back(NumElems);
5503 for (unsigned i = 1; i != NumElems; ++i) {
5504 Mask.push_back(i);
5505 }
5498 case X86ISD::MOVSD:
5499 DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
55065500 break;
5507 }
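// Editorial note (not part of the patch): with IsLoad == false the helper
// pushes NumElems followed by 1..NumElems-1, exactly the mask the hand-rolled
// loop above used to build, so the decoded mask is unchanged by this
// refactoring.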
55085501 case X86ISD::VPERM2X128:
55095502 ImmN = N->getOperand(N->getNumOperands()-1);
55105503 DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
2473924732
2474024733 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
2474124734 }
24742
24735
2474324736 SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
2474424737 Mld->getBasePtr(), NewMask, WideSrc0,
2474524738 Mld->getMemoryVT(), Mld->getMemOperand(),
2476924762 "Unexpected size for truncating masked store");
2477024763 // We are going to use the original vector elt for storing.
2477124764 // Accumulated smaller vector elements must be a multiple of the store size.
24772 assert (((NumElems * FromSz) % ToSz) == 0 &&
24765 assert (((NumElems * FromSz) % ToSz) == 0 &&
2477324766 "Unexpected ratio for truncating masked store");
2477424767
2477524768 unsigned SizeRatio = FromSz / ToSz;
349349 ; SSE2-NEXT: movdqa %xmm0, %xmm4
350350 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
351351 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
352 ; SSE2-NEXT: movsd %xmm4, %xmm3
352 ; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
353353 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
354354 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
355355 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
356356 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
357 ; SSE2-NEXT: movsd %xmm0, %xmm1
357 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
358358 ; SSE2-NEXT: packuswb %xmm3, %xmm1
359359 ; SSE2-NEXT: movdqa %xmm1, %xmm0
360360 ; SSE2-NEXT: retq
799799 ;
800800 ; SSE41-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
801801 ; SSE41: # BB#0:
802 ; SSE41-NEXT: pmovzxbq %xmm0, %xmm0
802 ; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
803803 ; SSE41-NEXT: retq
804804 ;
805805 ; AVX-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
806806 ; AVX: # BB#0:
807 ; AVX-NEXT: vpmovzxbq %xmm0, %xmm0
807 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
808808 ; AVX-NEXT: retq
809809 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32>
810810 ret <16 x i8> %shuffle
826826 ;
827827 ; SSE41-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
828828 ; SSE41: # BB#0:
829 ; SSE41-NEXT: pmovzxbq %xmm0, %xmm0
829 ; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
830830 ; SSE41-NEXT: retq
831831 ;
832832 ; AVX-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
833833 ; AVX: # BB#0:
834 ; AVX-NEXT: vpmovzxbq %xmm0, %xmm0
834 ; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
835835 ; AVX-NEXT: retq
836836 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32>
837837 ret <16 x i8> %shuffle
852852 ;
853853 ; SSE41-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
854854 ; SSE41: # BB#0:
855 ; SSE41-NEXT: pmovzxbd %xmm0, %xmm0
855 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
856856 ; SSE41-NEXT: retq
857857 ;
858858 ; AVX-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
859859 ; AVX: # BB#0:
860 ; AVX-NEXT: vpmovzxbd %xmm0, %xmm0
860 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
861861 ; AVX-NEXT: retq
862862 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32>
863863 ret <16 x i8> %shuffle
880880 ;
881881 ; SSE41-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
882882 ; SSE41: # BB#0:
883 ; SSE41-NEXT: pmovzxbd %xmm0, %xmm0
883 ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
884884 ; SSE41-NEXT: retq
885885 ;
886886 ; AVX-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
887887 ; AVX: # BB#0:
888 ; AVX-NEXT: vpmovzxbd %xmm0, %xmm0
888 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
889889 ; AVX-NEXT: retq
890890 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32>
891891 ret <16 x i8> %shuffle
904904 ;
905905 ; SSE41-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
906906 ; SSE41: # BB#0:
907 ; SSE41-NEXT: pmovzxbw %xmm0, %xmm0
907 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
908908 ; SSE41-NEXT: retq
909909 ;
910910 ; AVX-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
911911 ; AVX: # BB#0:
912 ; AVX-NEXT: vpmovzxbw %xmm0, %xmm0
912 ; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
913913 ; AVX-NEXT: retq
914914 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32>
915915 ret <16 x i8> %shuffle
930930 ;
931931 ; SSE41-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
932932 ; SSE41: # BB#0:
933 ; SSE41-NEXT: pmovzxbw %xmm0, %xmm0
933 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
934934 ; SSE41-NEXT: retq
935935 ;
936936 ; AVX-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
937937 ; AVX: # BB#0:
938 ; AVX-NEXT: vpmovzxbw %xmm0, %xmm0
938 ; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
939939 ; AVX-NEXT: retq
940940 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32>
941941 ret <16 x i8> %shuffle
210210 define <2 x double> @shuffle_v2f64_03(<2 x double> %a, <2 x double> %b) {
211211 ; SSE2-LABEL: shuffle_v2f64_03:
212212 ; SSE2: # BB#0:
213 ; SSE2-NEXT: movsd %xmm0, %xmm1
213 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
214214 ; SSE2-NEXT: movaps %xmm1, %xmm0
215215 ; SSE2-NEXT: retq
216216 ;
217217 ; SSE3-LABEL: shuffle_v2f64_03:
218218 ; SSE3: # BB#0:
219 ; SSE3-NEXT: movsd %xmm0, %xmm1
219 ; SSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
220220 ; SSE3-NEXT: movaps %xmm1, %xmm0
221221 ; SSE3-NEXT: retq
222222 ;
223223 ; SSSE3-LABEL: shuffle_v2f64_03:
224224 ; SSSE3: # BB#0:
225 ; SSSE3-NEXT: movsd %xmm0, %xmm1
225 ; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
226226 ; SSSE3-NEXT: movaps %xmm1, %xmm0
227227 ; SSSE3-NEXT: retq
228228 ;
241241 define <2 x double> @shuffle_v2f64_21(<2 x double> %a, <2 x double> %b) {
242242 ; SSE2-LABEL: shuffle_v2f64_21:
243243 ; SSE2: # BB#0:
244 ; SSE2-NEXT: movsd %xmm1, %xmm0
244 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
245245 ; SSE2-NEXT: retq
246246 ;
247247 ; SSE3-LABEL: shuffle_v2f64_21:
248248 ; SSE3: # BB#0:
249 ; SSE3-NEXT: movsd %xmm1, %xmm0
249 ; SSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
250250 ; SSE3-NEXT: retq
251251 ;
252252 ; SSSE3-LABEL: shuffle_v2f64_21:
253253 ; SSSE3: # BB#0:
254 ; SSSE3-NEXT: movsd %xmm1, %xmm0
254 ; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
255255 ; SSSE3-NEXT: retq
256256 ;
257257 ; SSE41-LABEL: shuffle_v2f64_21:
298298 define <2 x i64> @shuffle_v2i64_03(<2 x i64> %a, <2 x i64> %b) {
299299 ; SSE2-LABEL: shuffle_v2i64_03:
300300 ; SSE2: # BB#0:
301 ; SSE2-NEXT: movsd %xmm0, %xmm1
301 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
302302 ; SSE2-NEXT: movaps %xmm1, %xmm0
303303 ; SSE2-NEXT: retq
304304 ;
305305 ; SSE3-LABEL: shuffle_v2i64_03:
306306 ; SSE3: # BB#0:
307 ; SSE3-NEXT: movsd %xmm0, %xmm1
307 ; SSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
308308 ; SSE3-NEXT: movaps %xmm1, %xmm0
309309 ; SSE3-NEXT: retq
310310 ;
311311 ; SSSE3-LABEL: shuffle_v2i64_03:
312312 ; SSSE3: # BB#0:
313 ; SSSE3-NEXT: movsd %xmm0, %xmm1
313 ; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
314314 ; SSSE3-NEXT: movaps %xmm1, %xmm0
315315 ; SSSE3-NEXT: retq
316316 ;
334334 define <2 x i64> @shuffle_v2i64_03_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
335335 ; SSE2-LABEL: shuffle_v2i64_03_copy:
336336 ; SSE2: # BB#0:
337 ; SSE2-NEXT: movsd %xmm1, %xmm2
337 ; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
338338 ; SSE2-NEXT: movaps %xmm2, %xmm0
339339 ; SSE2-NEXT: retq
340340 ;
341341 ; SSE3-LABEL: shuffle_v2i64_03_copy:
342342 ; SSE3: # BB#0:
343 ; SSE3-NEXT: movsd %xmm1, %xmm2
343 ; SSE3-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
344344 ; SSE3-NEXT: movaps %xmm2, %xmm0
345345 ; SSE3-NEXT: retq
346346 ;
347347 ; SSSE3-LABEL: shuffle_v2i64_03_copy:
348348 ; SSSE3: # BB#0:
349 ; SSSE3-NEXT: movsd %xmm1, %xmm2
349 ; SSSE3-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
350350 ; SSSE3-NEXT: movaps %xmm2, %xmm0
351351 ; SSSE3-NEXT: retq
352352 ;
488488 define <2 x i64> @shuffle_v2i64_21(<2 x i64> %a, <2 x i64> %b) {
489489 ; SSE2-LABEL: shuffle_v2i64_21:
490490 ; SSE2: # BB#0:
491 ; SSE2-NEXT: movsd %xmm1, %xmm0
491 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
492492 ; SSE2-NEXT: retq
493493 ;
494494 ; SSE3-LABEL: shuffle_v2i64_21:
495495 ; SSE3: # BB#0:
496 ; SSE3-NEXT: movsd %xmm1, %xmm0
496 ; SSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
497497 ; SSE3-NEXT: retq
498498 ;
499499 ; SSSE3-LABEL: shuffle_v2i64_21:
500500 ; SSSE3: # BB#0:
501 ; SSSE3-NEXT: movsd %xmm1, %xmm0
501 ; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
502502 ; SSSE3-NEXT: retq
503503 ;
504504 ; SSE41-LABEL: shuffle_v2i64_21:
521521 define <2 x i64> @shuffle_v2i64_21_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
522522 ; SSE2-LABEL: shuffle_v2i64_21_copy:
523523 ; SSE2: # BB#0:
524 ; SSE2-NEXT: movsd %xmm2, %xmm1
524 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
525525 ; SSE2-NEXT: movaps %xmm1, %xmm0
526526 ; SSE2-NEXT: retq
527527 ;
528528 ; SSE3-LABEL: shuffle_v2i64_21_copy:
529529 ; SSE3: # BB#0:
530 ; SSE3-NEXT: movsd %xmm2, %xmm1
530 ; SSE3-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
531531 ; SSE3-NEXT: movaps %xmm1, %xmm0
532532 ; SSE3-NEXT: retq
533533 ;
534534 ; SSSE3-LABEL: shuffle_v2i64_21_copy:
535535 ; SSSE3: # BB#0:
536 ; SSSE3-NEXT: movsd %xmm2, %xmm1
536 ; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
537537 ; SSSE3-NEXT: movaps %xmm1, %xmm0
538538 ; SSSE3-NEXT: retq
539539 ;
649649 define <2 x i64> @shuffle_v2i64_0z(<2 x i64> %a) {
650650 ; SSE-LABEL: shuffle_v2i64_0z:
651651 ; SSE: # BB#0:
652 ; SSE-NEXT: movq %xmm0, %xmm0
652 ; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
653653 ; SSE-NEXT: retq
654654 ;
655655 ; AVX-LABEL: shuffle_v2i64_0z:
656656 ; AVX: # BB#0:
657 ; AVX-NEXT: vmovq %xmm0, %xmm0
657 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
658658 ; AVX-NEXT: retq
659659 %shuffle = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32>
660660 ret <2 x i64> %shuffle
692692 ; SSE2-LABEL: shuffle_v2i64_z1:
693693 ; SSE2: # BB#0:
694694 ; SSE2-NEXT: xorps %xmm1, %xmm1
695 ; SSE2-NEXT: movsd %xmm1, %xmm0
695 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
696696 ; SSE2-NEXT: retq
697697 ;
698698 ; SSE3-LABEL: shuffle_v2i64_z1:
699699 ; SSE3: # BB#0:
700700 ; SSE3-NEXT: xorps %xmm1, %xmm1
701 ; SSE3-NEXT: movsd %xmm1, %xmm0
701 ; SSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
702702 ; SSE3-NEXT: retq
703703 ;
704704 ; SSSE3-LABEL: shuffle_v2i64_z1:
705705 ; SSSE3: # BB#0:
706706 ; SSSE3-NEXT: xorps %xmm1, %xmm1
707 ; SSSE3-NEXT: movsd %xmm1, %xmm0
707 ; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
708708 ; SSSE3-NEXT: retq
709709 ;
710710 ; SSE41-LABEL: shuffle_v2i64_z1:
731731 define <2 x double> @shuffle_v2f64_0z(<2 x double> %a) {
732732 ; SSE-LABEL: shuffle_v2f64_0z:
733733 ; SSE: # BB#0:
734 ; SSE-NEXT: movq %xmm0, %xmm0
734 ; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
735735 ; SSE-NEXT: retq
736736 ;
737737 ; AVX-LABEL: shuffle_v2f64_0z:
738738 ; AVX: # BB#0:
739 ; AVX-NEXT: vmovq %xmm0, %xmm0
739 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
740740 ; AVX-NEXT: retq
741741 %shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32>
742742 ret <2 x double> %shuffle
779779 ; SSE2-LABEL: shuffle_v2f64_z1:
780780 ; SSE2: # BB#0:
781781 ; SSE2-NEXT: xorps %xmm1, %xmm1
782 ; SSE2-NEXT: movsd %xmm1, %xmm0
782 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
783783 ; SSE2-NEXT: retq
784784 ;
785785 ; SSE3-LABEL: shuffle_v2f64_z1:
786786 ; SSE3: # BB#0:
787787 ; SSE3-NEXT: xorps %xmm1, %xmm1
788 ; SSE3-NEXT: movsd %xmm1, %xmm0
788 ; SSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
789789 ; SSE3-NEXT: retq
790790 ;
791791 ; SSSE3-LABEL: shuffle_v2f64_z1:
792792 ; SSSE3: # BB#0:
793793 ; SSSE3-NEXT: xorps %xmm1, %xmm1
794 ; SSSE3-NEXT: movsd %xmm1, %xmm0
794 ; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
795795 ; SSSE3-NEXT: retq
796796 ;
797797 ; SSE41-LABEL: shuffle_v2f64_z1:
827827 define <2 x i64> @insert_mem_and_zero_v2i64(i64* %ptr) {
828828 ; SSE-LABEL: insert_mem_and_zero_v2i64:
829829 ; SSE: # BB#0:
830 ; SSE-NEXT: movq (%rdi), %xmm0
830 ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
831831 ; SSE-NEXT: retq
832832 ;
833833 ; AVX-LABEL: insert_mem_and_zero_v2i64:
834834 ; AVX: # BB#0:
835 ; AVX-NEXT: vmovq (%rdi), %xmm0
835 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
836836 ; AVX-NEXT: retq
837837 %a = load i64* %ptr
838838 %v = insertelement <2 x i64> undef, i64 %a, i32 0
843843 define <2 x double> @insert_reg_and_zero_v2f64(double %a) {
844844 ; SSE-LABEL: insert_reg_and_zero_v2f64:
845845 ; SSE: # BB#0:
846 ; SSE-NEXT: movq %xmm0, %xmm0
846 ; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
847847 ; SSE-NEXT: retq
848848 ;
849849 ; AVX-LABEL: insert_reg_and_zero_v2f64:
850850 ; AVX: # BB#0:
851 ; AVX-NEXT: vmovq %xmm0, %xmm0
851 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
852852 ; AVX-NEXT: retq
853853 %v = insertelement <2 x double> undef, double %a, i32 0
854854 %shuffle = shufflevector <2 x double> %v, <2 x double> zeroinitializer, <2 x i32>
858858 define <2 x double> @insert_mem_and_zero_v2f64(double* %ptr) {
859859 ; SSE-LABEL: insert_mem_and_zero_v2f64:
860860 ; SSE: # BB#0:
861 ; SSE-NEXT: movsd (%rdi), %xmm0
861 ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
862862 ; SSE-NEXT: retq
863863 ;
864864 ; AVX-LABEL: insert_mem_and_zero_v2f64:
865865 ; AVX: # BB#0:
866 ; AVX-NEXT: vmovsd (%rdi), %xmm0
866 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
867867 ; AVX-NEXT: retq
868868 %a = load double* %ptr
869869 %v = insertelement <2 x double> undef, double %a, i32 0
875875 ; SSE2-LABEL: insert_reg_lo_v2i64:
876876 ; SSE2: # BB#0:
877877 ; SSE2-NEXT: movd %rdi, %xmm1
878 ; SSE2-NEXT: movsd %xmm1, %xmm0
878 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
879879 ; SSE2-NEXT: retq
880880 ;
881881 ; SSE3-LABEL: insert_reg_lo_v2i64:
882882 ; SSE3: # BB#0:
883883 ; SSE3-NEXT: movd %rdi, %xmm1
884 ; SSE3-NEXT: movsd %xmm1, %xmm0
884 ; SSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
885885 ; SSE3-NEXT: retq
886886 ;
887887 ; SSSE3-LABEL: insert_reg_lo_v2i64:
888888 ; SSSE3: # BB#0:
889889 ; SSSE3-NEXT: movd %rdi, %xmm1
890 ; SSSE3-NEXT: movsd %xmm1, %xmm0
890 ; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
891891 ; SSSE3-NEXT: retq
892892 ;
893893 ; SSE41-LABEL: insert_reg_lo_v2i64:
930930 ;
931931 ; SSE41-LABEL: insert_mem_lo_v2i64:
932932 ; SSE41: # BB#0:
933 ; SSE41-NEXT: movq (%rdi), %xmm1
933 ; SSE41-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
934934 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
935935 ; SSE41-NEXT: retq
936936 ;
937937 ; AVX1-LABEL: insert_mem_lo_v2i64:
938938 ; AVX1: # BB#0:
939 ; AVX1-NEXT: vmovq (%rdi), %xmm1
939 ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
940940 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
941941 ; AVX1-NEXT: retq
942942 ;
943943 ; AVX2-LABEL: insert_mem_lo_v2i64:
944944 ; AVX2: # BB#0:
945 ; AVX2-NEXT: vmovq (%rdi), %xmm1
945 ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
946946 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
947947 ; AVX2-NEXT: retq
948948 %a = load i64* %ptr
971971 define <2 x i64> @insert_mem_hi_v2i64(i64* %ptr, <2 x i64> %b) {
972972 ; SSE-LABEL: insert_mem_hi_v2i64:
973973 ; SSE: # BB#0:
974 ; SSE-NEXT: movq (%rdi), %xmm1
974 ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
975975 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
976976 ; SSE-NEXT: retq
977977 ;
978978 ; AVX-LABEL: insert_mem_hi_v2i64:
979979 ; AVX: # BB#0:
980 ; AVX-NEXT: vmovq (%rdi), %xmm1
980 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
981981 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
982982 ; AVX-NEXT: retq
983983 %a = load i64* %ptr
989989 define <2 x double> @insert_reg_lo_v2f64(double %a, <2 x double> %b) {
990990 ; SSE-LABEL: insert_reg_lo_v2f64:
991991 ; SSE: # BB#0:
992 ; SSE-NEXT: movsd %xmm0, %xmm1
992 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
993993 ; SSE-NEXT: movaps %xmm1, %xmm0
994994 ; SSE-NEXT: retq
995995 ;
996996 ; AVX-LABEL: insert_reg_lo_v2f64:
997997 ; AVX: # BB#0:
998 ; AVX-NEXT: vmovsd %xmm0, %xmm1, %xmm0
998 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
999999 ; AVX-NEXT: retq
10001000 %v = insertelement <2 x double> undef, double %a, i32 0
10011001 %shuffle = shufflevector <2 x double> %v, <2 x double> %b, <2 x i32>
10841084 define <2 x double> @insert_dup_mem_v2f64(double* %ptr) {
10851085 ; SSE2-LABEL: insert_dup_mem_v2f64:
10861086 ; SSE2: # BB#0:
1087 ; SSE2-NEXT: movsd (%rdi), %xmm0
1087 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
10881088 ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
10891089 ; SSE2-NEXT: retq
10901090 ;
440440 ; SSE2-LABEL: shuffle_v4f32_4zzz:
441441 ; SSE2: # BB#0:
442442 ; SSE2-NEXT: xorps %xmm1, %xmm1
443 ; SSE2-NEXT: movss %xmm0, %xmm1
443 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
444444 ; SSE2-NEXT: movaps %xmm1, %xmm0
445445 ; SSE2-NEXT: retq
446446 ;
447447 ; SSE3-LABEL: shuffle_v4f32_4zzz:
448448 ; SSE3: # BB#0:
449449 ; SSE3-NEXT: xorps %xmm1, %xmm1
450 ; SSE3-NEXT: movss %xmm0, %xmm1
450 ; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
451451 ; SSE3-NEXT: movaps %xmm1, %xmm0
452452 ; SSE3-NEXT: retq
453453 ;
454454 ; SSSE3-LABEL: shuffle_v4f32_4zzz:
455455 ; SSSE3: # BB#0:
456456 ; SSSE3-NEXT: xorps %xmm1, %xmm1
457 ; SSSE3-NEXT: movss %xmm0, %xmm1
457 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
458458 ; SSSE3-NEXT: movaps %xmm1, %xmm0
459459 ; SSSE3-NEXT: retq
460460 ;
660660 ; SSE2-LABEL: shuffle_v4i32_4zzz:
661661 ; SSE2: # BB#0:
662662 ; SSE2-NEXT: xorps %xmm1, %xmm1
663 ; SSE2-NEXT: movss %xmm0, %xmm1
663 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
664664 ; SSE2-NEXT: movaps %xmm1, %xmm0
665665 ; SSE2-NEXT: retq
666666 ;
667667 ; SSE3-LABEL: shuffle_v4i32_4zzz:
668668 ; SSE3: # BB#0:
669669 ; SSE3-NEXT: xorps %xmm1, %xmm1
670 ; SSE3-NEXT: movss %xmm0, %xmm1
670 ; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
671671 ; SSE3-NEXT: movaps %xmm1, %xmm0
672672 ; SSE3-NEXT: retq
673673 ;
674674 ; SSSE3-LABEL: shuffle_v4i32_4zzz:
675675 ; SSSE3: # BB#0:
676676 ; SSSE3-NEXT: xorps %xmm1, %xmm1
677 ; SSSE3-NEXT: movss %xmm0, %xmm1
677 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
678678 ; SSSE3-NEXT: movaps %xmm1, %xmm0
679679 ; SSSE3-NEXT: retq
680680 ;
697697 ; SSE2-LABEL: shuffle_v4i32_z4zz:
698698 ; SSE2: # BB#0:
699699 ; SSE2-NEXT: xorps %xmm1, %xmm1
700 ; SSE2-NEXT: movss %xmm0, %xmm1
700 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
701701 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
702702 ; SSE2-NEXT: retq
703703 ;
704704 ; SSE3-LABEL: shuffle_v4i32_z4zz:
705705 ; SSE3: # BB#0:
706706 ; SSE3-NEXT: xorps %xmm1, %xmm1
707 ; SSE3-NEXT: movss %xmm0, %xmm1
707 ; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
708708 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
709709 ; SSE3-NEXT: retq
710710 ;
711711 ; SSSE3-LABEL: shuffle_v4i32_z4zz:
712712 ; SSSE3: # BB#0:
713713 ; SSSE3-NEXT: xorps %xmm1, %xmm1
714 ; SSSE3-NEXT: movss %xmm0, %xmm1
714 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
715715 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
716716 ; SSSE3-NEXT: retq
717717 ;
736736 ; SSE2-LABEL: shuffle_v4i32_zz4z:
737737 ; SSE2: # BB#0:
738738 ; SSE2-NEXT: xorps %xmm1, %xmm1
739 ; SSE2-NEXT: movss %xmm0, %xmm1
739 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
740740 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
741741 ; SSE2-NEXT: retq
742742 ;
743743 ; SSE3-LABEL: shuffle_v4i32_zz4z:
744744 ; SSE3: # BB#0:
745745 ; SSE3-NEXT: xorps %xmm1, %xmm1
746 ; SSE3-NEXT: movss %xmm0, %xmm1
746 ; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
747747 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
748748 ; SSE3-NEXT: retq
749749 ;
750750 ; SSSE3-LABEL: shuffle_v4i32_zz4z:
751751 ; SSSE3: # BB#0:
752752 ; SSSE3-NEXT: xorps %xmm1, %xmm1
753 ; SSSE3-NEXT: movss %xmm0, %xmm1
753 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
754754 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
755755 ; SSSE3-NEXT: retq
756756 ;
10321032 ;
10331033 ; SSE41-LABEL: shuffle_v4i32_0u1u:
10341034 ; SSE41: # BB#0:
1035 ; SSE41-NEXT: pmovzxdq %xmm0, %xmm0
1035 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
10361036 ; SSE41-NEXT: retq
10371037 ;
10381038 ; AVX-LABEL: shuffle_v4i32_0u1u:
10391039 ; AVX: # BB#0:
1040 ; AVX-NEXT: vpmovzxdq %xmm0, %xmm0
1040 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
10411041 ; AVX-NEXT: retq
10421042 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32>
10431043 ret <4 x i32> %shuffle
10641064 ;
10651065 ; SSE41-LABEL: shuffle_v4i32_0z1z:
10661066 ; SSE41: # BB#0:
1067 ; SSE41-NEXT: pmovzxdq %xmm0, %xmm0
1067 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
10681068 ; SSE41-NEXT: retq
10691069 ;
10701070 ; AVX-LABEL: shuffle_v4i32_0z1z:
10711071 ; AVX: # BB#0:
1072 ; AVX-NEXT: vpmovzxdq %xmm0, %xmm0
1072 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
10731073 ; AVX-NEXT: retq
10741074 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32>
10751075 ret <4 x i32> %shuffle
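The pmovzxdq checks above encode the zero-extension mask: in units of the narrow source element, each source lane is followed by DstBits/SrcBits - 1 zero lanes. A standalone sketch under that assumption (zeroExtendMask and formatMask are illustrative names, not the in-tree decoder) that reproduces the dq, wd and wq comment strings seen in these tests:

// Standalone sketch: build a PMOVZX-style mask in narrow-element units and
// print it in the same form as the new assembly comments.
#include <iostream>
#include <string>
#include <vector>

static const int kZero = -1;

static std::vector<int> zeroExtendMask(unsigned SrcBits, unsigned DstBits,
                                       unsigned DstElts) {
  std::vector<int> Mask;
  unsigned ZerosPerElt = DstBits / SrcBits - 1;
  for (unsigned I = 0; I != DstElts; ++I) {
    Mask.push_back(I);                           // the extended source element
    Mask.insert(Mask.end(), ZerosPerElt, kZero); // its zero-filled high part
  }
  return Mask;
}

static std::string formatMask(const std::vector<int> &Mask) {
  std::string S;
  for (size_t I = 0; I != Mask.size(); ++I) {
    if (I) S += ',';
    S += (Mask[I] == kZero) ? "zero" : "xmm0[" + std::to_string(Mask[I]) + "]";
  }
  return S;
}

int main() {
  std::cout << formatMask(zeroExtendMask(32, 64, 2)) << '\n'; // pmovzxdq: xmm0[0],zero,xmm0[1],zero
  std::cout << formatMask(zeroExtendMask(16, 32, 4)) << '\n'; // pmovzxwd
  std::cout << formatMask(zeroExtendMask(16, 64, 2)) << '\n'; // pmovzxwq
}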
10931093 define <4 x i32> @insert_mem_and_zero_v4i32(i32* %ptr) {
10941094 ; SSE-LABEL: insert_mem_and_zero_v4i32:
10951095 ; SSE: # BB#0:
1096 ; SSE-NEXT: movd (%rdi), %xmm0
1096 ; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
10971097 ; SSE-NEXT: retq
10981098 ;
10991099 ; AVX-LABEL: insert_mem_and_zero_v4i32:
11001100 ; AVX: # BB#0:
1101 ; AVX-NEXT: vmovd (%rdi), %xmm0
1101 ; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
11021102 ; AVX-NEXT: retq
11031103 %a = load i32* %ptr
11041104 %v = insertelement <4 x i32> undef, i32 %a, i32 0
11101110 ; SSE2-LABEL: insert_reg_and_zero_v4f32:
11111111 ; SSE2: # BB#0:
11121112 ; SSE2-NEXT: xorps %xmm1, %xmm1
1113 ; SSE2-NEXT: movss %xmm0, %xmm1
1113 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
11141114 ; SSE2-NEXT: movaps %xmm1, %xmm0
11151115 ; SSE2-NEXT: retq
11161116 ;
11171117 ; SSE3-LABEL: insert_reg_and_zero_v4f32:
11181118 ; SSE3: # BB#0:
11191119 ; SSE3-NEXT: xorps %xmm1, %xmm1
1120 ; SSE3-NEXT: movss %xmm0, %xmm1
1120 ; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
11211121 ; SSE3-NEXT: movaps %xmm1, %xmm0
11221122 ; SSE3-NEXT: retq
11231123 ;
11241124 ; SSSE3-LABEL: insert_reg_and_zero_v4f32:
11251125 ; SSSE3: # BB#0:
11261126 ; SSSE3-NEXT: xorps %xmm1, %xmm1
1127 ; SSSE3-NEXT: movss %xmm0, %xmm1
1127 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
11281128 ; SSSE3-NEXT: movaps %xmm1, %xmm0
11291129 ; SSSE3-NEXT: retq
11301130 ;
11371137 ; AVX-LABEL: insert_reg_and_zero_v4f32:
11381138 ; AVX: # BB#0:
11391139 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
1140 ; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0
1140 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
11411141 ; AVX-NEXT: retq
11421142 %v = insertelement <4 x float> undef, float %a, i32 0
11431143 %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32>
11471147 define <4 x float> @insert_mem_and_zero_v4f32(float* %ptr) {
11481148 ; SSE-LABEL: insert_mem_and_zero_v4f32:
11491149 ; SSE: # BB#0:
1150 ; SSE-NEXT: movss (%rdi), %xmm0
1150 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
11511151 ; SSE-NEXT: retq
11521152 ;
11531153 ; AVX-LABEL: insert_mem_and_zero_v4f32:
11541154 ; AVX: # BB#0:
1155 ; AVX-NEXT: vmovss (%rdi), %xmm0
1155 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
11561156 ; AVX-NEXT: retq
11571157 %a = load float* %ptr
11581158 %v = insertelement <4 x float> undef, float %a, i32 0
11641164 ; SSE2-LABEL: insert_reg_lo_v4i32:
11651165 ; SSE2: # BB#0:
11661166 ; SSE2-NEXT: movd %rdi, %xmm1
1167 ; SSE2-NEXT: movsd %xmm1, %xmm0
1167 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
11681168 ; SSE2-NEXT: retq
11691169 ;
11701170 ; SSE3-LABEL: insert_reg_lo_v4i32:
11711171 ; SSE3: # BB#0:
11721172 ; SSE3-NEXT: movd %rdi, %xmm1
1173 ; SSE3-NEXT: movsd %xmm1, %xmm0
1173 ; SSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
11741174 ; SSE3-NEXT: retq
11751175 ;
11761176 ; SSSE3-LABEL: insert_reg_lo_v4i32:
11771177 ; SSSE3: # BB#0:
11781178 ; SSSE3-NEXT: movd %rdi, %xmm1
1179 ; SSSE3-NEXT: movsd %xmm1, %xmm0
1179 ; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
11801180 ; SSSE3-NEXT: retq
11811181 ;
11821182 ; SSE41-LABEL: insert_reg_lo_v4i32:
12201220 ;
12211221 ; SSE41-LABEL: insert_mem_lo_v4i32:
12221222 ; SSE41: # BB#0:
1223 ; SSE41-NEXT: movq (%rdi), %xmm1
1223 ; SSE41-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
12241224 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
12251225 ; SSE41-NEXT: retq
12261226 ;
12271227 ; AVX1-LABEL: insert_mem_lo_v4i32:
12281228 ; AVX1: # BB#0:
1229 ; AVX1-NEXT: vmovq (%rdi), %xmm1
1229 ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
12301230 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
12311231 ; AVX1-NEXT: retq
12321232 ;
12331233 ; AVX2-LABEL: insert_mem_lo_v4i32:
12341234 ; AVX2: # BB#0:
1235 ; AVX2-NEXT: vmovq (%rdi), %xmm1
1235 ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
12361236 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
12371237 ; AVX2-NEXT: retq
12381238 %a = load <2 x i32>* %ptr
12621262 define <4 x i32> @insert_mem_hi_v4i32(<2 x i32>* %ptr, <4 x i32> %b) {
12631263 ; SSE-LABEL: insert_mem_hi_v4i32:
12641264 ; SSE: # BB#0:
1265 ; SSE-NEXT: movq (%rdi), %xmm1
1265 ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
12661266 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
12671267 ; SSE-NEXT: retq
12681268 ;
12691269 ; AVX-LABEL: insert_mem_hi_v4i32:
12701270 ; AVX: # BB#0:
1271 ; AVX-NEXT: vmovq (%rdi), %xmm1
1271 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
12721272 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
12731273 ; AVX-NEXT: retq
12741274 %a = load <2 x i32>* %ptr
12801280 define <4 x float> @insert_reg_lo_v4f32(double %a, <4 x float> %b) {
12811281 ; SSE-LABEL: insert_reg_lo_v4f32:
12821282 ; SSE: # BB#0:
1283 ; SSE-NEXT: movsd %xmm0, %xmm1
1283 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
12841284 ; SSE-NEXT: movaps %xmm1, %xmm0
12851285 ; SSE-NEXT: retq
12861286 ;
12871287 ; AVX-LABEL: insert_reg_lo_v4f32:
12881288 ; AVX: # BB#0:
1289 ; AVX-NEXT: vmovsd %xmm0, %xmm1, %xmm0
1289 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
12901290 ; AVX-NEXT: retq
12911291 %a.cast = bitcast double %a to <2 x float>
12921292 %v = shufflevector <2 x float> %a.cast, <2 x float> undef, <4 x i32>
18281828 ;
18291829 ; SSE41-LABEL: shuffle_v8i16_0uuu1uuu:
18301830 ; SSE41: # BB#0:
1831 ; SSE41-NEXT: pmovzxwq %xmm0, %xmm0
1831 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
18321832 ; SSE41-NEXT: retq
18331833 ;
18341834 ; AVX-LABEL: shuffle_v8i16_0uuu1uuu:
18351835 ; AVX: # BB#0:
1836 ; AVX-NEXT: vpmovzxwq %xmm0, %xmm0
1836 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
18371837 ; AVX-NEXT: retq
18381838 %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32>
18391839 ret <8 x i16> %shuffle
18561856 ;
18571857 ; SSE41-LABEL: shuffle_v8i16_0zzz1zzz:
18581858 ; SSE41: # BB#0:
1859 ; SSE41-NEXT: pmovzxwq %xmm0, %xmm0
1859 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
18601860 ; SSE41-NEXT: retq
18611861 ;
18621862 ; AVX-LABEL: shuffle_v8i16_0zzz1zzz:
18631863 ; AVX: # BB#0:
1864 ; AVX-NEXT: vpmovzxwq %xmm0, %xmm0
1864 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
18651865 ; AVX-NEXT: retq
18661866 %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32>
18671867 ret <8 x i16> %shuffle
18801880 ;
18811881 ; SSE41-LABEL: shuffle_v8i16_0u1u2u3u:
18821882 ; SSE41: # BB#0:
1883 ; SSE41-NEXT: pmovzxwd %xmm0, %xmm0
1883 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
18841884 ; SSE41-NEXT: retq
18851885 ;
18861886 ; AVX-LABEL: shuffle_v8i16_0u1u2u3u:
18871887 ; AVX: # BB#0:
1888 ; AVX-NEXT: vpmovzxwd %xmm0, %xmm0
1888 ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
18891889 ; AVX-NEXT: retq
18901890 %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32>
18911891 ret <8 x i16> %shuffle
19061906 ;
19071907 ; SSE41-LABEL: shuffle_v8i16_0z1z2z3z:
19081908 ; SSE41: # BB#0:
1909 ; SSE41-NEXT: pmovzxwd %xmm0, %xmm0
1909 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
19101910 ; SSE41-NEXT: retq
19111911 ;
19121912 ; AVX-LABEL: shuffle_v8i16_0z1z2z3z:
19131913 ; AVX: # BB#0:
1914 ; AVX-NEXT: vpmovzxwd %xmm0, %xmm0
1914 ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
19151915 ; AVX-NEXT: retq
19161916 %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32>
19171917 ret <8 x i16> %shuffle
22
33 target triple = "x86_64-unknown-unknown"
44
5 define <4 x double> @shuffle_v4f64_0000(<4 x double> %a, <4 x double> %b) {
6 ; AVX1-LABEL: shuffle_v4f64_0000:
7 ; AVX1: # BB#0:
8 ; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
9 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
10 ; AVX1-NEXT: retq
11 ;
1212 ; AVX2-LABEL: shuffle_v4f64_0000:
1313 ; AVX2: # BB#0:
1414 ; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
1717 ret <4 x double> %shuffle
1818 }
1919
20 define <4 x double> @shuffle_v4f64_0001(<4 x double> %a, <4 x double> %b) {
21 ; AVX1-LABEL: shuffle_v4f64_0001:
22 ; AVX1: # BB#0:
23 ; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
24 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
25 ; AVX1-NEXT: retq
26 ;
2727 ; AVX2-LABEL: shuffle_v4f64_0001:
2828 ; AVX2: # BB#0:
2929 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1]
3434
3535 define <4 x double> @shuffle_v4f64_0020(<4 x double> %a, <4 x double> %b) {
3636 ; AVX1-LABEL: shuffle_v4f64_0020:
37 ; AVX1: # BB#0:
38 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
39 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
40 ; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
41 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
42 ; AVX1-NEXT: retq
43 ;
4444 ; AVX2-LABEL: shuffle_v4f64_0020:
4545 ; AVX2: # BB#0:
4646 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,0]
6666 }
6767
6868 define <4 x double> @shuffle_v4f64_1000(<4 x double> %a, <4 x double> %b) {
69 ; AVX1-LABEL: shuffle_v4f64_1000:
70 ; AVX1: # BB#0:
71 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
72 ; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
73 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
74 ; AVX1-NEXT: retq
75 ;
7676 ; AVX2-LABEL: shuffle_v4f64_1000:
7777 ; AVX2: # BB#0:
7878 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0]
8282 }
8383
8484 define <4 x double> @shuffle_v4f64_2200(<4 x double> %a, <4 x double> %b) {
85 ; AVX1-LABEL: shuffle_v4f64_2200:
86 ; AVX1: # BB#0:
87 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
88 ; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
89 ; AVX1-NEXT: retq
90 ;
91 ; AVX2-LABEL: shuffle_v4f64_2200:
9292 ; AVX2: # BB#0:
9393 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0]
9494 ; AVX2-NEXT: retq
137137 ret <4 x double> %shuffle
138138 }
139139
140 define <4 x double> @shuffle_v4f64_0022(<4 x double> %a, <4 x double> %b) {
141 ; ALL-LABEL: shuffle_v4f64_0022:
142 ; ALL: # BB#0:
143 ; ALL-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
144 ; ALL-NEXT: retq
145 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32>
146 ret <4 x double> %shuffle
147147 }
148148
149149 define <4 x double> @shuffle_v4f64_1032(<4 x double> %a, <4 x double> %b) {
182182 ret <4 x double> %shuffle
183183 }
184184
185 define <4 x double> @shuffle_v4f64_0423(<4 x double> %a, <4 x double> %b) {
186 ; AVX1-LABEL: shuffle_v4f64_0423:
187 ; AVX1: # BB#0:
188 ; AVX1-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
189 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
190 ; AVX1-NEXT: retq
191 ;
192192 ; AVX2-LABEL: shuffle_v4f64_0423:
193193 ; AVX2: # BB#0:
194194 ; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1
198198 ret <4 x double> %shuffle
199199 }
200200
201 define <4 x double> @shuffle_v4f64_0462(<4 x double> %a, <4 x double> %b) {
202 ; ALL-LABEL: shuffle_v4f64_0462:
203 ; ALL: # BB#0:
204 ; ALL-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
205 ; ALL-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
206 ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
207 ; ALL-NEXT: retq
208 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32>
209209 ret <4 x double> %shuffle
210210 }
211211
357357 ret <4 x double> %shuffle
358358 }
359359
360 define <4 x i64> @shuffle_v4i64_0000(<4 x i64> %a, <4 x i64> %b) {
361 ; AVX1-LABEL: shuffle_v4i64_0000:
362 ; AVX1: # BB#0:
363 ; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
364 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
365 ; AVX1-NEXT: retq
366 ;
367367 ; AVX2-LABEL: shuffle_v4i64_0000:
368368 ; AVX2: # BB#0:
369369 ; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
372372 ret <4 x i64> %shuffle
373373 }
374374
375 define <4 x i64> @shuffle_v4i64_0001(<4 x i64> %a, <4 x i64> %b) {
376 ; AVX1-LABEL: shuffle_v4i64_0001:
377 ; AVX1: # BB#0:
378 ; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
379 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
380 ; AVX1-NEXT: retq
381 ;
382382 ; AVX2-LABEL: shuffle_v4i64_0001:
383383 ; AVX2: # BB#0:
384384 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
389389
390390 define <4 x i64> @shuffle_v4i64_0020(<4 x i64> %a, <4 x i64> %b) {
391391 ; AVX1-LABEL: shuffle_v4i64_0020:
392 ; AVX1: # BB#0:
393 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
394 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
395 ; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
396 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
397 ; AVX1-NEXT: retq
398 ;
399399 ; AVX2-LABEL: shuffle_v4i64_0020:
400400 ; AVX2: # BB#0:
401401 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,0]
437437 }
438438
439439 define <4 x i64> @shuffle_v4i64_1000(<4 x i64> %a, <4 x i64> %b) {
440 ; AVX1-LABEL: shuffle_v4i64_1000:
441 ; AVX1: # BB#0:
442 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
443 ; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
444 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
445 ; AVX1-NEXT: retq
446 ;
447447 ; AVX2-LABEL: shuffle_v4i64_1000:
448448 ; AVX2: # BB#0:
449449 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,0,0]
453453 }
454454
455455 define <4 x i64> @shuffle_v4i64_2200(<4 x i64> %a, <4 x i64> %b) {
456 ; AVX1-LABEL: shuffle_v4i64_2200:
457 ; AVX1: # BB#0:
458 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
459 ; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
460 ; AVX1-NEXT: retq
461 ;
462 ; AVX2-LABEL: shuffle_v4i64_2200:
463463 ; AVX2: # BB#0:
464464 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,0,0]
465465 ; AVX2-NEXT: retq
499499 ret <4 x i64> %shuffle
500500 }
501501
502 define <4 x i64> @shuffle_v4i64_0124(<4 x i64> %a, <4 x i64> %b) {
503 ; AVX1-LABEL: shuffle_v4i64_0124:
504 ; AVX1: # BB#0:
505 ; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
506 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
507 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
508 ; AVX1-NEXT: retq
509509 ;
510510 ; AVX2-LABEL: shuffle_v4i64_0124:
511511 ; AVX2: # BB#0:
537537 define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) {
538538 ; AVX1-LABEL: shuffle_v4i64_0412:
539539 ; AVX1: # BB#0:
540 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
541 ; AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1],xmm2[0]
542 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
543 ; AVX1-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
544 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
545 ; AVX1-NEXT: retq
546 ;
547547 ; AVX2-LABEL: shuffle_v4i64_0412:
548548 ; AVX2: # BB#0:
549549 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]
556556
557557 define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) {
558558 ; AVX1-LABEL: shuffle_v4i64_4012:
559 ; AVX1: # BB#0:
560 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
561 ; AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1],xmm2[0]
562 ; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
563 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
564 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
565 ; AVX1-NEXT: retq
566566 ;
567567 ; AVX2-LABEL: shuffle_v4i64_4012:
568568 ; AVX2: # BB#0:
793793 define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) {
794794 ; AVX1-LABEL: insert_mem_and_zero_v4i64:
795795 ; AVX1: # BB#0:
796 ; AVX1-NEXT: vmovq (%rdi), %xmm0
796 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
797797 ; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1
798798 ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
799799 ; AVX1-NEXT: retq
800800 ;
801801 ; AVX2-LABEL: insert_mem_and_zero_v4i64:
802802 ; AVX2: # BB#0:
803 ; AVX2-NEXT: vmovq (%rdi), %xmm0
803 ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
804804 ; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
805805 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
806806 ; AVX2-NEXT: retq
814814 ; ALL-LABEL: insert_reg_and_zero_v4f64:
815815 ; ALL: # BB#0:
816816 ; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
817 ; ALL-NEXT: vmovsd %xmm0, %xmm1, %xmm0
817 ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
818818 ; ALL-NEXT: retq
819819 %v = insertelement <4 x double> undef, double %a, i32 0
820820 %shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, <4 x i32>
824824 define <4 x double> @insert_mem_and_zero_v4f64(double* %ptr) {
825825 ; ALL-LABEL: insert_mem_and_zero_v4f64:
826826 ; ALL: # BB#0:
827 ; ALL-NEXT: vmovsd (%rdi), %xmm0
827 ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
828828 ; ALL-NEXT: retq
829829 %a = load double* %ptr
830830 %v = insertelement <4 x double> undef, double %a, i32 0
871871 ret <4 x double> %3
872872 }
873873
874 define <4 x double> @splat_v4f64(<2 x double> %r) {
875 ; AVX1-LABEL: splat_v4f64:
876 ; AVX1: # BB#0:
877 ; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
878 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
879 ; AVX1-NEXT: retq
880 ;
881881 ; AVX2-LABEL: splat_v4f64:
882882 ; AVX2: # BB#0:
883883 ; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
18521852 define <8 x float> @concat_v2f32_1(<2 x float>* %tmp64, <2 x float>* %tmp65) {
18531853 ; ALL-LABEL: concat_v2f32_1:
18541854 ; ALL: # BB#0: # %entry
1855 ; ALL-NEXT: vmovq (%rdi), %xmm0
1855 ; ALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
18561856 ; ALL-NEXT: vmovhpd (%rsi), %xmm0, %xmm0
18571857 ; ALL-NEXT: retq
18581858 entry:
18671867 define <8 x float> @concat_v2f32_2(<2 x float>* %tmp64, <2 x float>* %tmp65) {
18681868 ; ALL-LABEL: concat_v2f32_2:
18691869 ; ALL: # BB#0: # %entry
1870 ; ALL-NEXT: vmovq (%rdi), %xmm0
1870 ; ALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
18711871 ; ALL-NEXT: vmovhpd (%rsi), %xmm0, %xmm0
18721872 ; ALL-NEXT: retq
18731873 entry:
18801880 define <8 x float> @concat_v2f32_3(<2 x float>* %tmp64, <2 x float>* %tmp65) {
18811881 ; ALL-LABEL: concat_v2f32_3:
18821882 ; ALL: # BB#0: # %entry
1883 ; ALL-NEXT: vmovq (%rdi), %xmm0
1883 ; ALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
18841884 ; ALL-NEXT: vmovhpd (%rsi), %xmm0, %xmm0
18851885 ; ALL-NEXT: retq
18861886 entry: