llvm.org GIT mirror llvm / 5bafff3
Add support for ARM's Advanced SIMD (NEON) instruction set. This is still a work in progress, but most of the NEON instruction set is supported. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@73919 91177308-0d34-0410-b5e6-96231b3b80d8 Bob Wilson 10 years ago
102 changed files with 10310 additions and 135 deletions.
115115 def llvm_v2i32_ty : LLVMType<v2i32>; // 2 x i32
116116 def llvm_v1i64_ty : LLVMType<v1i64>; // 1 x i64
117117 def llvm_v4i32_ty : LLVMType<v4i32>; // 4 x i32
118 def llvm_v2f32_ty : LLVMType<v2f32>; // 2 x float
118119 def llvm_v4f32_ty : LLVMType<v4f32>; // 4 x float
119120 def llvm_v2f64_ty : LLVMType<v2f64>; // 2 x double
120121
1818 def int_arm_thread_pointer : GCCBuiltin<"__builtin_thread_pointer">,
1919 Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>;
2020 }
21
22 //===----------------------------------------------------------------------===//
23 // Advanced SIMD (NEON)
24
25 let TargetPrefix = "arm" in { // All intrinsics start with "llvm.arm.".
26
27 // The following classes do not correspond directly to GCC builtins.
28 class Neon_1Arg_Intrinsic
29 : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>;
30 class Neon_1Arg_Float_Intrinsic
31 : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
32 class Neon_1Arg_Narrow_Intrinsic
33 : Intrinsic<[llvm_anyint_ty],
34 [LLVMExtendedElementVectorType<0>], [IntrNoMem]>;
35 class Neon_1Arg_Long_Intrinsic
36 : Intrinsic<[llvm_anyint_ty],
37 [LLVMTruncatedElementVectorType<0>], [IntrNoMem]>;
38 class Neon_2Arg_Intrinsic
39 : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>],
40 [IntrNoMem]>;
41 class Neon_2Arg_Float_Intrinsic
42 : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>],
43 [IntrNoMem]>;
44 class Neon_2Arg_Narrow_Intrinsic
45 : Intrinsic<[llvm_anyint_ty],
46 [LLVMExtendedElementVectorType<0>,
47 LLVMExtendedElementVectorType<0>],
48 [IntrNoMem]>;
49 class Neon_2Arg_Long_Intrinsic
50 : Intrinsic<[llvm_anyint_ty],
51 [LLVMTruncatedElementVectorType<0>,
52 LLVMTruncatedElementVectorType<0>],
53 [IntrNoMem]>;
54 class Neon_2Arg_Wide_Intrinsic
55 : Intrinsic<[llvm_anyint_ty],
56 [LLVMMatchType<0>, LLVMTruncatedElementVectorType<0>],
57 [IntrNoMem]>;
58 class Neon_3Arg_Intrinsic
59 : Intrinsic<[llvm_anyint_ty],
60 [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
61 [IntrNoMem]>;
62 class Neon_3Arg_Long_Intrinsic
63 : Intrinsic<[llvm_anyint_ty],
64 [LLVMMatchType<0>,
65 LLVMTruncatedElementVectorType<0>,
66 LLVMTruncatedElementVectorType<0>],
67 [IntrNoMem]>;
68 class Neon_CvtFxToFP_Intrinsic
69 : Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
70 class Neon_CvtFPToFx_Intrinsic
71 : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, llvm_i32_ty], [IntrNoMem]>;
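// As a concrete reading of these classes (illustrative note, not part of the
// patch): a "Long" intrinsic such as vaddl maps v4i16 x v4i16 to v4i32, a
// "Wide" intrinsic such as vaddw maps v4i32 x v4i16 to v4i32, and a "Narrow"
// intrinsic such as vaddhn maps v4i32 x v4i32 back to v4i16.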
72 }
73
74 // Arithmetic ops
75
76 let Properties = [IntrNoMem, Commutative] in {
77
78 // Vector Add.
79 def int_arm_neon_vhadds : Neon_2Arg_Intrinsic;
80 def int_arm_neon_vhaddu : Neon_2Arg_Intrinsic;
81 def int_arm_neon_vrhadds : Neon_2Arg_Intrinsic;
82 def int_arm_neon_vrhaddu : Neon_2Arg_Intrinsic;
83 def int_arm_neon_vqadds : Neon_2Arg_Intrinsic;
84 def int_arm_neon_vqaddu : Neon_2Arg_Intrinsic;
85 def int_arm_neon_vaddhn : Neon_2Arg_Narrow_Intrinsic;
86 def int_arm_neon_vraddhn : Neon_2Arg_Narrow_Intrinsic;
87 def int_arm_neon_vaddls : Neon_2Arg_Long_Intrinsic;
88 def int_arm_neon_vaddlu : Neon_2Arg_Long_Intrinsic;
89 def int_arm_neon_vaddws : Neon_2Arg_Wide_Intrinsic;
90 def int_arm_neon_vaddwu : Neon_2Arg_Wide_Intrinsic;
91
92 // Vector Multiply.
93 def int_arm_neon_vmulp : Neon_2Arg_Intrinsic;
94 def int_arm_neon_vqdmulh : Neon_2Arg_Intrinsic;
95 def int_arm_neon_vqrdmulh : Neon_2Arg_Intrinsic;
96 def int_arm_neon_vmulls : Neon_2Arg_Long_Intrinsic;
97 def int_arm_neon_vmullu : Neon_2Arg_Long_Intrinsic;
98 def int_arm_neon_vmullp : Neon_2Arg_Long_Intrinsic;
99 def int_arm_neon_vqdmull : Neon_2Arg_Long_Intrinsic;
100
101 // Vector Multiply and Accumulate/Subtract.
102 def int_arm_neon_vmlals : Neon_3Arg_Long_Intrinsic;
103 def int_arm_neon_vmlalu : Neon_3Arg_Long_Intrinsic;
104 def int_arm_neon_vmlsls : Neon_3Arg_Long_Intrinsic;
105 def int_arm_neon_vmlslu : Neon_3Arg_Long_Intrinsic;
106 def int_arm_neon_vqdmlal : Neon_3Arg_Long_Intrinsic;
107 def int_arm_neon_vqdmlsl : Neon_3Arg_Long_Intrinsic;
108
109 // Vector Maximum.
110 def int_arm_neon_vmaxs : Neon_2Arg_Intrinsic;
111 def int_arm_neon_vmaxu : Neon_2Arg_Intrinsic;
112 def int_arm_neon_vmaxf : Neon_2Arg_Float_Intrinsic;
113
114 // Vector Minimum.
115 def int_arm_neon_vmins : Neon_2Arg_Intrinsic;
116 def int_arm_neon_vminu : Neon_2Arg_Intrinsic;
117 def int_arm_neon_vminf : Neon_2Arg_Float_Intrinsic;
118
119 // Vector Reciprocal Step.
120 def int_arm_neon_vrecps : Neon_2Arg_Float_Intrinsic;
121
122 // Vector Reciprocal Square Root Step.
123 def int_arm_neon_vrsqrts : Neon_2Arg_Float_Intrinsic;
124 }
125
126 // Vector Subtract.
127 def int_arm_neon_vhsubs : Neon_2Arg_Intrinsic;
128 def int_arm_neon_vhsubu : Neon_2Arg_Intrinsic;
129 def int_arm_neon_vqsubs : Neon_2Arg_Intrinsic;
130 def int_arm_neon_vqsubu : Neon_2Arg_Intrinsic;
131 def int_arm_neon_vsubhn : Neon_2Arg_Narrow_Intrinsic;
132 def int_arm_neon_vrsubhn : Neon_2Arg_Narrow_Intrinsic;
133 def int_arm_neon_vsubls : Neon_2Arg_Long_Intrinsic;
134 def int_arm_neon_vsublu : Neon_2Arg_Long_Intrinsic;
135 def int_arm_neon_vsubws : Neon_2Arg_Wide_Intrinsic;
136 def int_arm_neon_vsubwu : Neon_2Arg_Wide_Intrinsic;
137
138 // Vector Absolute Compare.
139 let TargetPrefix = "arm" in {
140 def int_arm_neon_vacged : Intrinsic<[llvm_v2i32_ty],
141 [llvm_v2f32_ty, llvm_v2f32_ty],
142 [IntrNoMem]>;
143 def int_arm_neon_vacgeq : Intrinsic<[llvm_v4i32_ty],
144 [llvm_v4f32_ty, llvm_v4f32_ty],
145 [IntrNoMem]>;
146 def int_arm_neon_vacgtd : Intrinsic<[llvm_v2i32_ty],
147 [llvm_v2f32_ty, llvm_v2f32_ty],
148 [IntrNoMem]>;
149 def int_arm_neon_vacgtq : Intrinsic<[llvm_v4i32_ty],
150 [llvm_v4f32_ty, llvm_v4f32_ty],
151 [IntrNoMem]>;
152 }
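The absolute-compare intrinsics return integer vectors because the hardware produces a per-lane mask. A minimal scalar sketch of one VACGE lane (invented helper name, not code from this patch):

#include <cmath>
#include <cstdint>

// One lane of VACGE: compare absolute values and yield an all-ones or
// all-zeros 32-bit mask, matching the v2i32/v4i32 result types above.
static uint32_t vacge_lane(float A, float B) {
  return std::fabs(A) >= std::fabs(B) ? 0xffffffffu : 0u;
}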
153
154 // Vector Absolute Differences.
155 def int_arm_neon_vabds : Neon_2Arg_Intrinsic;
156 def int_arm_neon_vabdu : Neon_2Arg_Intrinsic;
157 def int_arm_neon_vabdf : Neon_2Arg_Float_Intrinsic;
158 def int_arm_neon_vabdls : Neon_2Arg_Long_Intrinsic;
159 def int_arm_neon_vabdlu : Neon_2Arg_Long_Intrinsic;
160
161 // Vector Absolute Difference and Accumulate.
162 def int_arm_neon_vabas : Neon_3Arg_Intrinsic;
163 def int_arm_neon_vabau : Neon_3Arg_Intrinsic;
164 def int_arm_neon_vabals : Neon_3Arg_Long_Intrinsic;
165 def int_arm_neon_vabalu : Neon_3Arg_Long_Intrinsic;
166
167 // Vector Pairwise Add.
168 def int_arm_neon_vpaddi : Neon_2Arg_Intrinsic;
169 def int_arm_neon_vpaddf : Neon_2Arg_Float_Intrinsic;
170
171 // Vector Pairwise Add Long.
172 // Note: This is different from the other "long" NEON intrinsics because
173 // the result vector has half as many elements as the source vector.
174 // The source and destination vector types must be specified separately.
175 let TargetPrefix = "arm" in {
176 def int_arm_neon_vpaddls : Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty],
177 [IntrNoMem]>;
178 def int_arm_neon_vpaddlu : Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty],
179 [IntrNoMem]>;
180 }
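As the note above says, pairwise add-long halves the element count while doubling the element width. A minimal scalar model of VPADDL.S8 on one D register (invented helper name, not code from this patch):

#include <array>
#include <cstdint>

// <8 x i8> -> <4 x i16>: adjacent pairs are summed, so the result has half
// as many elements as the source vector.
static std::array<int16_t, 4> vpaddls_v8i8(const std::array<int8_t, 8> &A) {
  std::array<int16_t, 4> R{};
  for (int i = 0; i < 4; ++i)
    R[i] = int16_t(A[2 * i]) + int16_t(A[2 * i + 1]);
  return R;
}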
181
182 // Vector Pairwise Add and Accumulate Long.
183 // Note: This is similar to vpaddl but the destination vector also appears
184 // as the first argument.
185 let TargetPrefix = "arm" in {
186 def int_arm_neon_vpadals : Intrinsic<[llvm_anyint_ty],
187 [LLVMMatchType<0>, llvm_anyint_ty],
188 [IntrNoMem]>;
189 def int_arm_neon_vpadalu : Intrinsic<[llvm_anyint_ty],
190 [LLVMMatchType<0>, llvm_anyint_ty],
191 [IntrNoMem]>;
192 }
193
194 // Vector Pairwise Maximum and Minimum.
195 def int_arm_neon_vpmaxs : Neon_2Arg_Intrinsic;
196 def int_arm_neon_vpmaxu : Neon_2Arg_Intrinsic;
197 def int_arm_neon_vpmaxf : Neon_2Arg_Float_Intrinsic;
198 def int_arm_neon_vpmins : Neon_2Arg_Intrinsic;
199 def int_arm_neon_vpminu : Neon_2Arg_Intrinsic;
200 def int_arm_neon_vpminf : Neon_2Arg_Float_Intrinsic;
201
202 // Vector Shifts:
203 //
204 // The various saturating and rounding vector shift operations need to be
205 // represented by intrinsics in LLVM, and even the basic VSHL variable shift
206 // operation cannot be safely translated to LLVM's shift operators. VSHL can
207 // be used for both left and right shifts, or even combinations of the two,
208 // depending on the signs of the shift amounts. It also has well-defined
209 // behavior for shift amounts that LLVM leaves undefined. Only basic shifts
210 // by constants can be represented with LLVM's shift operators.
211 //
212 // The shift counts for these intrinsics are always vectors, even for constant
213 // shifts, where the constant is replicated. For consistency with VSHL (and
214 // other variable shift instructions), left shifts have positive shift counts
215 // and right shifts have negative shift counts. This convention is also used
216 // for constant right shift intrinsics, and to help preserve sanity, the
217 // intrinsic names use "shift" instead of either "shl" or "shr". Where
218 // applicable, signed and unsigned versions of the intrinsics are
219 // distinguished with "s" and "u" suffixes. A few NEON shift instructions,
220 // such as VQSHLU, take signed operands but produce unsigned results; these
221 // use a "su" suffix.
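A minimal scalar sketch of the sign convention described above, covering only in-range counts (invented helper name, not code from this patch); the hardware also defines out-of-range counts, which LLVM's shift operators do not:

#include <cstdint>

// One unsigned 32-bit lane of a NEON variable shift: the per-lane count is
// signed, so a left shift by 3 uses a count of 3 and a right shift by 3
// uses a count of -3.
static uint32_t neon_vshl_lane_u32(uint32_t Lane, int8_t Count) {
  return Count >= 0 ? (Lane << Count) : (Lane >> -Count);
}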
222
223 // Vector Shift.
224 def int_arm_neon_vshifts : Neon_2Arg_Intrinsic;
225 def int_arm_neon_vshiftu : Neon_2Arg_Intrinsic;
226 def int_arm_neon_vshiftls : Neon_2Arg_Long_Intrinsic;
227 def int_arm_neon_vshiftlu : Neon_2Arg_Long_Intrinsic;
228 def int_arm_neon_vshiftn : Neon_2Arg_Narrow_Intrinsic;
229
230 // Vector Rounding Shift.
231 def int_arm_neon_vrshifts : Neon_2Arg_Intrinsic;
232 def int_arm_neon_vrshiftu : Neon_2Arg_Intrinsic;
233 def int_arm_neon_vrshiftn : Neon_2Arg_Narrow_Intrinsic;
234
235 // Vector Saturating Shift.
236 def int_arm_neon_vqshifts : Neon_2Arg_Intrinsic;
237 def int_arm_neon_vqshiftu : Neon_2Arg_Intrinsic;
238 def int_arm_neon_vqshiftsu : Neon_2Arg_Intrinsic;
239 def int_arm_neon_vqshiftns : Neon_2Arg_Narrow_Intrinsic;
240 def int_arm_neon_vqshiftnu : Neon_2Arg_Narrow_Intrinsic;
241 def int_arm_neon_vqshiftnsu : Neon_2Arg_Narrow_Intrinsic;
242
243 // Vector Saturating Rounding Shift.
244 def int_arm_neon_vqrshifts : Neon_2Arg_Intrinsic;
245 def int_arm_neon_vqrshiftu : Neon_2Arg_Intrinsic;
246 def int_arm_neon_vqrshiftns : Neon_2Arg_Narrow_Intrinsic;
247 def int_arm_neon_vqrshiftnu : Neon_2Arg_Narrow_Intrinsic;
248 def int_arm_neon_vqrshiftnsu : Neon_2Arg_Narrow_Intrinsic;
249
250 // Vector Shift and Insert.
251 def int_arm_neon_vshiftins : Neon_3Arg_Intrinsic;
252
253 // Vector Absolute Value and Saturating Absolute Value.
254 def int_arm_neon_vabs : Neon_1Arg_Intrinsic;
255 def int_arm_neon_vabsf : Neon_1Arg_Float_Intrinsic;
256 def int_arm_neon_vqabs : Neon_1Arg_Intrinsic;
257
258 // Vector Saturating Negate.
259 def int_arm_neon_vqneg : Neon_1Arg_Intrinsic;
260
261 // Vector Count Leading Sign/Zero Bits.
262 def int_arm_neon_vcls : Neon_1Arg_Intrinsic;
263 def int_arm_neon_vclz : Neon_1Arg_Intrinsic;
264
265 // Vector Count One Bits.
266 def int_arm_neon_vcnt : Neon_1Arg_Intrinsic;
267
268 // Vector Reciprocal Estimate.
269 def int_arm_neon_vrecpe : Neon_1Arg_Intrinsic;
270 def int_arm_neon_vrecpef : Neon_1Arg_Float_Intrinsic;
271
272 // Vector Reciprocal Square Root Estimate.
273 def int_arm_neon_vrsqrte : Neon_1Arg_Intrinsic;
274 def int_arm_neon_vrsqrtef : Neon_1Arg_Float_Intrinsic;
275
276 // Vector Conversions Between Floating-point and Fixed-point.
277 def int_arm_neon_vcvtfp2fxs : Neon_CvtFPToFx_Intrinsic;
278 def int_arm_neon_vcvtfp2fxu : Neon_CvtFPToFx_Intrinsic;
279 def int_arm_neon_vcvtfxs2fp : Neon_CvtFxToFP_Intrinsic;
280 def int_arm_neon_vcvtfxu2fp : Neon_CvtFxToFP_Intrinsic;
281
282 // Narrowing and Lengthening Vector Moves.
283 def int_arm_neon_vmovn : Neon_1Arg_Narrow_Intrinsic;
284 def int_arm_neon_vqmovns : Neon_1Arg_Narrow_Intrinsic;
285 def int_arm_neon_vqmovnu : Neon_1Arg_Narrow_Intrinsic;
286 def int_arm_neon_vqmovnsu : Neon_1Arg_Narrow_Intrinsic;
287 def int_arm_neon_vmovls : Neon_1Arg_Long_Intrinsic;
288 def int_arm_neon_vmovlu : Neon_1Arg_Long_Intrinsic;
289
290 let TargetPrefix = "arm" in {
291
292 // De-interleaving vector loads from N-element structures.
293 def int_arm_neon_vld3i : Intrinsic<[llvm_anyint_ty],
294 [llvm_ptr_ty], [IntrReadArgMem]>;
295 def int_arm_neon_vld3f : Intrinsic<[llvm_anyfloat_ty],
296 [llvm_ptr_ty], [IntrReadArgMem]>;
297 def int_arm_neon_vld4i : Intrinsic<[llvm_anyint_ty],
298 [llvm_ptr_ty], [IntrReadArgMem]>;
299 def int_arm_neon_vld4f : Intrinsic<[llvm_anyfloat_ty],
300 [llvm_ptr_ty], [IntrReadArgMem]>;
301
302 // Interleaving vector stores from N-element structures.
303 def int_arm_neon_vst3i : Intrinsic<[llvm_void_ty],
304 [llvm_anyint_ty, llvm_ptr_ty],
305 [IntrWriteArgMem]>;
306 def int_arm_neon_vst3f : Intrinsic<[llvm_void_ty],
307 [llvm_anyfloat_ty, llvm_ptr_ty],
308 [IntrWriteArgMem]>;
309 def int_arm_neon_vst4i : Intrinsic<[llvm_void_ty],
310 [llvm_anyint_ty, llvm_ptr_ty],
311 [IntrWriteArgMem]>;
312 def int_arm_neon_vst4f : Intrinsic<[llvm_void_ty],
313 [llvm_anyfloat_ty, llvm_ptr_ty],
314 [IntrWriteArgMem]>;
315 }
2323
2424 CCIfType<[i8, i16], CCPromoteToType<i32>>,
2525
26 // f64 is passed in pairs of GPRs, possibly split onto the stack
27 CCIfType<[f64], CCCustom<"CC_ARM_APCS_Custom_f64">>,
26 // Handle all vector types as either f64 or v2f64.
27 CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
28 CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
29
30 // f64 and v2f64 are passed in adjacent GPRs, possibly split onto the stack
31 CCIfType<[f64, v2f64], CCCustom<"CC_ARM_APCS_Custom_f64">>,
2832
2933 CCIfType<[f32], CCBitConvertToType<i32>>,
3034 CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
3135
3236 CCIfType<[i32], CCAssignToStack<4, 4>>,
33 CCIfType<[f64], CCAssignToStack<8, 4>>
37 CCIfType<[f64], CCAssignToStack<8, 4>>,
38 CCIfType<[v2f64], CCAssignToStack<16, 4>>
3439 ]>;
3540
3641 def RetCC_ARM_APCS : CallingConv<[
3742 CCIfType<[f32], CCBitConvertToType<i32>>,
38 CCIfType<[f64], CCCustom<"RetCC_ARM_APCS_Custom_f64">>,
43
44 // Handle all vector types as either f64 or v2f64.
45 CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
46 CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
47
48 CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_APCS_Custom_f64">>,
3949
4050 CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
4151 CCIfType<[i64], CCAssignToRegWithShadow<[R0, R2], [R1, R3]>>
5868 CCAssignToReg<[R0, R1, R2, R3]>>>,
5969
6070 CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
61 CCIfType<[f64], CCAssignToStack<8, 8>>
71 CCIfType<[f64], CCAssignToStack<8, 8>>,
72 CCIfType<[v2f64], CCAssignToStack<16, 8>>
6273 ]>;
6374
6475 def RetCC_ARM_AAPCS_Common : CallingConv<[
7182 //===----------------------------------------------------------------------===//
7283
7384 def CC_ARM_AAPCS : CallingConv<[
74 CCIfType<[f64], CCCustom<"CC_ARM_AAPCS_Custom_f64">>,
85 // Handle all vector types as either f64 or v2f64.
86 CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
87 CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
88
89 CCIfType<[f64, v2f64], CCCustom<"CC_ARM_AAPCS_Custom_f64">>,
7590 CCIfType<[f32], CCBitConvertToType<i32>>,
7691 CCDelegateTo<CC_ARM_AAPCS_Common>
7792 ]>;
7893
7994 def RetCC_ARM_AAPCS : CallingConv<[
80 CCIfType<[f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>,
95 // Handle all vector types as either f64 or v2f64.
96 CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
97 CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
98
99 CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>,
81100 CCIfType<[f32], CCBitConvertToType<i32>>,
82101 CCDelegateTo<RetCC_ARM_AAPCS_Common>
83102 ]>;
87106 //===----------------------------------------------------------------------===//
88107
89108 def CC_ARM_AAPCS_VFP : CallingConv<[
109 // Handle all vector types as either f64 or v2f64.
110 CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
111 CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
112
90113 CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
91114 CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
92115 S9, S10, S11, S12, S13, S14, S15]>>,
94117 ]>;
95118
96119 def RetCC_ARM_AAPCS_VFP : CallingConv<[
120 // Handle all vector types as either f64 or v2f64.
121 CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
122 CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
123
97124 CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
98125 CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
99126 S9, S10, S11, S12, S13, S14, S15]>>,
3030 #include "llvm/Support/Compiler.h"
3131 #include "llvm/Support/Debug.h"
3232 using namespace llvm;
33
34 static const unsigned arm_dsubreg_0 = 5;
35 static const unsigned arm_dsubreg_1 = 6;
3336
3437 //===--------------------------------------------------------------------===//
3538 /// ARMDAGToDAGISel - ARM specific code to select ARM machine
917920 return CurDAG->getTargetNode(TargetInstrInfo::DECLARE, dl,
918921 MVT::Other, Ops, 3);
919922 }
923
924 case ISD::CONCAT_VECTORS: {
925 MVT VT = Op.getValueType();
926 assert(VT.is128BitVector() && Op.getNumOperands() == 2 &&
927 "unexpected CONCAT_VECTORS");
928 SDValue N0 = Op.getOperand(0);
929 SDValue N1 = Op.getOperand(1);
930 SDNode *Result =
931 CurDAG->getTargetNode(TargetInstrInfo::IMPLICIT_DEF, dl, VT);
932 if (N0.getOpcode() != ISD::UNDEF)
933 Result = CurDAG->getTargetNode(TargetInstrInfo::INSERT_SUBREG, dl, VT,
934 SDValue(Result, 0), N0,
935 CurDAG->getTargetConstant(arm_dsubreg_0,
936 MVT::i32));
937 if (N1.getOpcode() != ISD::UNDEF)
938 Result = CurDAG->getTargetNode(TargetInstrInfo::INSERT_SUBREG, dl, VT,
939 SDValue(Result, 0), N1,
940 CurDAG->getTargetConstant(arm_dsubreg_1,
941 MVT::i32));
942 return Result;
943 }
944
945 case ISD::VECTOR_SHUFFLE: {
946 MVT VT = Op.getValueType();
947
948 // Match 128-bit splat to VDUPLANEQ. (This could be done with a Pat in
949 // ARMInstrNEON.td but it is awkward because the shuffle mask needs to be
950 // transformed first into a lane number and then to both a subregister
951 // index and an adjusted lane number.) If the source operand is a
952 // SCALAR_TO_VECTOR, leave it so it will be matched later as a VDUP.
953 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
954 if (VT.is128BitVector() && SVOp->isSplat() &&
955 Op.getOperand(0).getOpcode() != ISD::SCALAR_TO_VECTOR &&
956 Op.getOperand(1).getOpcode() == ISD::UNDEF) {
957 unsigned LaneVal = SVOp->getSplatIndex();
958
959 MVT HalfVT;
960 unsigned Opc = 0;
961 switch (VT.getVectorElementType().getSimpleVT()) {
962 default: assert(false && "unhandled VDUP splat type");
963 case MVT::i8: Opc = ARM::VDUPLN8q; HalfVT = MVT::v8i8; break;
964 case MVT::i16: Opc = ARM::VDUPLN16q; HalfVT = MVT::v4i16; break;
965 case MVT::i32: Opc = ARM::VDUPLN32q; HalfVT = MVT::v2i32; break;
966 case MVT::f32: Opc = ARM::VDUPLNfq; HalfVT = MVT::v2f32; break;
967 }
968
969 // The source operand needs to be changed to a subreg of the original
970 // 128-bit operand, and the lane number needs to be adjusted accordingly.
971 unsigned NumElts = VT.getVectorNumElements() / 2;
972 unsigned SRVal = (LaneVal < NumElts ? arm_dsubreg_0 : arm_dsubreg_1);
973 SDValue SR = CurDAG->getTargetConstant(SRVal, MVT::i32);
974 SDValue NewLane = CurDAG->getTargetConstant(LaneVal % NumElts, MVT::i32);
975 SDNode *SubReg = CurDAG->getTargetNode(TargetInstrInfo::EXTRACT_SUBREG,
976 dl, HalfVT, N->getOperand(0), SR);
977 return CurDAG->SelectNodeTo(N, Opc, VT, SDValue(SubReg, 0), NewLane);
978 }
979
980 break;
981 }
920982 }
921983
922984 return SelectCode(Op);
5555 ISD::ArgFlagsTy &ArgFlags,
5656 CCState &State);
5757
58 void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
59 MVT PromotedBitwiseVT) {
60 if (VT != PromotedLdStVT) {
61 setOperationAction(ISD::LOAD, VT, Promote);
62 AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
63
64 setOperationAction(ISD::STORE, VT, Promote);
65 AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
66 }
67
68 MVT ElemTy = VT.getVectorElementType();
69 if (ElemTy != MVT::i64 && ElemTy != MVT::f64)
70 setOperationAction(ISD::VSETCC, VT, Custom);
71 if (ElemTy == MVT::i8 || ElemTy == MVT::i16)
72 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
73 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
74 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
75 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
76 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
77 if (VT.isInteger()) {
78 setOperationAction(ISD::SHL, VT, Custom);
79 setOperationAction(ISD::SRA, VT, Custom);
80 setOperationAction(ISD::SRL, VT, Custom);
81 }
82
83 // Promote all bit-wise operations.
84 if (VT.isInteger() && VT != PromotedBitwiseVT) {
85 setOperationAction(ISD::AND, VT, Promote);
86 AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT);
87 setOperationAction(ISD::OR, VT, Promote);
88 AddPromotedToType (ISD::OR, VT, PromotedBitwiseVT);
89 setOperationAction(ISD::XOR, VT, Promote);
90 AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT);
91 }
92 }
93
94 void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
95 addRegisterClass(VT, ARM::DPRRegisterClass);
96 addTypeForNEON(VT, MVT::f64, MVT::v2i32);
97 }
98
99 void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
100 addRegisterClass(VT, ARM::QPRRegisterClass);
101 addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
102 }
103
58104 ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
59105 : TargetLowering(TM), ARMPCLabelIndex(0) {
60106 Subtarget = &TM.getSubtarget<ARMSubtarget>();
151197
152198 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
153199 }
200
201 if (Subtarget->hasNEON()) {
202 addDRTypeForNEON(MVT::v2f32);
203 addDRTypeForNEON(MVT::v8i8);
204 addDRTypeForNEON(MVT::v4i16);
205 addDRTypeForNEON(MVT::v2i32);
206 addDRTypeForNEON(MVT::v1i64);
207
208 addQRTypeForNEON(MVT::v4f32);
209 addQRTypeForNEON(MVT::v2f64);
210 addQRTypeForNEON(MVT::v16i8);
211 addQRTypeForNEON(MVT::v8i16);
212 addQRTypeForNEON(MVT::v4i32);
213 addQRTypeForNEON(MVT::v2i64);
214
215 setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
216 setTargetDAGCombine(ISD::SHL);
217 setTargetDAGCombine(ISD::SRL);
218 setTargetDAGCombine(ISD::SRA);
219 setTargetDAGCombine(ISD::SIGN_EXTEND);
220 setTargetDAGCombine(ISD::ZERO_EXTEND);
221 setTargetDAGCombine(ISD::ANY_EXTEND);
222 }
223
154224 computeRegisterProperties();
155225
156226 // ARM does not have f32 extending load.
351421 case ARMISD::FMDRR: return "ARMISD::FMDRR";
352422
353423 case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER";
424
425 case ARMISD::VCEQ: return "ARMISD::VCEQ";
426 case ARMISD::VCGE: return "ARMISD::VCGE";
427 case ARMISD::VCGEU: return "ARMISD::VCGEU";
428 case ARMISD::VCGT: return "ARMISD::VCGT";
429 case ARMISD::VCGTU: return "ARMISD::VCGTU";
430 case ARMISD::VTST: return "ARMISD::VTST";
431
432 case ARMISD::VSHL: return "ARMISD::VSHL";
433 case ARMISD::VSHRs: return "ARMISD::VSHRs";
434 case ARMISD::VSHRu: return "ARMISD::VSHRu";
435 case ARMISD::VSHLLs: return "ARMISD::VSHLLs";
436 case ARMISD::VSHLLu: return "ARMISD::VSHLLu";
437 case ARMISD::VSHLLi: return "ARMISD::VSHLLi";
438 case ARMISD::VSHRN: return "ARMISD::VSHRN";
439 case ARMISD::VRSHRs: return "ARMISD::VRSHRs";
440 case ARMISD::VRSHRu: return "ARMISD::VRSHRu";
441 case ARMISD::VRSHRN: return "ARMISD::VRSHRN";
442 case ARMISD::VQSHLs: return "ARMISD::VQSHLs";
443 case ARMISD::VQSHLu: return "ARMISD::VQSHLu";
444 case ARMISD::VQSHLsu: return "ARMISD::VQSHLsu";
445 case ARMISD::VQSHRNs: return "ARMISD::VQSHRNs";
446 case ARMISD::VQSHRNu: return "ARMISD::VQSHRNu";
447 case ARMISD::VQSHRNsu: return "ARMISD::VQSHRNsu";
448 case ARMISD::VQRSHRNs: return "ARMISD::VQRSHRNs";
449 case ARMISD::VQRSHRNu: return "ARMISD::VQRSHRNu";
450 case ARMISD::VQRSHRNsu: return "ARMISD::VQRSHRNsu";
451 case ARMISD::VGETLANEu: return "ARMISD::VGETLANEu";
452 case ARMISD::VGETLANEs: return "ARMISD::VGETLANEs";
453 case ARMISD::VDUPLANEQ: return "ARMISD::VDUPLANEQ";
354454 }
355455 }
356456
422522 #include "ARMGenCallingConv.inc"
423523
424524 // APCS f64 is in register pairs, possibly split to stack
525 static bool f64AssignAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
526 CCValAssign::LocInfo &LocInfo,
527 CCState &State, bool CanFail) {
528 static const unsigned RegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 };
529
530 // Try to get the first register.
531 if (unsigned Reg = State.AllocateReg(RegList, 4))
532 State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
533 else {
534 // For the 2nd half of a v2f64, do not fail.
535 if (CanFail)
536 return false;
537
538 // Put the whole thing on the stack.
539 State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
540 State.AllocateStack(8, 4),
541 LocVT, LocInfo));
542 return true;
543 }
544
545 // Try to get the second register.
546 if (unsigned Reg = State.AllocateReg(RegList, 4))
547 State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
548 else
549 State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
550 State.AllocateStack(4, 4),
551 LocVT, LocInfo));
552 return true;
553 }
554
425555 static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
426556 CCValAssign::LocInfo &LocInfo,
427557 ISD::ArgFlagsTy &ArgFlags,
428558 CCState &State) {
429 static const unsigned HiRegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 };
430 static const unsigned LoRegList[] = { ARM::R1,
431 ARM::R2,
432 ARM::R3,
433 ARM::NoRegister };
434
435 unsigned Reg = State.AllocateReg(HiRegList, LoRegList, 4);
436 if (Reg == 0)
437 return false; // we didn't handle it
559 if (!f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, true))
560 return false;
561 if (LocVT == MVT::v2f64 &&
562 !f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, false))
563 return false;
564 return true; // we handled it
565 }
566
567 // AAPCS f64 is in aligned register pairs
568 static bool f64AssignAAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
569 CCValAssign::LocInfo &LocInfo,
570 CCState &State, bool CanFail) {
571 static const unsigned HiRegList[] = { ARM::R0, ARM::R2 };
572 static const unsigned LoRegList[] = { ARM::R1, ARM::R3 };
573
574 unsigned Reg = State.AllocateReg(HiRegList, LoRegList, 2);
575 if (Reg == 0) {
576 // For the 2nd half of a v2f64, do not just fail.
577 if (CanFail)
578 return false;
579
580 // Put the whole thing on the stack.
581 State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
582 State.AllocateStack(8, 8),
583 LocVT, LocInfo));
584 return true;
585 }
438586
439587 unsigned i;
440 for (i = 0; i < 4; ++i)
588 for (i = 0; i < 2; ++i)
441589 if (HiRegList[i] == Reg)
442590 break;
443591
444 State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, MVT::i32, LocInfo));
445 if (LoRegList[i] != ARM::NoRegister)
446 State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i],
447 MVT::i32, LocInfo));
448 else
449 State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
450 State.AllocateStack(4, 4),
451 MVT::i32, LocInfo));
452 return true; // we handled it
453 }
454
455 // AAPCS f64 is in aligned register pairs
592 State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
593 State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i],
594 LocVT, LocInfo));
595 return true;
596 }
597
456598 static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
457599 CCValAssign::LocInfo &LocInfo,
458600 ISD::ArgFlagsTy &ArgFlags,
459601 CCState &State) {
602 if (!f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, true))
603 return false;
604 if (LocVT == MVT::v2f64 &&
605 !f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, false))
606 return false;
607 return true; // we handled it
608 }
609
610 static bool f64RetAssign(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
611 CCValAssign::LocInfo &LocInfo, CCState &State) {
460612 static const unsigned HiRegList[] = { ARM::R0, ARM::R2 };
461613 static const unsigned LoRegList[] = { ARM::R1, ARM::R3 };
462614
469621 if (HiRegList[i] == Reg)
470622 break;
471623
472 State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, MVT::i32, LocInfo));
624 State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
473625 State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i],
474 MVT::i32, LocInfo));
475 return true; // we handled it
626 LocVT, LocInfo));
627 return true;
476628 }
477629
478630 static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
479631 CCValAssign::LocInfo &LocInfo,
480632 ISD::ArgFlagsTy &ArgFlags,
481633 CCState &State) {
482 static const unsigned HiRegList[] = { ARM::R0, ARM::R2 };
483 static const unsigned LoRegList[] = { ARM::R1, ARM::R3 };
484
485 unsigned Reg = State.AllocateReg(HiRegList, LoRegList, 2);
486 if (Reg == 0)
487 return false; // we didn't handle it
488
489 unsigned i;
490 for (i = 0; i < 2; ++i)
491 if (HiRegList[i] == Reg)
492 break;
493
494 State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, MVT::i32, LocInfo));
495 State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i],
496 MVT::i32, LocInfo));
634 if (!f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State))
635 return false;
636 if (LocVT == MVT::v2f64 && !f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State))
637 return false;
497638 return true; // we handled it
498639 }
499640
557698
558699 SDValue Val;
559700 if (VA.needsCustom()) {
560 // Handle f64 as custom.
701 // Handle f64 or half of a v2f64.
561702 SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
562703 InFlag);
563704 Chain = Lo.getValue(1);
568709 Chain = Hi.getValue(1);
569710 InFlag = Hi.getValue(2);
570711 Val = DAG.getNode(ARMISD::FMDRR, dl, MVT::f64, Lo, Hi);
712
713 if (VA.getLocVT() == MVT::v2f64) {
714 SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
715 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
716 DAG.getConstant(0, MVT::i32));
717
718 VA = RVLocs[++i]; // skip ahead to next loc
719 Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
720 Chain = Lo.getValue(1);
721 InFlag = Lo.getValue(2);
722 VA = RVLocs[++i]; // skip ahead to next loc
723 Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
724 Chain = Hi.getValue(1);
725 InFlag = Hi.getValue(2);
726 Val = DAG.getNode(ARMISD::FMDRR, dl, MVT::f64, Lo, Hi);
727 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
728 DAG.getConstant(1, MVT::i32));
729 }
571730 } else {
572731 Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
573732 InFlag);
624783 PseudoSourceValue::getStack(), LocMemOffset);
625784 }
626785
786 void ARMTargetLowering::PassF64ArgInRegs(CallSDNode *TheCall, SelectionDAG &DAG,
787 SDValue Chain, SDValue &Arg,
788 RegsToPassVector &RegsToPass,
789 CCValAssign &VA, CCValAssign &NextVA,
790 SDValue &StackPtr,
791 SmallVector<SDValue, 8> &MemOpChains,
792 ISD::ArgFlagsTy Flags) {
793 DebugLoc dl = TheCall->getDebugLoc();
794
795 SDValue fmrrd = DAG.getNode(ARMISD::FMRRD, dl,
796 DAG.getVTList(MVT::i32, MVT::i32), Arg);
797 RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd));
798
799 if (NextVA.isRegLoc())
800 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1)));
801 else {
802 assert(NextVA.isMemLoc());
803 if (StackPtr.getNode() == 0)
804 StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy());
805
806 MemOpChains.push_back(LowerMemOpCallTo(TheCall, DAG, StackPtr, NextVA,
807 Chain, fmrrd.getValue(1), Flags));
808 }
809 }
810
627811 /// LowerCALL - Lowering a ISD::CALL node into a callseq_start <-
628812 /// ARMISD:CALL <- callseq_end chain. Also add input and output parameter
629813 /// nodes.
650834
651835 SDValue StackPtr = DAG.getRegister(ARM::SP, MVT::i32);
652836
653 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
837 RegsToPassVector RegsToPass;
654838 SmallVector<SDValue, 8> MemOpChains;
655839
656840 // Walk the register/memloc assignments, inserting copies/loads. In the case
680864 break;
681865 }
682866
683 // f64 is passed in i32 pairs and must be combined
867 // f64 and v2f64 are passed in i32 pairs and must be split into pieces
684868 if (VA.needsCustom()) {
685 SDValue fmrrd = DAG.getNode(ARMISD::FMRRD, dl,
686 DAG.getVTList(MVT::i32, MVT::i32), &Arg, 1);
687 RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd));
688 VA = ArgLocs[++i]; // skip ahead to next loc
689 if (VA.isRegLoc())
690 RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(1)));
691 else {
692 assert(VA.isMemLoc());
693 if (StackPtr.getNode() == 0)
694 StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy());
695
696 MemOpChains.push_back(LowerMemOpCallTo(TheCall, DAG, StackPtr, VA,
697 Chain, fmrrd.getValue(1),
698 Flags));
869 if (VA.getLocVT() == MVT::v2f64) {
870 SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
871 DAG.getConstant(0, MVT::i32));
872 SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
873 DAG.getConstant(1, MVT::i32));
874
875 PassF64ArgInRegs(TheCall, DAG, Chain, Op0, RegsToPass,
876 VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
877
878 VA = ArgLocs[++i]; // skip ahead to next loc
879 if (VA.isRegLoc()) {
880 PassF64ArgInRegs(TheCall, DAG, Chain, Op1, RegsToPass,
881 VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
882 } else {
883 assert(VA.isMemLoc());
884 if (StackPtr.getNode() == 0)
885 StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy());
886
887 MemOpChains.push_back(LowerMemOpCallTo(TheCall, DAG, StackPtr, VA,
888 Chain, Op1, Flags));
889 }
890 } else {
891 PassF64ArgInRegs(TheCall, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
892 StackPtr, MemOpChains, Flags);
699893 }
700894 } else if (VA.isRegLoc()) {
701895 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
8631057 break;
8641058 }
8651059
866 // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is
867 // available.
8681060 if (VA.needsCustom()) {
1061 if (VA.getLocVT() == MVT::v2f64) {
1062 // Extract the first half and return it in two registers.
1063 SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
1064 DAG.getConstant(0, MVT::i32));
1065 SDValue HalfGPRs = DAG.getNode(ARMISD::FMRRD, dl,
1066 DAG.getVTList(MVT::i32, MVT::i32), Half);
1067
1068 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), HalfGPRs, Flag);
1069 Flag = Chain.getValue(1);
1070 VA = RVLocs[++i]; // skip ahead to next loc
1071 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
1072 HalfGPRs.getValue(1), Flag);
1073 Flag = Chain.getValue(1);
1074 VA = RVLocs[++i]; // skip ahead to next loc
1075
1076 // Extract the 2nd half and fall through to handle it as an f64 value.
1077 Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
1078 DAG.getConstant(1, MVT::i32));
1079 }
1080 // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is
1081 // available.
8691082 SDValue fmrrd = DAG.getNode(ARMISD::FMRRD, dl,
8701083 DAG.getVTList(MVT::i32, MVT::i32), &Arg, 1);
8711084 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), fmrrd, Flag);
11161329 }
11171330
11181331 SDValue
1332 ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
1333 SDValue &Root, SelectionDAG &DAG,
1334 DebugLoc dl) {
1335 MachineFunction &MF = DAG.getMachineFunction();
1336 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
1337
1338 TargetRegisterClass *RC;
1339 if (AFI->isThumbFunction())
1340 RC = ARM::tGPRRegisterClass;
1341 else
1342 RC = ARM::GPRRegisterClass;
1343
1344 // Transform the arguments stored in physical registers into virtual ones.
1345 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
1346 SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
1347
1348 SDValue ArgValue2;
1349 if (NextVA.isMemLoc()) {
1350 unsigned ArgSize = NextVA.getLocVT().getSizeInBits()/8;
1351 MachineFrameInfo *MFI = MF.getFrameInfo();
1352 int FI = MFI->CreateFixedObject(ArgSize, NextVA.getLocMemOffset());
1353
1354 // Create load node to retrieve arguments from the stack.
1355 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
1356 ArgValue2 = DAG.getLoad(MVT::i32, dl, Root, FIN, NULL, 0);
1357 } else {
1358 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
1359 ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
1360 }
1361
1362 return DAG.getNode(ARMISD::FMDRR, dl, MVT::f64, ArgValue, ArgValue2);
1363 }
1364
1365 SDValue
11191366 ARMTargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) {
11201367 MachineFunction &MF = DAG.getMachineFunction();
11211368 MachineFrameInfo *MFI = MF.getFrameInfo();
11401387 // Arguments stored in registers.
11411388 if (VA.isRegLoc()) {
11421389 MVT RegVT = VA.getLocVT();
1143 TargetRegisterClass *RC;
1144 if (AFI->isThumbFunction())
1145 RC = ARM::tGPRRegisterClass;
1146 else
1147 RC = ARM::GPRRegisterClass;
1148
1149 if (FloatABIType == FloatABI::Hard) {
1150 if (RegVT == MVT::f32)
1390
1391 SDValue ArgValue;
1392 if (VA.needsCustom()) {
1393 // f64 and vector types are split up into multiple registers or
1394 // combinations of registers and stack slots.
1395 RegVT = MVT::i32;
1396
1397 if (VA.getLocVT() == MVT::v2f64) {
1398 SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i],
1399 Root, DAG, dl);
1400 VA = ArgLocs[++i]; // skip ahead to next loc
1401 SDValue ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i],
1402 Root, DAG, dl);
1403 ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
1404 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
1405 ArgValue, ArgValue1, DAG.getIntPtrConstant(0));
1406 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
1407 ArgValue, ArgValue2, DAG.getIntPtrConstant(1));
1408 } else
1409 ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Root, DAG, dl);
1410
1411 } else {
1412 TargetRegisterClass *RC;
1413 if (FloatABIType == FloatABI::Hard && RegVT == MVT::f32)
11511414 RC = ARM::SPRRegisterClass;
1152 else if (RegVT == MVT::f64)
1415 else if (FloatABIType == FloatABI::Hard && RegVT == MVT::f64)
11531416 RC = ARM::DPRRegisterClass;
1154 } else if (RegVT == MVT::f64) {
1155 // f64 is passed in pairs of GPRs and must be combined.
1156 RegVT = MVT::i32;
1157 } else if (!((RegVT == MVT::i32) || (RegVT == MVT::f32)))
1158 assert(0 && "RegVT not supported by FORMAL_ARGUMENTS Lowering");
1159
1160 // Transform the arguments stored in physical registers into virtual ones.
1161 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
1162 SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, RegVT);
1163
1164 // f64 is passed in i32 pairs and must be combined.
1165 if (VA.needsCustom()) {
1166 SDValue ArgValue2;
1167
1168 VA = ArgLocs[++i]; // skip ahead to next loc
1169 if (VA.isMemLoc()) {
1170 // must be APCS to split like this
1171 unsigned ArgSize = VA.getLocVT().getSizeInBits()/8;
1172 int FI = MFI->CreateFixedObject(ArgSize, VA.getLocMemOffset());
1173
1174 // Create load node to retrieve arguments from the stack.
1175 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
1176 ArgValue2 = DAG.getLoad(MVT::i32, dl, Root, FIN, NULL, 0);
1177 } else {
1178 Reg = MF.addLiveIn(VA.getLocReg(), RC);
1179 ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
1180 }
1181
1182 ArgValue = DAG.getNode(ARMISD::FMDRR, dl, MVT::f64,
1183 ArgValue, ArgValue2);
1417 else if (AFI->isThumbFunction())
1418 RC = ARM::tGPRRegisterClass;
1419 else
1420 RC = ARM::GPRRegisterClass;
1421
1422 assert((RegVT == MVT::i32 || RegVT == MVT::f32 ||
1423 (FloatABIType == FloatABI::Hard && RegVT == MVT::f64)) &&
1424 "RegVT not supported by FORMAL_ARGUMENTS Lowering");
1425
1426 // Transform the arguments in physical registers into virtual ones.
1427 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
1428 ArgValue = DAG.getCopyFromReg(Root, dl, Reg, RegVT);
11841429 }
11851430
11861431 // If this is an 8 or 16-bit value, it is really passed promoted
16371882 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
16381883 }
16391884
1640 static SDValue ExpandSRx(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) {
1641 assert(N->getValueType(0) == MVT::i64 &&
1885 /// getZeroVector - Returns a vector of specified type with all zero elements.
1886 ///
1887 static SDValue getZeroVector(MVT VT, SelectionDAG &DAG, DebugLoc dl) {
1888 assert(VT.isVector() && "Expected a vector type");
1889
1890 // Zero vectors are used to represent vector negation and in those cases
1891 // will be implemented with the NEON VNEG instruction. However, VNEG does
1892 // not support i64 elements, so sometimes the zero vectors will need to be
1893 // explicitly constructed. For those cases, and potentially other uses in
1894 // the future, always build zero vectors as <4 x i32> or <2 x i32> bitcasted
1895 // to their dest type. This ensures they get CSE'd.
1896 SDValue Vec;
1897 SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
1898 if (VT.getSizeInBits() == 64)
1899 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
1900 else
1901 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
1902
1903 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
1904 }
1905
1906 /// getOnesVector - Returns a vector of specified type with all bits set.
1907 ///
1908 static SDValue getOnesVector(MVT VT, SelectionDAG &DAG, DebugLoc dl) {
1909 assert(VT.isVector() && "Expected a vector type");
1910
1911 // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest
1912 // type. This ensures they get CSE'd.
1913 SDValue Vec;
1914 SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
1915 if (VT.getSizeInBits() == 64)
1916 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
1917 else
1918 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
1919
1920 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
1921 }
1922
1923 static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
1924 const ARMSubtarget *ST) {
1925 MVT VT = N->getValueType(0);
1926 DebugLoc dl = N->getDebugLoc();
1927
1928 // Lower vector shifts on NEON to use VSHL.
1929 if (VT.isVector()) {
1930 assert(ST->hasNEON() && "unexpected vector shift");
1931
1932 // Left shifts translate directly to the vshiftu intrinsic.
1933 if (N->getOpcode() == ISD::SHL)
1934 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
1935 DAG.getConstant(Intrinsic::arm_neon_vshiftu, MVT::i32),
1936 N->getOperand(0), N->getOperand(1));
1937
1938 assert((N->getOpcode() == ISD::SRA ||
1939 N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode");
1940
1941 // NEON uses the same intrinsics for both left and right shifts. For
1942 // right shifts, the shift amounts are negative, so negate the vector of
1943 // shift amounts.
1944 MVT ShiftVT = N->getOperand(1).getValueType();
1945 SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT,
1946 getZeroVector(ShiftVT, DAG, dl),
1947 N->getOperand(1));
1948 Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ?
1949 Intrinsic::arm_neon_vshifts :
1950 Intrinsic::arm_neon_vshiftu);
1951 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
1952 DAG.getConstant(vshiftInt, MVT::i32),
1953 N->getOperand(0), NegatedCount);
1954 }
1955
1956 assert(VT == MVT::i64 &&
16421957 (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
16431958 "Unknown shift to lower!");
16441959
16511966 if (ST->isThumb()) return SDValue();
16521967
16531968 // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr.
1654 DebugLoc dl = N->getDebugLoc();
16551969 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
16561970 DAG.getConstant(0, MVT::i32));
16571971 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
16671981
16681982 // Merge the pieces into a single i64 value.
16691983 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
1984 }
1985
1986 static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
1987 SDValue TmpOp0, TmpOp1;
1988 bool Invert = false;
1989 bool Swap = false;
1990 unsigned Opc = 0;
1991
1992 SDValue Op0 = Op.getOperand(0);
1993 SDValue Op1 = Op.getOperand(1);
1994 SDValue CC = Op.getOperand(2);
1995 MVT VT = Op.getValueType();
1996 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
1997 DebugLoc dl = Op.getDebugLoc();
1998
1999 if (Op.getOperand(1).getValueType().isFloatingPoint()) {
2000 switch (SetCCOpcode) {
2001 default: assert(0 && "Illegal FP comparison"); break;
2002 case ISD::SETUNE:
2003 case ISD::SETNE: Invert = true; // Fallthrough
2004 case ISD::SETOEQ:
2005 case ISD::SETEQ: Opc = ARMISD::VCEQ; break;
2006 case ISD::SETOLT:
2007 case ISD::SETLT: Swap = true; // Fallthrough
2008 case ISD::SETOGT:
2009 case ISD::SETGT: Opc = ARMISD::VCGT; break;
2010 case ISD::SETOLE:
2011 case ISD::SETLE: Swap = true; // Fallthrough
2012 case ISD::SETOGE:
2013 case ISD::SETGE: Opc = ARMISD::VCGE; break;
2014 case ISD::SETUGE: Swap = true; // Fallthrough
2015 case ISD::SETULE: Invert = true; Opc = ARMISD::VCGT; break;
2016 case ISD::SETUGT: Swap = true; // Fallthrough
2017 case ISD::SETULT: Invert = true; Opc = ARMISD::VCGE; break;
2018 case ISD::SETUEQ: Invert = true; // Fallthrough
2019 case ISD::SETONE:
2020 // Expand this to (OLT | OGT).
2021 TmpOp0 = Op0;
2022 TmpOp1 = Op1;
2023 Opc = ISD::OR;
2024 Op0 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp1, TmpOp0);
2025 Op1 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp0, TmpOp1);
2026 break;
2027 case ISD::SETUO: Invert = true; // Fallthrough
2028 case ISD::SETO:
2029 // Expand this to (OLT | OGE).
2030 TmpOp0 = Op0;
2031 TmpOp1 = Op1;
2032 Opc = ISD::OR;
2033 Op0 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp1, TmpOp0);
2034 Op1 = DAG.getNode(ARMISD::VCGE, dl, VT, TmpOp0, TmpOp1);
2035 break;
2036 }
2037 } else {
2038 // Integer comparisons.
2039 switch (SetCCOpcode) {
2040 default: assert(0 && "Illegal integer comparison"); break;
2041 case ISD::SETNE: Invert = true;
2042 case ISD::SETEQ: Opc = ARMISD::VCEQ; break;
2043 case ISD::SETLT: Swap = true;
2044 case ISD::SETGT: Opc = ARMISD::VCGT; break;
2045 case ISD::SETLE: Swap = true;
2046 case ISD::SETGE: Opc = ARMISD::VCGE; break;
2047 case ISD::SETULT: Swap = true;
2048 case ISD::SETUGT: Opc = ARMISD::VCGTU; break;
2049 case ISD::SETULE: Swap = true;
2050 case ISD::SETUGE: Opc = ARMISD::VCGEU; break;
2051 }
2052
2053 // Detect VTST (Vector Test Bits) = vicmp ne (and (op0, op1), zero).
2054 if (Opc == ARMISD::VCEQ) {
2055
2056 SDValue AndOp;
2057 if (ISD::isBuildVectorAllZeros(Op1.getNode()))
2058 AndOp = Op0;
2059 else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
2060 AndOp = Op1;
2061
2062 // Ignore bitconvert.
2063 if (AndOp.getNode() && AndOp.getOpcode() == ISD::BIT_CONVERT)
2064 AndOp = AndOp.getOperand(0);
2065
2066 if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
2067 Opc = ARMISD::VTST;
2068 Op0 = DAG.getNode(ISD::BIT_CONVERT, dl, VT, AndOp.getOperand(0));
2069 Op1 = DAG.getNode(ISD::BIT_CONVERT, dl, VT, AndOp.getOperand(1));
2070 Invert = !Invert;
2071 }
2072 }
2073 }
2074
2075 if (Swap)
2076 std::swap(Op0, Op1);
2077
2078 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
2079
2080 if (Invert)
2081 Result = DAG.getNOT(dl, Result, VT);
2082
2083 return Result;
2084 }
2085
2086 /// isVMOVSplat - Check if the specified splat value corresponds to an immediate
2087 /// VMOV instruction, and if so, return the constant being splatted.
2088 static SDValue isVMOVSplat(uint64_t SplatBits, uint64_t SplatUndef,
2089 unsigned SplatBitSize, SelectionDAG &DAG) {
2090 switch (SplatBitSize) {
2091 case 8:
2092 // Any 1-byte value is OK.
2093 assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
2094 return DAG.getTargetConstant(SplatBits, MVT::i8);
2095
2096 case 16:
2097 // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
2098 if ((SplatBits & ~0xff) == 0 ||
2099 (SplatBits & ~0xff00) == 0)
2100 return DAG.getTargetConstant(SplatBits, MVT::i16);
2101 break;
2102
2103 case 32:
2104 // NEON's 32-bit VMOV supports splat values where:
2105 // * only one byte is nonzero, or
2106 // * the least significant byte is 0xff and the second byte is nonzero, or
2107 // * the least significant 2 bytes are 0xff and the third is nonzero.
2108 if ((SplatBits & ~0xff) == 0 ||
2109 (SplatBits & ~0xff00) == 0 ||
2110 (SplatBits & ~0xff0000) == 0 ||
2111 (SplatBits & ~0xff000000) == 0)
2112 return DAG.getTargetConstant(SplatBits, MVT::i32);
2113
2114 if ((SplatBits & ~0xffff) == 0 &&
2115 ((SplatBits | SplatUndef) & 0xff) == 0xff)
2116 return DAG.getTargetConstant(SplatBits | 0xff, MVT::i32);
2117
2118 if ((SplatBits & ~0xffffff) == 0 &&
2119 ((SplatBits | SplatUndef) & 0xffff) == 0xffff)
2120 return DAG.getTargetConstant(SplatBits | 0xffff, MVT::i32);
2121
2122 // Note: there are a few 32-bit splat values (specifically: 00ffff00,
2123 // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
2124 // VMOV.I32. A (very) minor optimization would be to replicate the value
2125 // and fall through here to test for a valid 64-bit splat. But, then the
2126 // caller would also need to check and handle the change in size.
2127 break;
2128
2129 case 64: {
2130 // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
2131 uint64_t BitMask = 0xff;
2132 uint64_t Val = 0;
2133 for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
2134 if (((SplatBits | SplatUndef) & BitMask) == BitMask)
2135 Val |= BitMask;
2136 else if ((SplatBits & BitMask) != 0)
2137 return SDValue();
2138 BitMask <<= 8;
2139 }
2140 return DAG.getTargetConstant(Val, MVT::i64);
2141 }
2142
2143 default:
2144 assert(0 && "unexpected size for isVMOVSplat");
2145 break;
2146 }
2147
2148 return SDValue();
2149 }
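For the 32-bit case, the conditions above amount to the following standalone predicate (invented helper name, not code from this patch; it ignores the undef-bit handling of the real function):

#include <cstdint>

// 32-bit splat values accepted for VMOV: exactly one nonzero byte, or a
// nonzero byte above a run of 0xff bytes. E.g. 0x00ab0000 and 0x0000abff
// qualify; 0x00abcd00 does not.
static bool isValidVMOVI32Splat(uint32_t Bits) {
  return (Bits & ~0xffu) == 0 ||
         (Bits & ~0xff00u) == 0 ||
         (Bits & ~0xff0000u) == 0 ||
         (Bits & ~0xff000000u) == 0 ||
         ((Bits & ~0xffffu) == 0 && (Bits & 0xffu) == 0xffu) ||
         ((Bits & ~0xffffffu) == 0 && (Bits & 0xffffu) == 0xffffu);
}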
2150
2151 /// getVMOVImm - If this is a build_vector of constants which can be
2152 /// formed by using a VMOV instruction of the specified element size,
2153 /// return the constant being splatted. The ByteSize field indicates the
2154 /// number of bytes of each element [1248].
2155 SDValue ARM::getVMOVImm(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
2156 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N);
2157 APInt SplatBits, SplatUndef;
2158 unsigned SplatBitSize;
2159 bool HasAnyUndefs;
2160 if (! BVN || ! BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
2161 HasAnyUndefs, ByteSize * 8))
2162 return SDValue();
2163
2164 if (SplatBitSize > ByteSize * 8)
2165 return SDValue();
2166
2167 return isVMOVSplat(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
2168 SplatBitSize, DAG);
2169 }
2170
2171 static SDValue BuildSplat(SDValue Val, MVT VT, SelectionDAG &DAG, DebugLoc dl) {
2172 // Canonicalize all-zeros and all-ones vectors.
2173 ConstantSDNode *ConstVal = dyn_cast<ConstantSDNode>(Val.getNode());
2174 if (ConstVal->isNullValue())
2175 return getZeroVector(VT, DAG, dl);
2176 if (ConstVal->isAllOnesValue())
2177 return getOnesVector(VT, DAG, dl);
2178
2179 MVT CanonicalVT;
2180 if (VT.is64BitVector()) {
2181 switch (Val.getValueType().getSizeInBits()) {
2182 case 8: CanonicalVT = MVT::v8i8; break;
2183 case 16: CanonicalVT = MVT::v4i16; break;
2184 case 32: CanonicalVT = MVT::v2i32; break;
2185 case 64: CanonicalVT = MVT::v1i64; break;
2186 default: assert(0 && "unexpected splat element type"); break;
2187 }
2188 } else {
2189 assert(VT.is128BitVector() && "unknown splat vector size");
2190 switch (Val.getValueType().getSizeInBits()) {
2191 case 8: CanonicalVT = MVT::v16i8; break;
2192 case 16: CanonicalVT = MVT::v8i16; break;
2193 case 32: CanonicalVT = MVT::v4i32; break;
2194 case 64: CanonicalVT = MVT::v2i64; break;
2195 default: assert(0 && "unexpected splat element type"); break;
2196 }
2197 }
2198
2199 // Build a canonical splat for this value.
2200 SmallVector<SDValue, 8> Ops;
2201 Ops.assign(CanonicalVT.getVectorNumElements(), Val);
2202 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, dl, CanonicalVT, &Ops[0],
2203 Ops.size());
2204 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Res);
2205 }
2206
2207 // If this is a case we can't handle, return null and let the default
2208 // expansion code take care of it.
2209 static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
2210 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
2211 assert(BVN != 0 && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR");
2212 DebugLoc dl = Op.getDebugLoc();
2213
2214 APInt SplatBits, SplatUndef;
2215 unsigned SplatBitSize;
2216 bool HasAnyUndefs;
2217 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
2218 SDValue Val = isVMOVSplat(SplatBits.getZExtValue(),
2219 SplatUndef.getZExtValue(), SplatBitSize, DAG);
2220 if (Val.getNode())
2221 return BuildSplat(Val, Op.getValueType(), DAG, dl);
2222 }
2223
2224 return SDValue();
2225 }
2226
2227 static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
2228 return Op;
2229 }
2230
2231 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
2232 return Op;
2233 }
2234
2235 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
2236 MVT VT = Op.getValueType();
2237 DebugLoc dl = Op.getDebugLoc();
2238 assert((VT == MVT::i8 || VT == MVT::i16) &&
2239 "unexpected type for custom-lowering vector extract");
2240 SDValue Vec = Op.getOperand(0);
2241 SDValue Lane = Op.getOperand(1);
2242 Op = DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
2243 Op = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Op, DAG.getValueType(VT));
2244 return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
2245 }
2246
2247 static SDValue LowerCONCAT_VECTORS(SDValue Op) {
2248 if (Op.getValueType().is128BitVector() && Op.getNumOperands() == 2)
2249 return Op;
2250 return SDValue();
16702251 }
16712252
16722253 SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
16942275 case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG);
16952276 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
16962277 case ISD::BIT_CONVERT: return ExpandBIT_CONVERT(Op.getNode(), DAG);
2278 case ISD::SHL:
16972279 case ISD::SRL:
1698 case ISD::SRA: return ExpandSRx(Op.getNode(), DAG,Subtarget);
2280 case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget);
2281 case ISD::VSETCC: return LowerVSETCC(Op, DAG);
2282 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
2283 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
2284 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
2285 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2286 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op);
16992287 }
17002288 return SDValue();
17012289 }
17142302 return;
17152303 case ISD::SRL:
17162304 case ISD::SRA: {
1717 SDValue Res = ExpandSRx(N, DAG, Subtarget);
2305 SDValue Res = LowerShift(N, DAG, Subtarget);
17182306 if (Res.getNode())
17192307 Results.push_back(Res);
17202308 return;
18992487 return SDValue();
19002488 }
19012489
2490 /// getVShiftImm - Check if this is a valid build_vector for the immediate
2491 /// operand of a vector shift operation, where all the elements of the
2492 /// build_vector must have the same constant integer value.
2493 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
2494 // Ignore bit_converts.
2495 while (Op.getOpcode() == ISD::BIT_CONVERT)
2496 Op = Op.getOperand(0);
2497 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
2498 APInt SplatBits, SplatUndef;
2499 unsigned SplatBitSize;
2500 bool HasAnyUndefs;
2501 if (! BVN || ! BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
2502 HasAnyUndefs, ElementBits) ||
2503 SplatBitSize > ElementBits)
2504 return false;
2505 Cnt = SplatBits.getSExtValue();
2506 return true;
2507 }
2508
2509 /// isVShiftLImm - Check if this is a valid build_vector for the immediate
2510 /// operand of a vector shift left operation. That value must be in the range:
2511 /// 0 <= Value < ElementBits for a left shift; or
2512 /// 0 <= Value <= ElementBits for a long left shift.
2513 static bool isVShiftLImm(SDValue Op, MVT VT, bool isLong, int64_t &Cnt) {
2514 assert(VT.isVector() && "vector shift count is not a vector type");
2515 unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
2516 if (! getVShiftImm(Op, ElementBits, Cnt))
2517 return false;
2518 return (Cnt >= 0 && (isLong ? Cnt-1 : Cnt) < ElementBits);
2519 }
2520
2521 /// isVShiftRImm - Check if this is a valid build_vector for the immediate
2522 /// operand of a vector shift right operation. For a shift opcode, the value
2523 /// is positive, but for an intrinsic the value count must be negative. The
2524 /// absolute value must be in the range:
2525 /// 1 <= |Value| <= ElementBits for a right shift; or
2526 /// 1 <= |Value| <= ElementBits/2 for a narrow right shift.
2527 static bool isVShiftRImm(SDValue Op, MVT VT, bool isNarrow, bool isIntrinsic,
2528 int64_t &Cnt) {
2529 assert(VT.isVector() && "vector shift count is not a vector type");
2530 unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
2531 if (! getVShiftImm(Op, ElementBits, Cnt))
2532 return false;
2533 if (isIntrinsic)
2534 Cnt = -Cnt;
2535 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits));
2536 }
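Taken together, the two predicates above encode the architectural immediate ranges: a left-shift immediate is 0..ElementBits-1 (0..ElementBits for the long forms), and a right-shift immediate is 1..ElementBits (1..ElementBits/2 for the narrowing forms), with the intrinsics expressing right shifts as negative counts. A hedged stand-alone restatement of those bounds (illustrative only, not the in-tree code):

#include <cstdint>

// Valid immediate range for a vector shift left; isLong mirrors vshll,
// which also allows a shift equal to the element width.
static bool validLeftShift(int64_t Cnt, unsigned ElementBits, bool isLong) {
  return Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < (int64_t)ElementBits;
}

// Valid immediate range for a vector shift right; isNarrow mirrors the
// narrowing forms, which shift by at most half the source element width.
static bool validRightShift(int64_t Cnt, unsigned ElementBits, bool isNarrow) {
  return Cnt >= 1 && Cnt <= (int64_t)(isNarrow ? ElementBits / 2 : ElementBits);
}

// For v4i16 (ElementBits = 16): vshl allows 0..15, vshll allows 0..16,
// vshr allows 1..16, and the narrowing vshrn/vqshrn forms allow 1..8.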
2537
2538 /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
2539 static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
2540 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
2541 switch (IntNo) {
2542 default:
2543 // Don't do anything for most intrinsics.
2544 break;
2545
2546 // Vector shifts: check for immediate versions and lower them.
2547 // Note: This is done during DAG combining instead of DAG legalizing because
2548 // the build_vectors for 64-bit vector element shift counts are generally
2549 // not legal, and it is hard to see their values after they get legalized to
2550 // loads from a constant pool.
2551 case Intrinsic::arm_neon_vshifts:
2552 case Intrinsic::arm_neon_vshiftu:
2553 case Intrinsic::arm_neon_vshiftls:
2554 case Intrinsic::arm_neon_vshiftlu:
2555 case Intrinsic::arm_neon_vshiftn:
2556 case Intrinsic::arm_neon_vrshifts:
2557 case Intrinsic::arm_neon_vrshiftu:
2558 case Intrinsic::arm_neon_vrshiftn:
2559 case Intrinsic::arm_neon_vqshifts:
2560 case Intrinsic::arm_neon_vqshiftu:
2561 case Intrinsic::arm_neon_vqshiftsu:
2562 case Intrinsic::arm_neon_vqshiftns:
2563 case Intrinsic::arm_neon_vqshiftnu:
2564 case Intrinsic::arm_neon_vqshiftnsu:
2565 case Intrinsic::arm_neon_vqrshiftns:
2566 case Intrinsic::arm_neon_vqrshiftnu:
2567 case Intrinsic::arm_neon_vqrshiftnsu: {
2568 MVT VT = N->getOperand(1).getValueType();
2569 int64_t Cnt;
2570 unsigned VShiftOpc = 0;
2571
2572 switch (IntNo) {
2573 case Intrinsic::arm_neon_vshifts:
2574 case Intrinsic::arm_neon_vshiftu:
2575 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
2576 VShiftOpc = ARMISD::VSHL;
2577 break;
2578 }
2579 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
2580 VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ?
2581 ARMISD::VSHRs : ARMISD::VSHRu);
2582 break;
2583 }
2584 return SDValue();
2585
2586 case Intrinsic::arm_neon_vshiftls:
2587 case Intrinsic::arm_neon_vshiftlu:
2588 if (isVShiftLImm(N->getOperand(2), VT, true, Cnt))
2589 break;
2590 assert(0 && "invalid shift count for vshll intrinsic");
2591 abort();
2592
2593 case Intrinsic::arm_neon_vrshifts:
2594 case Intrinsic::arm_neon_vrshiftu:
2595 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
2596 break;
2597 return SDValue();
2598
2599 case Intrinsic::arm_neon_vqshifts:
2600 case Intrinsic::arm_neon_vqshiftu:
2601 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
2602 break;
2603 return SDValue();
2604
2605 case Intrinsic::arm_neon_vqshiftsu:
2606 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
2607 break;
2608 assert(0 && "invalid shift count for vqshlu intrinsic");
2609 abort();
2610
2611 case Intrinsic::arm_neon_vshiftn:
2612 case Intrinsic::arm_neon_vrshiftn:
2613 case Intrinsic::arm_neon_vqshiftns:
2614 case Intrinsic::arm_neon_vqshiftnu:
2615 case Intrinsic::arm_neon_vqshiftnsu:
2616 case Intrinsic::arm_neon_vqrshiftns:
2617 case Intrinsic::arm_neon_vqrshiftnu:
2618 case Intrinsic::arm_neon_vqrshiftnsu:
2619 // Narrowing shifts require an immediate right shift.
2620 if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
2621 break;
2622 assert(0 && "invalid shift count for narrowing vector shift intrinsic");
2623 abort();
2624
2625 default:
2626 assert(0 && "unhandled vector shift");
2627 }
2628
2629 switch (IntNo) {
2630 case Intrinsic::arm_neon_vshifts:
2631 case Intrinsic::arm_neon_vshiftu:
2632 // Opcode already set above.
2633 break;
2634 case Intrinsic::arm_neon_vshiftls:
2635 case Intrinsic::arm_neon_vshiftlu:
2636 if (Cnt == VT.getVectorElementType().getSizeInBits())
2637 VShiftOpc = ARMISD::VSHLLi;
2638 else
2639 VShiftOpc = (IntNo == Intrinsic::arm_neon_vshiftls ?
2640 ARMISD::VSHLLs : ARMISD::VSHLLu);
2641 break;
2642 case Intrinsic::arm_neon_vshiftn:
2643 VShiftOpc = ARMISD::VSHRN; break;
2644 case Intrinsic::arm_neon_vrshifts:
2645 VShiftOpc = ARMISD::VRSHRs; break;
2646 case Intrinsic::arm_neon_vrshiftu:
2647 VShiftOpc = ARMISD::VRSHRu; break;
2648 case Intrinsic::arm_neon_vrshiftn:
2649 VShiftOpc = ARMISD::VRSHRN; break;
2650 case Intrinsic::arm_neon_vqshifts:
2651 VShiftOpc = ARMISD::VQSHLs; break;
2652 case Intrinsic::arm_neon_vqshiftu:
2653 VShiftOpc = ARMISD::VQSHLu; break;
2654 case Intrinsic::arm_neon_vqshiftsu:
2655 VShiftOpc = ARMISD::VQSHLsu; break;
2656 case Intrinsic::arm_neon_vqshiftns:
2657 VShiftOpc = ARMISD::VQSHRNs; break;
2658 case Intrinsic::arm_neon_vqshiftnu:
2659 VShiftOpc = ARMISD::VQSHRNu; break;
2660 case Intrinsic::arm_neon_vqshiftnsu:
2661 VShiftOpc = ARMISD::VQSHRNsu; break;
2662 case Intrinsic::arm_neon_vqrshiftns:
2663 VShiftOpc = ARMISD::VQRSHRNs; break;
2664 case Intrinsic::arm_neon_vqrshiftnu:
2665 VShiftOpc = ARMISD::VQRSHRNu; break;
2666 case Intrinsic::arm_neon_vqrshiftnsu:
2667 VShiftOpc = ARMISD::VQRSHRNsu; break;
2668 }
2669
2670 return DAG.getNode(VShiftOpc, N->getDebugLoc(), N->getValueType(0),
2671 N->getOperand(1), DAG.getConstant(Cnt, MVT::i32));
2672 }
2673
2674 case Intrinsic::arm_neon_vshiftins: {
2675 MVT VT = N->getOperand(1).getValueType();
2676 int64_t Cnt;
2677 unsigned VShiftOpc = 0;
2678
2679 if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
2680 VShiftOpc = ARMISD::VSLI;
2681 else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
2682 VShiftOpc = ARMISD::VSRI;
2683 else {
2684 assert(0 && "invalid shift count for vsli/vsri intrinsic");
2685 abort();
2686 }
2687
2688 return DAG.getNode(VShiftOpc, N->getDebugLoc(), N->getValueType(0),
2689 N->getOperand(1), N->getOperand(2),
2690 DAG.getConstant(Cnt, MVT::i32));
2691 }
2692
2693 case Intrinsic::arm_neon_vqrshifts:
2694 case Intrinsic::arm_neon_vqrshiftu:
2695 // No immediate versions of these to check for.
2696 break;
2697 }
2698
2699 return SDValue();
2700 }
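One convention worth calling out in the combine above: the variable-shift NEON intrinsics (vshifts, vshiftu and friends) take a signed per-element count, and a negative count means a right shift; that is why isVShiftRImm negates the count when isIntrinsic is true before the node is lowered to a right-shift opcode. A tiny scalar model of that convention (illustrative, assuming a 16-bit element and an in-range count):

#include <cstdint>

// Scalar model of the NEON "shift by signed count" convention used by the
// vshift intrinsics: a negative count shifts right instead of left.
static uint16_t vshl_u16_by_signed_count(uint16_t x, int8_t count) {
  return count >= 0 ? (uint16_t)(x << count) : (uint16_t)(x >> -count);
}

// vshl_u16_by_signed_count(0x0F00, 4)  == 0xF000   (left shift by 4)
// vshl_u16_by_signed_count(0x0F00, -4) == 0x00F0   (right shift by 4)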
2701
2702 /// PerformShiftCombine - Checks for immediate versions of vector shifts and
2703 /// lowers them. As with the vector shift intrinsics, this is done during DAG
2704 /// combining instead of DAG legalizing because the build_vectors for 64-bit
2705 /// vector element shift counts are generally not legal, and it is hard to see
2706 /// their values after they get legalized to loads from a constant pool.
2707 static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
2708 const ARMSubtarget *ST) {
2709 MVT VT = N->getValueType(0);
2710
2711 // Nothing to be done for scalar shifts.
2712 if (! VT.isVector())
2713 return SDValue();
2714
2715 assert(ST->hasNEON() && "unexpected vector shift");
2716 int64_t Cnt;
2717
2718 switch (N->getOpcode()) {
2719 default: assert(0 && "unexpected shift opcode");
2720
2721 case ISD::SHL:
2722 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
2723 return DAG.getNode(ARMISD::VSHL, N->getDebugLoc(), VT, N->getOperand(0),
2724 DAG.getConstant(Cnt, MVT::i32));
2725 break;
2726
2727 case ISD::SRA:
2728 case ISD::SRL:
2729 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
2730 unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ?
2731 ARMISD::VSHRs : ARMISD::VSHRu);
2732 return DAG.getNode(VShiftOpc, N->getDebugLoc(), VT, N->getOperand(0),
2733 DAG.getConstant(Cnt, MVT::i32));
2734 }
2735 }
2736 return SDValue();
2737 }
2738
2739 /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
2740 /// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
2741 static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
2742 const ARMSubtarget *ST) {
2743 SDValue N0 = N->getOperand(0);
2744
2745 // Check for sign- and zero-extensions of vector extract operations of 8-
2746 // and 16-bit vector elements. NEON supports these directly. They are
2747 // handled during DAG combining because type legalization will promote them
2748 // to 32-bit types and it is messy to recognize the operations after that.
2749 if (ST->hasNEON() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
2750 SDValue Vec = N0.getOperand(0);
2751 SDValue Lane = N0.getOperand(1);
2752 MVT VT = N->getValueType(0);
2753 MVT EltVT = N0.getValueType();
2754 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2755
2756 if (VT == MVT::i32 &&
2757 (EltVT == MVT::i8 || EltVT == MVT::i16) &&
2758 TLI.isTypeLegal(Vec.getValueType())) {
2759
2760 unsigned Opc = 0;
2761 switch (N->getOpcode()) {
2762 default: assert(0 && "unexpected opcode");
2763 case ISD::SIGN_EXTEND:
2764 Opc = ARMISD::VGETLANEs;
2765 break;
2766 case ISD::ZERO_EXTEND:
2767 case ISD::ANY_EXTEND:
2768 Opc = ARMISD::VGETLANEu;
2769 break;
2770 }
2771 return DAG.getNode(Opc, N->getDebugLoc(), VT, Vec, Lane);
2772 }
2773 }
2774
2775 return SDValue();
2776 }
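For context on what this combine buys: extracting an 8- or 16-bit lane and then sign- or zero-extending it to i32 corresponds to a single NEON move-from-lane instruction (for example "vmov.s8 r0, d0[3]" or "vmov.u16 r0, d1[2]"), so folding the extend into a VGETLANEs/VGETLANEu node lets instruction selection emit just that one instruction. A scalar sketch of the semantics being matched (plain C++, not LLVM code):

#include <cstdint>

// sign_extend(extract_vector_elt(v, lane)) for an i8 lane:
static int32_t getlane_s8(const int8_t v[8], unsigned lane) {
  return (int32_t)v[lane]; // would select as ARMISD::VGETLANEs, i.e. vmov.s8
}

// zero_extend(extract_vector_elt(v, lane)) for an i8 lane:
static int32_t getlane_u8(const uint8_t v[8], unsigned lane) {
  return (int32_t)v[lane]; // would select as ARMISD::VGETLANEu, i.e. vmov.u8
}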
2777
19022778 SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
19032779 DAGCombinerInfo &DCI) const {
19042780 switch (N->getOpcode()) {
19062782 case ISD::ADD: return PerformADDCombine(N, DCI);
19072783 case ISD::SUB: return PerformSUBCombine(N, DCI);
19082784 case ARMISD::FMRRD: return PerformFMRRDCombine(N, DCI);
1909 }
1910
2785 case ISD::INTRINSIC_WO_CHAIN:
2786 return PerformIntrinsicCombine(N, DCI.DAG);
2787 case ISD::SHL:
2788 case ISD::SRA:
2789 case ISD::SRL:
2790 return PerformShiftCombine(N, DCI.DAG, Subtarget);
2791 case ISD::SIGN_EXTEND:
2792 case ISD::ZERO_EXTEND:
2793 case ISD::ANY_EXTEND:
2794 return PerformExtendCombine(N, DCI.DAG, Subtarget);
2795 }
19112796 return SDValue();
19122797 }
19132798
6666 EH_SJLJ_SETJMP, // SjLj exception handling setjmp
6767 EH_SJLJ_LONGJMP, // SjLj exception handling longjmp
6868
69 THREAD_POINTER
69 THREAD_POINTER,
70
71 VCEQ, // Vector compare equal.
72 VCGE, // Vector compare greater than or equal.
73 VCGEU, // Vector compare unsigned greater than or equal.
74 VCGT, // Vector compare greater than.
75 VCGTU, // Vector compare unsigned greater than.
76 VTST, // Vector test bits.
77
78 // Vector shift by immediate:
79 VSHL, // ...left
80 VSHRs, // ...right (signed)
81 VSHRu, // ...right (unsigned)
82 VSHLLs, // ...left long (signed)
83 VSHLLu, // ...left long (unsigned)
84 VSHLLi, // ...left long (with maximum shift count)
85 VSHRN, // ...right narrow
86
87 // Vector rounding shift by immediate:
88 VRSHRs, // ...right (signed)
89 VRSHRu, // ...right (unsigned)
90 VRSHRN, // ...right narrow
91
92 // Vector saturating shift by immediate:
93 VQSHLs, // ...left (signed)
94 VQSHLu, // ...left (unsigned)
95 VQSHLsu, // ...left (signed to unsigned)
96 VQSHRNs, // ...right narrow (signed)
97 VQSHRNu, // ...right narrow (unsigned)
98 VQSHRNsu, // ...right narrow (signed to unsigned)
99
100 // Vector saturating rounding shift by immediate:
101 VQRSHRNs, // ...right narrow (signed)
102 VQRSHRNu, // ...right narrow (unsigned)
103 VQRSHRNsu, // ...right narrow (signed to unsigned)
104
105 // Vector shift and insert:
106 VSLI, // ...left
107 VSRI, // ...right
108
109 // Vector get lane (VMOV scalar to ARM core register)
110 // (These are used for 8- and 16-bit element types only.)
111 VGETLANEu, // zero-extend vector extract element
112 VGETLANEs, // sign-extend vector extract element
113
114 // Vector duplicate lane (128-bit result only; 64-bit is a shuffle)
115 VDUPLANEQ // splat a lane from a 64-bit vector to a 128-bit vector
70116 };
117 }
118
119 /// Define some predicates that are used for node matching.
120 namespace ARM {
121 /// getVMOVImm - If this is a build_vector of constants which can be
122 /// formed by using a VMOV instruction of the specified element size,
123 /// return the constant being splatted. The ByteSize field indicates the
124 /// number of bytes of each element (1, 2, 4, or 8).
125 SDValue getVMOVImm(SDNode *N, unsigned ByteSize, SelectionDAG &DAG);
71126 }
72127
73128 //===--------------------------------------------------------------------===//
149204 /// ARMPCLabelIndex - Keep track the number of ARM PC labels created.
150205 ///
151206 unsigned ARMPCLabelIndex;
207
208 void addTypeForNEON(MVT VT, MVT PromotedLdStVT, MVT PromotedBitwiseVT);
209 void addDRTypeForNEON(MVT VT);
210 void addQRTypeForNEON(MVT VT);
211
212 typedef SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPassVector;
213 void PassF64ArgInRegs(CallSDNode *TheCall, SelectionDAG &DAG,
214 SDValue Chain, SDValue &Arg,
215 RegsToPassVector &RegsToPass,
216 CCValAssign &VA, CCValAssign &NextVA,
217 SDValue &StackPtr,
218 SmallVector<SDValue, 8> &MemOpChains,
219 ISD::ArgFlagsTy Flags);
220 SDValue GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
221 SDValue &Root, SelectionDAG &DAG, DebugLoc dl);
152222
153223 CCAssignFn *CCAssignFnForNode(unsigned CC, bool Return) const;
154224 SDValue LowerMemOpCallTo(CallSDNode *TheCall, SelectionDAG &DAG,
4747 def VFPMiscFrm : Format<22>;
4848
4949 def ThumbFrm : Format<23>;
50
51 def NEONFrm : Format<24>;
52 def NEONGetLnFrm : Format<25>;
53 def NEONSetLnFrm : Format<26>;
54 def NEONDupFrm : Format<27>;
5055
5156 // Misc flag for data processing instructions that indicates whether
5257 // the instruction has a Rn register operand.
736741 class TJTI pattern>
737742 : ThumbI;
738743
744 // ThumbPat - Same as Pat<>, but requires that the compiler be in Thumb mode.
745 class ThumbPat<dag pattern, dag result> : Pat<pattern, result> {
746   list<Predicate> Predicates = [IsThumb];
747 }
748
749 class ThumbV5Pat<dag pattern, dag result> : Pat<pattern, result> {
750   list<Predicate> Predicates = [IsThumb, HasV5T];
751 }
739752
740753 //===----------------------------------------------------------------------===//
741754
856869
857870 //===----------------------------------------------------------------------===//
858871
859
860 // ThumbPat - Same as Pat<>, but requires that the compiler be in Thumb mode.
861 class ThumbPat<dag pattern, dag result> : Pat<pattern, result> {
862   list<Predicate> Predicates = [IsThumb];
863 }
864
865 class ThumbV5Pat<dag pattern, dag result> : Pat<pattern, result> {
866   list<Predicate> Predicates = [IsThumb, HasV5T];
867 }
872 //===----------------------------------------------------------------------===//
873 // ARM NEON Instruction templates.
874 //
875
876 class NeonI
877 string cstr, list pattern>
878 : InstARM {
879 let OutOperandList = oops;
880 let InOperandList = iops;
881 let AsmString = asm;
882 let Pattern = pattern;
883 list<Predicate> Predicates = [HasNEON];
884 }
885
886 class NI pattern>
887 : NeonI {
888 }
889
890 class NDataI pattern>
891 : NeonI {
892 let Inst{31-25} = 0b1111001;
893 }
894
895 // NEON "one register and a modified immediate" format.
896 class N1ModImm op21_19, bits<4> op11_8, bit op7, bit op6,
897 bit op5, bit op4,
898 dag oops, dag iops, string asm, string cstr, list pattern>
899 : NDataI {
900 let Inst{23} = op23;
901 let Inst{21-19} = op21_19;
902 let Inst{11-8} = op11_8;
903 let Inst{7} = op7;
904 let Inst{6} = op6;
905 let Inst{5} = op5;
906 let Inst{4} = op4;
907 }
908
909 // NEON 2 vector register format.
910 class N2V op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16,
911 bits<5> op11_7, bit op6, bit op4,
912 dag oops, dag iops, string asm, string cstr, list pattern>
913 : NDataI {
914 let Inst{24-23} = op24_23;
915 let Inst{21-20} = op21_20;
916 let Inst{19-18} = op19_18;
917 let Inst{17-16} = op17_16;
918 let Inst{11-7} = op11_7;
919 let Inst{6} = op6;
920 let Inst{4} = op4;
921 }
922
923 // NEON 2 vector register with immediate.
924 class N2VImm op21_16, bits<4> op11_8, bit op7,
925 bit op6, bit op4,
926 dag oops, dag iops, string asm, string cstr, list pattern>
927 : NDataI {
928 let Inst{24} = op24;
929 let Inst{23} = op23;
930 let Inst{21-16} = op21_16;
931 let Inst{11-8} = op11_8;
932 let Inst{7} = op7;
933 let Inst{6} = op6;
934 let Inst{4} = op4;
935 }
936
937 // NEON 3 vector register format.
938 class N3V op21_20, bits<4> op11_8, bit op6, bit op4,
939 dag oops, dag iops, string asm, string cstr, list pattern>
940 : NDataI {
941 let Inst{24} = op24;
942 let Inst{23} = op23;
943 let Inst{21-20} = op21_20;
944 let Inst{11-8} = op11_8;
945 let Inst{6} = op6;
946 let Inst{4} = op4;
947 }
948
949 // NEON VMOVs between scalar and core registers.
950 class NVLaneOp opcod1, bits<4> opcod2, bits<2> opcod3,
951 dag oops, dag iops, Format f, string opc, string asm,
952 list pattern>
953 : AI {
954 let Inst{27-20} = opcod1;
955 let Inst{11-8} = opcod2;
956 let Inst{6-5} = opcod3;
957 let Inst{4} = 1;
958 list<Predicate> Predicates = [HasNEON];
959 }
960 class NVGetLane opcod1, bits<4> opcod2, bits<2> opcod3,
961 dag oops, dag iops, string opc, string asm, list pattern>
962 : NVLaneOp
963 pattern>;
964 class NVSetLane opcod1, bits<4> opcod2, bits<2> opcod3,
965 dag oops, dag iops, string opc, string asm, list pattern>
966 : NVLaneOp
967 pattern>;
968 class NVDup opcod1, bits<4> opcod2, bits<2> opcod3,
969 dag oops, dag iops, string opc, string asm, list pattern>
970 : NVLaneOp;
5858 return false;
5959 case ARM::FCPYS:
6060 case ARM::FCPYD:
61 case ARM::VMOVD:
62 case ARM::VMOVQ:
6163 SrcReg = MI.getOperand(1).getReg();
6264 DstReg = MI.getOperand(0).getReg();
6365 return true;
527529 else if (DestRC == ARM::DPRRegisterClass)
528530 AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FCPYD), DestReg)
529531 .addReg(SrcReg));
532 else if (DestRC == ARM::QPRRegisterClass)
533 BuildMI(MBB, I, DL, get(ARM::VMOVQ), DestReg).addReg(SrcReg);
530534 else
531535 return false;
532536
843847 case ARM::FCPYS:
844848 case ARM::FCPYD:
845849 return true;
850
851 case ARM::VMOVD:
852 case ARM::VMOVQ:
853 return false; // FIXME
846854 }
847855
848856 return false;
112112
113113 // Thumb format
114114 ThumbFrm = 23 << FormShift,
115
116 // NEON format
117 NEONFrm = 24 << FormShift,
118 NEONGetLnFrm = 25 << FormShift,
119 NEONSetLnFrm = 26 << FormShift,
120 NEONDupFrm = 27 << FormShift,
115121
116122 //===------------------------------------------------------------------===//
117123 // Field shifts - such shifts are used to set field while generating
9292 def HasV5T : Predicate<"Subtarget->hasV5TOps()">;
9393 def HasV5TE : Predicate<"Subtarget->hasV5TEOps()">;
9494 def HasV6 : Predicate<"Subtarget->hasV6Ops()">;
95 def HasV7 : Predicate<"Subtarget->hasV7Ops()">;
96 def HasVFP2 : Predicate<"Subtarget->hasVFP2()">;
97 def HasVFP3 : Predicate<"Subtarget->hasVFP3()">;
98 def HasNEON : Predicate<"Subtarget->hasNEON()">;
9599 def IsThumb : Predicate<"Subtarget->isThumb()">;
96100 def HasThumb2 : Predicate<"Subtarget->hasThumb2()">;
97101 def IsARM : Predicate<"!Subtarget->isThumb()">;
14361440 //
14371441
14381442 include "ARMInstrVFP.td"
1443
1444 //===----------------------------------------------------------------------===//
1445 // Advanced SIMD (NEON) Support
1446 //
1447
1448 include "ARMInstrNEON.td"
0 //===- ARMInstrNEON.td - NEON support for ARM -----------------------------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file describes the ARM NEON instruction set.
10 //
11 //===----------------------------------------------------------------------===//
12
13 //===----------------------------------------------------------------------===//
14 // NEON-specific DAG Nodes.
15 //===----------------------------------------------------------------------===//
16
17 def SDTARMVCMP : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<1, 2>]>;
18
19 def NEONvceq : SDNode<"ARMISD::VCEQ", SDTARMVCMP>;
20 def NEONvcge : SDNode<"ARMISD::VCGE", SDTARMVCMP>;
21 def NEONvcgeu : SDNode<"ARMISD::VCGEU", SDTARMVCMP>;
22 def NEONvcgt : SDNode<"ARMISD::VCGT", SDTARMVCMP>;
23 def NEONvcgtu : SDNode<"ARMISD::VCGTU", SDTARMVCMP>;
24 def NEONvtst : SDNode<"ARMISD::VTST", SDTARMVCMP>;
25
26 // Types for vector shift by immediates. The "SHX" version is for long and
27 // narrow operations where the source and destination vectors have different
28 // types. The "SHINS" version is for shift and insert operations.
29 def SDTARMVSH : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
30 SDTCisVT<2, i32>]>;
31 def SDTARMVSHX : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
32 SDTCisVT<2, i32>]>;
33 def SDTARMVSHINS : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
34 SDTCisSameAs<0, 2>, SDTCisVT<3, i32>]>;
35
36 def NEONvshl : SDNode<"ARMISD::VSHL", SDTARMVSH>;
37 def NEONvshrs : SDNode<"ARMISD::VSHRs", SDTARMVSH>;
38 def NEONvshru : SDNode<"ARMISD::VSHRu", SDTARMVSH>;
39 def NEONvshlls : SDNode<"ARMISD::VSHLLs", SDTARMVSHX>;
40 def NEONvshllu : SDNode<"ARMISD::VSHLLu", SDTARMVSHX>;
41 def NEONvshlli : SDNode<"ARMISD::VSHLLi", SDTARMVSHX>;
42 def NEONvshrn : SDNode<"ARMISD::VSHRN", SDTARMVSHX>;
43
44 def NEONvrshrs : SDNode<"ARMISD::VRSHRs", SDTARMVSH>;
45 def NEONvrshru : SDNode<"ARMISD::VRSHRu", SDTARMVSH>;
46 def NEONvrshrn : SDNode<"ARMISD::VRSHRN", SDTARMVSHX>;
47
48 def NEONvqshls : SDNode<"ARMISD::VQSHLs", SDTARMVSH>;
49 def NEONvqshlu : SDNode<"ARMISD::VQSHLu", SDTARMVSH>;
50 def NEONvqshlsu : SDNode<"ARMISD::VQSHLsu", SDTARMVSH>;
51 def NEONvqshrns : SDNode<"ARMISD::VQSHRNs", SDTARMVSHX>;
52 def NEONvqshrnu : SDNode<"ARMISD::VQSHRNu", SDTARMVSHX>;
53 def NEONvqshrnsu : SDNode<"ARMISD::VQSHRNsu", SDTARMVSHX>;
54
55 def NEONvqrshrns : SDNode<"ARMISD::VQRSHRNs", SDTARMVSHX>;
56 def NEONvqrshrnu : SDNode<"ARMISD::VQRSHRNu", SDTARMVSHX>;
57 def NEONvqrshrnsu : SDNode<"ARMISD::VQRSHRNsu", SDTARMVSHX>;
58
59 def NEONvsli : SDNode<"ARMISD::VSLI", SDTARMVSHINS>;
60 def NEONvsri : SDNode<"ARMISD::VSRI", SDTARMVSHINS>;
61
62 def SDTARMVGETLN : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisInt<1>,
63 SDTCisVT<2, i32>]>;
64 def NEONvgetlaneu : SDNode<"ARMISD::VGETLANEu", SDTARMVGETLN>;
65 def NEONvgetlanes : SDNode<"ARMISD::VGETLANEs", SDTARMVGETLN>;
66
67 def NEONvduplaneq : SDNode<"ARMISD::VDUPLANEQ",
68 SDTypeProfile<1, 2, [SDTCisVT<2, i32>]>>;
69
70 //===----------------------------------------------------------------------===//
71 // NEON operand definitions
72 //===----------------------------------------------------------------------===//
73
74 // addrmode_neonldstm := reg
75 //
76 /* TODO: Take advantage of vldm.
77 def addrmode_neonldstm : Operand,
78 ComplexPattern {
79 let PrintMethod = "printAddrNeonLdStMOperand";
80 let MIOperandInfo = (ops GPR, i32imm);
81 }
82 */
83
84 //===----------------------------------------------------------------------===//
85 // NEON load / store instructions
86 //===----------------------------------------------------------------------===//
87
88 /* TODO: Take advantage of vldm.
89 let mayLoad = 1 in {
90 def VLDMD : NI<(outs),
91 (ins addrmode_neonldstm:$addr, reglist:$dst1, variable_ops),
92 "vldm${addr:submode} ${addr:base}, $dst1",
93 []>;
94
95 def VLDMS : NI<(outs),
96 (ins addrmode_neonldstm:$addr, reglist:$dst1, variable_ops),
97 "vldm${addr:submode} ${addr:base}, $dst1",
98 []>;
99 }
100 */
101
102 // Use vldmia to load a Q register as a D register pair.
103 def VLDRQ : NI<(outs QPR:$dst), (ins GPR:$addr),
104 "vldmia $addr, ${dst:dregpair}",
105 [(set QPR:$dst, (v2f64 (load GPR:$addr)))]>;
106
107 // Use vstmia to store a Q register as a D register pair.
108 def VSTRQ : NI<(outs), (ins QPR:$src, GPR:$addr),
109 "vstmia $addr, ${src:dregpair}",
110 [(store (v2f64 QPR:$src), GPR:$addr)]>;
111
112
113 //===----------------------------------------------------------------------===//
114 // NEON pattern fragments
115 //===----------------------------------------------------------------------===//
116
117 // Extract D sub-registers of Q registers.
118 // (arm_dsubreg_0 is 5; arm_dsubreg_1 is 6)
119 def SubReg_i8_reg : SDNodeXForm<imm, [{
120 return CurDAG->getTargetConstant(5 + N->getZExtValue() / 8, MVT::i32);
121 }]>;
122 def SubReg_i16_reg : SDNodeXForm<imm, [{
123 return CurDAG->getTargetConstant(5 + N->getZExtValue() / 4, MVT::i32);
124 }]>;
125 def SubReg_i32_reg : SDNodeXForm<imm, [{
126 return CurDAG->getTargetConstant(5 + N->getZExtValue() / 2, MVT::i32);
127 }]>;
128 def SubReg_f64_reg : SDNodeXForm<imm, [{
129 return CurDAG->getTargetConstant(5 + N->getZExtValue(), MVT::i32);
130 }]>;
131
132 // Translate lane numbers from Q registers to D subregs.
133 def SubReg_i8_lane : SDNodeXForm<imm, [{
134 return CurDAG->getTargetConstant(N->getZExtValue() & 7, MVT::i32);
135 }]>;
136 def SubReg_i16_lane : SDNodeXForm<imm, [{
137 return CurDAG->getTargetConstant(N->getZExtValue() & 3, MVT::i32);
138 }]>;
139 def SubReg_i32_lane : SDNodeXForm<imm, [{
140 return CurDAG->getTargetConstant(N->getZExtValue() & 1, MVT::i32);
141 }]>;
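A worked example of these transforms: a Q register aliases a pair of D registers, and per the comment above arm_dsubreg_0 is sub-register index 5 and arm_dsubreg_1 is 6. So for a v16i8 value in a Q register, lane 11 lives in the second D register (index 5 + 11/8 = 6) at lane 11 & 7 = 3 within it. A small stand-alone check of that arithmetic (illustrative only):

#include <cassert>

int main() {
  // v16i8 lane 11: which D sub-register, and which lane within it?
  unsigned Lane = 11;
  unsigned SubRegIdx = 5 + Lane / 8; // SubReg_i8_reg
  unsigned SubLane = Lane & 7;       // SubReg_i8_lane
  assert(SubRegIdx == 6 && SubLane == 3);

  // v8i16 lane 5: second D register (index 6), lane 1 within it.
  assert(5 + 5 / 4 == 6 && (5 & 3) == 1);
  return 0;
}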
142
143 //===----------------------------------------------------------------------===//
144 // Instruction Classes
145 //===----------------------------------------------------------------------===//
146
147 // Basic 2-register operations, both double- and quad-register.
148 class N2VD op24_23, bits<2> op21_20, bits<2> op19_18,
149 bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
150 ValueType ResTy, ValueType OpTy, SDNode OpNode>
151 : N2V
152 (ins DPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
153 [(set DPR:$dst, (ResTy (OpNode (OpTy DPR:$src))))]>;
154 class N2VQ op24_23, bits<2> op21_20, bits<2> op19_18,
155 bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
156 ValueType ResTy, ValueType OpTy, SDNode OpNode>
157 : N2V
158 (ins QPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
159 [(set QPR:$dst, (ResTy (OpNode (OpTy QPR:$src))))]>;
160
161 // Basic 2-register intrinsics, both double- and quad-register.
162 class N2VDInt op24_23, bits<2> op21_20, bits<2> op19_18,
163 bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
164 ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
165 : N2V
166 (ins DPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
167 [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src))))]>;
168 class N2VQInt op24_23, bits<2> op21_20, bits<2> op19_18,
169 bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
170 ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
171 : N2V
172 (ins QPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
173 [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src))))]>;
174
175 // Narrow 2-register intrinsics.
176 class N2VNInt op24_23, bits<2> op21_20, bits<2> op19_18,
177 bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
178 string OpcodeStr, ValueType TyD, ValueType TyQ, Intrinsic IntOp>
179 : N2V
180 (ins QPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
181 [(set DPR:$dst, (TyD (IntOp (TyQ QPR:$src))))]>;
182
183 // Long 2-register intrinsics. (This is currently only used for VMOVL and is
184 // derived from N2VImm instead of N2V because of the way the size is encoded.)
185 class N2VLInt op21_16, bits<4> op11_8, bit op7,
186 bit op6, bit op4, string OpcodeStr, ValueType TyQ, ValueType TyD,
187 Intrinsic IntOp>
188 : N2VImm
189 (ins DPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
190 [(set QPR:$dst, (TyQ (IntOp (TyD DPR:$src))))]>;
191
192 // Basic 3-register operations, both double- and quad-register.
193 class N3VD op21_20, bits<4> op11_8, bit op4,
194 string OpcodeStr, ValueType ResTy, ValueType OpTy,
195 SDNode OpNode, bit Commutable>
196 : N3V
197 (outs DPR:$dst), (ins DPR:$src1, DPR:$src2),
198 !strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "",
199 [(set DPR:$dst, (ResTy (OpNode (OpTy DPR:$src1), (OpTy DPR:$src2))))]> {
200 let isCommutable = Commutable;
201 }
202 class N3VQ op21_20, bits<4> op11_8, bit op4,
203 string OpcodeStr, ValueType ResTy, ValueType OpTy,
204 SDNode OpNode, bit Commutable>
205 : N3V
206 (outs QPR:$dst), (ins QPR:$src1, QPR:$src2),
207 !strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "",
208 [(set QPR:$dst, (ResTy (OpNode (OpTy QPR:$src1), (OpTy QPR:$src2))))]> {
209 let isCommutable = Commutable;
210 }
211
212 // Basic 3-register intrinsics, both double- and quad-register.
213 class N3VDInt op21_20, bits<4> op11_8, bit op4,
214 string OpcodeStr, ValueType ResTy, ValueType OpTy,
215 Intrinsic IntOp, bit Commutable>
216 : N3V
217 (outs DPR:$dst), (ins DPR:$src1, DPR:$src2),
218 !strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "",
219 [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src1), (OpTy DPR:$src2))))]> {
220 let isCommutable = Commutable;
221 }
222 class N3VQInt op21_20, bits<4> op11_8, bit op4,
223 string OpcodeStr, ValueType ResTy, ValueType OpTy,
224 Intrinsic IntOp, bit Commutable>
225 : N3V
226 (outs QPR:$dst), (ins QPR:$src1, QPR:$src2),
227 !strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "",
228 [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src1), (OpTy QPR:$src2))))]> {
229 let isCommutable = Commutable;
230 }
231
232 // Multiply-Add/Sub operations, both double- and quad-register.
233 class N3VDMulOp op21_20, bits<4> op11_8, bit op4,
234 string OpcodeStr, ValueType Ty, SDNode MulOp, SDNode OpNode>
235 : N3V
236 (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3),
237 !strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst",
238 [(set DPR:$dst, (Ty (OpNode DPR:$src1,
239 (Ty (MulOp DPR:$src2, DPR:$src3)))))]>;
240 class N3VQMulOp op21_20, bits<4> op11_8, bit op4,
241 string OpcodeStr, ValueType Ty, SDNode MulOp, SDNode OpNode>
242 : N3V
243 (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3),
244 !strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst",
245 [(set QPR:$dst, (Ty (OpNode QPR:$src1,
246 (Ty (MulOp QPR:$src2, QPR:$src3)))))]>;
247
248 // Neon 3-argument intrinsics, both double- and quad-register.
249 // The destination register is also used as the first source operand register.
250 class N3VDInt3 op21_20, bits<4> op11_8, bit op4,
251 string OpcodeStr, ValueType ResTy, ValueType OpTy,
252 Intrinsic IntOp>
253 : N3V
254 (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3),
255 !strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst",
256 [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src1),
257 (OpTy DPR:$src2), (OpTy DPR:$src3))))]>;
258 class N3VQInt3 op21_20, bits<4> op11_8, bit op4,
259 string OpcodeStr, ValueType ResTy, ValueType OpTy,
260 Intrinsic IntOp>
261 : N3V
262 (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3),
263 !strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst",
264 [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src1),
265 (OpTy QPR:$src2), (OpTy QPR:$src3))))]>;
266
267 // Neon Long 3-argument intrinsic. The destination register is
268 // a quad-register and is also used as the first source operand register.
269 class N3VLInt3 op21_20, bits<4> op11_8, bit op4,
270 string OpcodeStr, ValueType TyQ, ValueType TyD, Intrinsic IntOp>
271 : N3V
272 (outs QPR:$dst), (ins QPR:$src1, DPR:$src2, DPR:$src3),
273 !strconcat(OpcodeStr, "\t$dst, $src2, $src3"), "$src1 = $dst",
274 [(set QPR:$dst,
275 (TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$src2), (TyD DPR:$src3))))]>;
276
277 // Narrowing 3-register intrinsics.
278 class N3VNInt op21_20, bits<4> op11_8, bit op4,
279 string OpcodeStr, ValueType TyD, ValueType TyQ,
280 Intrinsic IntOp, bit Commutable>
281 : N3V
282 (outs DPR:$dst), (ins QPR:$src1, QPR:$src2),
283 !strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "",
284 [(set DPR:$dst, (TyD (IntOp (TyQ QPR:$src1), (TyQ QPR:$src2))))]> {
285 let isCommutable = Commutable;
286 }
287
288 // Long 3-register intrinsics.
289 class N3VLInt op21_20, bits<4> op11_8, bit op4,
290 string OpcodeStr, ValueType TyQ, ValueType TyD,
291 Intrinsic IntOp, bit Commutable>
292 : N3V
293 (outs QPR:$dst), (ins DPR:$src1, DPR:$src2),
294 !strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "",
295 [(set QPR:$dst, (TyQ (IntOp (TyD DPR:$src1), (TyD DPR:$src2))))]> {
296 let isCommutable = Commutable;
297 }
298
299 // Wide 3-register intrinsics.
300 class N3VWInt op21_20, bits<4> op11_8, bit op4,
301 string OpcodeStr, ValueType TyQ, ValueType TyD,
302 Intrinsic IntOp, bit Commutable>
303 : N3V
304 (outs QPR:$dst), (ins QPR:$src1, DPR:$src2),
305 !strconcat(OpcodeStr, "\t$dst, $src1, $src2"), "",
306 [(set QPR:$dst, (TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$src2))))]> {
307 let isCommutable = Commutable;
308 }
309
310 // Pairwise long 2-register intrinsics, both double- and quad-register.
311 class N2VDPLInt op24_23, bits<2> op21_20, bits<2> op19_18,
312 bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
313 ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
314 : N2V
315 (ins DPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
316 [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src))))]>;
317 class N2VQPLInt op24_23, bits<2> op21_20, bits<2> op19_18,
318 bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
319 ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
320 : N2V
321 (ins QPR:$src), !strconcat(OpcodeStr, "\t$dst, $src"), "",
322 [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src))))]>;
323
324 // Pairwise long 2-register accumulate intrinsics,
325 // both double- and quad-register.
326 // The destination register is also used as the first source operand register.
327 class N2VDPLInt2 op24_23, bits<2> op21_20, bits<2> op19_18,
328 bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
329 ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
330 : N2V
331 (outs DPR:$dst), (ins DPR:$src1, DPR:$src2),
332 !strconcat(OpcodeStr, "\t$dst, $src2"), "$src1 = $dst",
333 [(set DPR:$dst, (ResTy (IntOp (ResTy DPR:$src1), (OpTy DPR:$src2))))]>;
334 class N2VQPLInt2 op24_23, bits<2> op21_20, bits<2> op19_18,
335 bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
336 ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
337 : N2V
338 (outs QPR:$dst), (ins QPR:$src1, QPR:$src2),
339 !strconcat(OpcodeStr, "\t$dst, $src2"), "$src1 = $dst",
340 [(set QPR:$dst, (ResTy (IntOp (ResTy QPR:$src1), (OpTy QPR:$src2))))]>;
341
342 // Shift by immediate,
343 // both double- and quad-register.
344 class N2VDSh op21_16, bits<4> op11_8, bit op7,
345 bit op4, string OpcodeStr, ValueType Ty, SDNode OpNode>
346 : N2VImm
347 (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM),
348 !strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "",
349 [(set DPR:$dst, (Ty (OpNode (Ty DPR:$src), (i32 imm:$SIMM))))]>;
350 class N2VQSh op21_16, bits<4> op11_8, bit op7,
351 bit op4, string OpcodeStr, ValueType Ty, SDNode OpNode>
352 : N2VImm
353 (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM),
354 !strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "",
355 [(set QPR:$dst, (Ty (OpNode (Ty QPR:$src), (i32 imm:$SIMM))))]>;
356
357 // Long shift by immediate.
358 class N2VLSh op21_16, bits<4> op11_8, bit op7,
359 bit op6, bit op4, string OpcodeStr, ValueType ResTy,
360 ValueType OpTy, SDNode OpNode>
361 : N2VImm
362 (outs QPR:$dst), (ins DPR:$src, i32imm:$SIMM),
363 !strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "",
364 [(set QPR:$dst, (ResTy (OpNode (OpTy DPR:$src),
365 (i32 imm:$SIMM))))]>;
366
367 // Narrow shift by immediate.
368 class N2VNSh op21_16, bits<4> op11_8, bit op7,
369 bit op6, bit op4, string OpcodeStr, ValueType ResTy,
370 ValueType OpTy, SDNode OpNode>
371 : N2VImm
372 (outs DPR:$dst), (ins QPR:$src, i32imm:$SIMM),
373 !strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "",
374 [(set DPR:$dst, (ResTy (OpNode (OpTy QPR:$src),
375 (i32 imm:$SIMM))))]>;
376
377 // Shift right by immediate and accumulate,
378 // both double- and quad-register.
379 class N2VDShAdd op21_16, bits<4> op11_8, bit op7,
380 bit op4, string OpcodeStr, ValueType Ty, SDNode ShOp>
381 : N2VImm
382 (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, i32imm:$SIMM),
383 !strconcat(OpcodeStr, "\t$dst, $src2, $SIMM"), "$src1 = $dst",
384 [(set DPR:$dst, (Ty (add DPR:$src1,
385 (Ty (ShOp DPR:$src2, (i32 imm:$SIMM))))))]>;
386 class N2VQShAdd op21_16, bits<4> op11_8, bit op7,
387 bit op4, string OpcodeStr, ValueType Ty, SDNode ShOp>
388 : N2VImm
389 (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, i32imm:$SIMM),
390 !strconcat(OpcodeStr, "\t$dst, $src2, $SIMM"), "$src1 = $dst",
391 [(set QPR:$dst, (Ty (add QPR:$src1,
392 (Ty (ShOp QPR:$src2, (i32 imm:$SIMM))))))]>;
393
394 // Shift by immediate and insert,
395 // both double- and quad-register.
396 class N2VDShIns op21_16, bits<4> op11_8, bit op7,
397 bit op4, string OpcodeStr, ValueType Ty, SDNode ShOp>
398 : N2VImm
399 (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, i32imm:$SIMM),
400 !strconcat(OpcodeStr, "\t$dst, $src2, $SIMM"), "$src1 = $dst",
401 [(set DPR:$dst, (Ty (ShOp DPR:$src1, DPR:$src2, (i32 imm:$SIMM))))]>;
402 class N2VQShIns op21_16, bits<4> op11_8, bit op7,
403 bit op4, string OpcodeStr, ValueType Ty, SDNode ShOp>
404 : N2VImm
405 (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, i32imm:$SIMM),
406 !strconcat(OpcodeStr, "\t$dst, $src2, $SIMM"), "$src1 = $dst",
407 [(set QPR:$dst, (Ty (ShOp QPR:$src1, QPR:$src2, (i32 imm:$SIMM))))]>;
408
409 // Convert, with fractional bits immediate,
410 // both double- and quad-register.
411 class N2VCvtD op21_16, bits<4> op11_8, bit op7,
412 bit op4, string OpcodeStr, ValueType ResTy, ValueType OpTy,
413 Intrinsic IntOp>
414 : N2VImm
415 (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM),
416 !strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "",
417 [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src), (i32 imm:$SIMM))))]>;
418 class N2VCvtQ op21_16, bits<4> op11_8, bit op7,
419 bit op4, string OpcodeStr, ValueType ResTy, ValueType OpTy,
420 Intrinsic IntOp>
421 : N2VImm
422 (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM),
423 !strconcat(OpcodeStr, "\t$dst, $src, $SIMM"), "",
424 [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src), (i32 imm:$SIMM))))]>;
425
426 //===----------------------------------------------------------------------===//
427 // Multiclasses
428 //===----------------------------------------------------------------------===//
429
430 // Neon 3-register vector operations.
431
432 // First with only element sizes of 8, 16 and 32 bits:
433 multiclass N3V_QHS op11_8, bit op4,
434 string OpcodeStr, SDNode OpNode, bit Commutable = 0> {
435 // 64-bit vector types.
436 def v8i8 : N3VD
437 v8i8, v8i8, OpNode, Commutable>;
438 def v4i16 : N3VD
439 v4i16, v4i16, OpNode, Commutable>;
440 def v2i32 : N3VD
441 v2i32, v2i32, OpNode, Commutable>;
442
443 // 128-bit vector types.
444 def v16i8 : N3VQ
445 v16i8, v16i8, OpNode, Commutable>;
446 def v8i16 : N3VQ
447 v8i16, v8i16, OpNode, Commutable>;
448 def v4i32 : N3VQ
449 v4i32, v4i32, OpNode, Commutable>;
450 }
451
452 // ....then also with element size 64 bits:
453 multiclass N3V_QHSD op11_8, bit op4,
454 string OpcodeStr, SDNode OpNode, bit Commutable = 0>
455 : N3V_QHS {
456 def v1i64 : N3VD
457 v1i64, v1i64, OpNode, Commutable>;
458 def v2i64 : N3VQ
459 v2i64, v2i64, OpNode, Commutable>;
460 }
461
462
463 // Neon Narrowing 2-register vector intrinsics,
464 // source operand element sizes of 16, 32 and 64 bits:
465 multiclass N2VNInt_HSD op24_23, bits<2> op21_20, bits<2> op17_16,
466 bits<5> op11_7, bit op6, bit op4, string OpcodeStr,
467 Intrinsic IntOp> {
468 def v8i8 : N2VNInt
469 !strconcat(OpcodeStr, "16"), v8i8, v8i16, IntOp>;
470 def v4i16 : N2VNInt
471 !strconcat(OpcodeStr, "32"), v4i16, v4i32, IntOp>;
472 def v2i32 : N2VNInt
473 !strconcat(OpcodeStr, "64"), v2i32, v2i64, IntOp>;
474 }
475
476
477 // Neon Lengthening 2-register vector intrinsic (currently specific to VMOVL).
478 // source operand element sizes of 8, 16 and 32 bits:
479 multiclass N2VLInt_QHS op11_8, bit op7, bit op6,
480 bit op4, string OpcodeStr, Intrinsic IntOp> {
481 def v8i16 : N2VLInt
482 !strconcat(OpcodeStr, "8"), v8i16, v8i8, IntOp>;
483 def v4i32 : N2VLInt
484 !strconcat(OpcodeStr, "16"), v4i32, v4i16, IntOp>;
485 def v2i64 : N2VLInt
486 !strconcat(OpcodeStr, "32"), v2i64, v2i32, IntOp>;
487 }
488
489
490 // Neon 3-register vector intrinsics.
491
492 // First with only element sizes of 16 and 32 bits:
493 multiclass N3VInt_HS op11_8, bit op4,
494 string OpcodeStr, Intrinsic IntOp, bit Commutable = 0> {
495 // 64-bit vector types.
496 def v4i16 : N3VDInt
497 v4i16, v4i16, IntOp, Commutable>;
498 def v2i32 : N3VDInt
499 v2i32, v2i32, IntOp, Commutable>;
500
501 // 128-bit vector types.
502 def v8i16 : N3VQInt
503 v8i16, v8i16, IntOp, Commutable>;
504 def v4i32 : N3VQInt
505 v4i32, v4i32, IntOp, Commutable>;
506 }
507
508 // ....then also with element size of 8 bits:
509 multiclass N3VInt_QHS op11_8, bit op4,
510 string OpcodeStr, Intrinsic IntOp, bit Commutable = 0>
511 : N3VInt_HS {
512 def v8i8 : N3VDInt
513 v8i8, v8i8, IntOp, Commutable>;
514 def v16i8 : N3VQInt
515 v16i8, v16i8, IntOp, Commutable>;
516 }
517
518 // ....then also with element size of 64 bits:
519 multiclass N3VInt_QHSD op11_8, bit op4,
520 string OpcodeStr, Intrinsic IntOp, bit Commutable = 0>
521 : N3VInt_QHS {
522 def v1i64 : N3VDInt
523 v1i64, v1i64, IntOp, Commutable>;
524 def v2i64 : N3VQInt
525 v2i64, v2i64, IntOp, Commutable>;
526 }
527
528
529 // Neon Narrowing 3-register vector intrinsics,
530 // source operand element sizes of 16, 32 and 64 bits:
531 multiclass N3VNInt_HSD op11_8, bit op4,
532 string OpcodeStr, Intrinsic IntOp, bit Commutable = 0> {
533 def v8i8 : N3VNInt
534 v8i8, v8i16, IntOp, Commutable>;
535 def v4i16 : N3VNInt
536 v4i16, v4i32, IntOp, Commutable>;
537 def v2i32 : N3VNInt
538 v2i32, v2i64, IntOp, Commutable>;
539 }
540
541
542 // Neon Long 3-register vector intrinsics.
543
544 // First with only element sizes of 16 and 32 bits:
545 multiclass N3VLInt_HS op11_8, bit op4,
546 string OpcodeStr, Intrinsic IntOp, bit Commutable = 0> {
547 def v4i32 : N3VLInt
548 v4i32, v4i16, IntOp, Commutable>;
549 def v2i64 : N3VLInt
550 v2i64, v2i32, IntOp, Commutable>;
551 }
552
553 // ....then also with element size of 8 bits:
554 multiclass N3VLInt_QHS op11_8, bit op4,
555 string OpcodeStr, Intrinsic IntOp, bit Commutable = 0>
556 : N3VLInt_HS {
557 def v8i16 : N3VLInt
558 v8i16, v8i8, IntOp, Commutable>;
559 }
560
561
562 // Neon Wide 3-register vector intrinsics,
563 // source operand element sizes of 8, 16 and 32 bits:
564 multiclass N3VWInt_QHS op11_8, bit op4,
565 string OpcodeStr, Intrinsic IntOp, bit Commutable = 0> {
566 def v8i16 : N3VWInt
567 v8i16, v8i8, IntOp, Commutable>;
568 def v4i32 : N3VWInt
569 v4i32, v4i16, IntOp, Commutable>;
570 def v2i64 : N3VWInt
571 v2i64, v2i32, IntOp, Commutable>;
572 }
573
574
575 // Neon Multiply-Op vector operations,
576 // element sizes of 8, 16 and 32 bits:
577 multiclass N3VMulOp_QHS op11_8, bit op4,
578 string OpcodeStr, SDNode OpNode> {
579 // 64-bit vector types.
580 def v8i8 : N3VDMulOp
581 !strconcat(OpcodeStr, "8"), v8i8, mul, OpNode>;
582 def v4i16 : N3VDMulOp
583 !strconcat(OpcodeStr, "16"), v4i16, mul, OpNode>;
584 def v2i32 : N3VDMulOp
585 !strconcat(OpcodeStr, "32"), v2i32, mul, OpNode>;
586
587 // 128-bit vector types.
588 def v16i8 : N3VQMulOp
589 !strconcat(OpcodeStr, "8"), v16i8, mul, OpNode>;
590 def v8i16 : N3VQMulOp
591 !strconcat(OpcodeStr, "16"), v8i16, mul, OpNode>;
592 def v4i32 : N3VQMulOp
593 !strconcat(OpcodeStr, "32"), v4i32, mul, OpNode>;
594 }
595
596
597 // Neon 3-argument intrinsics,
598 // element sizes of 8, 16 and 32 bits:
599 multiclass N3VInt3_QHS op11_8, bit op4,
600 string OpcodeStr, Intrinsic IntOp> {
601 // 64-bit vector types.
602 def v8i8 : N3VDInt3
603 !strconcat(OpcodeStr, "8"), v8i8, v8i8, IntOp>;
604 def v4i16 : N3VDInt3
605 !strconcat(OpcodeStr, "16"), v4i16, v4i16, IntOp>;
606 def v2i32 : N3VDInt3
607 !strconcat(OpcodeStr, "32"), v2i32, v2i32, IntOp>;
608
609 // 128-bit vector types.
610 def v16i8 : N3VQInt3
611 !strconcat(OpcodeStr, "8"), v16i8, v16i8, IntOp>;
612 def v8i16 : N3VQInt3
613 !strconcat(OpcodeStr, "16"), v8i16, v8i16, IntOp>;
614 def v4i32 : N3VQInt3
615 !strconcat(OpcodeStr, "32"), v4i32, v4i32, IntOp>;
616 }
617
618
619 // Neon Long 3-argument intrinsics.
620
621 // First with only element sizes of 16 and 32 bits:
622 multiclass N3VLInt3_HS op11_8, bit op4,
623 string OpcodeStr, Intrinsic IntOp> {
624 def v4i32 : N3VLInt3
625 !strconcat(OpcodeStr, "16"), v4i32, v4i16, IntOp>;
626 def v2i64 : N3VLInt3
627 !strconcat(OpcodeStr, "32"), v2i64, v2i32, IntOp>;
628 }
629
630 // ....then also with element size of 8 bits:
631 multiclass N3VLInt3_QHS op11_8, bit op4,
632 string OpcodeStr, Intrinsic IntOp>
633 : N3VLInt3_HS {
634 def v8i16 : N3VLInt3
635 !strconcat(OpcodeStr, "8"), v8i16, v8i8, IntOp>;
636 }
637
638
639 // Neon 2-register vector intrinsics,
640 // element sizes of 8, 16 and 32 bits:
641 multiclass N2VInt_QHS op24_23, bits<2> op21_20, bits<2> op17_16,
642 bits<5> op11_7, bit op4, string OpcodeStr,
643 Intrinsic IntOp> {
644 // 64-bit vector types.
645 def v8i8 : N2VDInt
646 !strconcat(OpcodeStr, "8"), v8i8, v8i8, IntOp>;
647 def v4i16 : N2VDInt
648 !strconcat(OpcodeStr, "16"), v4i16, v4i16, IntOp>;
649 def v2i32 : N2VDInt
650 !strconcat(OpcodeStr, "32"), v2i32, v2i32, IntOp>;
651
652 // 128-bit vector types.
653 def v16i8 : N2VQInt
654 !strconcat(OpcodeStr, "8"), v16i8, v16i8, IntOp>;
655 def v8i16 : N2VQInt
656 !strconcat(OpcodeStr, "16"), v8i16, v8i16, IntOp>;
657 def v4i32 : N2VQInt
658 !strconcat(OpcodeStr, "32"), v4i32, v4i32, IntOp>;
659 }
660
661
662 // Neon Pairwise long 2-register intrinsics,
663 // element sizes of 8, 16 and 32 bits:
664 multiclass N2VPLInt_QHS op24_23, bits<2> op21_20, bits<2> op17_16,
665 bits<5> op11_7, bit op4,
666 string OpcodeStr, Intrinsic IntOp> {
667 // 64-bit vector types.
668 def v8i8 : N2VDPLInt
669 !strconcat(OpcodeStr, "8"), v4i16, v8i8, IntOp>;
670 def v4i16 : N2VDPLInt
671 !strconcat(OpcodeStr, "16"), v2i32, v4i16, IntOp>;
672 def v2i32 : N2VDPLInt
673 !strconcat(OpcodeStr, "32"), v1i64, v2i32, IntOp>;
674
675 // 128-bit vector types.
676 def v16i8 : N2VQPLInt
677 !strconcat(OpcodeStr, "8"), v8i16, v16i8, IntOp>;
678 def v8i16 : N2VQPLInt
679 !strconcat(OpcodeStr, "16"), v4i32, v8i16, IntOp>;
680 def v4i32 : N2VQPLInt
681 !strconcat(OpcodeStr, "32"), v2i64, v4i32, IntOp>;
682 }
683
684
685 // Neon Pairwise long 2-register accumulate intrinsics,
686 // element sizes of 8, 16 and 32 bits:
687 multiclass N2VPLInt2_QHS op24_23, bits<2> op21_20, bits<2> op17_16,
688 bits<5> op11_7, bit op4,
689 string OpcodeStr, Intrinsic IntOp> {
690 // 64-bit vector types.
691 def v8i8 : N2VDPLInt2
692 !strconcat(OpcodeStr, "8"), v4i16, v8i8, IntOp>;
693 def v4i16 : N2VDPLInt2
694 !strconcat(OpcodeStr, "16"), v2i32, v4i16, IntOp>;
695 def v2i32 : N2VDPLInt2
696 !strconcat(OpcodeStr, "32"), v1i64, v2i32, IntOp>;
697
698 // 128-bit vector types.
699 def v16i8 : N2VQPLInt2
700 !strconcat(OpcodeStr, "8"), v8i16, v16i8, IntOp>;
701 def v8i16 : N2VQPLInt2
702 !strconcat(OpcodeStr, "16"), v4i32, v8i16, IntOp>;
703 def v4i32 : N2VQPLInt2
704 !strconcat(OpcodeStr, "32"), v2i64, v4i32, IntOp>;
705 }
706
707
708 // Neon 2-register vector shift by immediate,
709 // element sizes of 8, 16, 32 and 64 bits:
710 multiclass N2VSh_QHSD op11_8, bit op4,
711 string OpcodeStr, SDNode OpNode> {
712 // 64-bit vector types.
713 def v8i8 : N2VDSh
714 !strconcat(OpcodeStr, "8"), v8i8, OpNode>;
715 def v4i16 : N2VDSh
716 !strconcat(OpcodeStr, "16"), v4i16, OpNode>;
717 def v2i32 : N2VDSh
718 !strconcat(OpcodeStr, "32"), v2i32, OpNode>;
719 def v1i64 : N2VDSh
720 !strconcat(OpcodeStr, "64"), v1i64, OpNode>;
721
722 // 128-bit vector types.
723 def v16i8 : N2VQSh
724 !strconcat(OpcodeStr, "8"), v16i8, OpNode>;
725 def v8i16 : N2VQSh
726 !strconcat(OpcodeStr, "16"), v8i16, OpNode>;
727 def v4i32 : N2VQSh
728 !strconcat(OpcodeStr, "32"), v4i32, OpNode>;
729 def v2i64 : N2VQSh
730 !strconcat(OpcodeStr, "64"), v2i64, OpNode>;
731 }
732
733
734 // Neon Shift-Accumulate vector operations,
735 // element sizes of 8, 16, 32 and 64 bits:
736 multiclass N2VShAdd_QHSD op11_8, bit op4,
737 string OpcodeStr, SDNode ShOp> {
738 // 64-bit vector types.
739 def v8i8 : N2VDShAdd
740 !strconcat(OpcodeStr, "8"), v8i8, ShOp>;
741 def v4i16 : N2VDShAdd
742 !strconcat(OpcodeStr, "16"), v4i16, ShOp>;
743 def v2i32 : N2VDShAdd
744 !strconcat(OpcodeStr, "32"), v2i32, ShOp>;
745 def v1i64 : N2VDShAdd
746 !strconcat(OpcodeStr, "64"), v1i64, ShOp>;
747
748 // 128-bit vector types.
749 def v16i8 : N2VQShAdd
750 !strconcat(OpcodeStr, "8"), v16i8, ShOp>;
751 def v8i16 : N2VQShAdd
752 !strconcat(OpcodeStr, "16"), v8i16, ShOp>;
753 def v4i32 : N2VQShAdd
754 !strconcat(OpcodeStr, "32"), v4i32, ShOp>;
755 def v2i64 : N2VQShAdd
756 !strconcat(OpcodeStr, "64"), v2i64, ShOp>;
757 }
758
759
760 // Neon Shift-Insert vector operations,
761 // element sizes of 8, 16, 32 and 64 bits:
762 multiclass N2VShIns_QHSD op11_8, bit op4,
763 string OpcodeStr, SDNode ShOp> {
764 // 64-bit vector types.
765 def v8i8 : N2VDShIns
766 !strconcat(OpcodeStr, "8"), v8i8, ShOp>;
767 def v4i16 : N2VDShIns
768 !strconcat(OpcodeStr, "16"), v4i16, ShOp>;
769 def v2i32 : N2VDShIns
770 !strconcat(OpcodeStr, "32"), v2i32, ShOp>;
771 def v1i64 : N2VDShIns
772 !strconcat(OpcodeStr, "64"), v1i64, ShOp>;
773
774 // 128-bit vector types.
775 def v16i8 : N2VQShIns
776 !strconcat(OpcodeStr, "8"), v16i8, ShOp>;
777 def v8i16 : N2VQShIns
778 !strconcat(OpcodeStr, "16"), v8i16, ShOp>;
779 def v4i32 : N2VQShIns
780 !strconcat(OpcodeStr, "32"), v4i32, ShOp>;
781 def v2i64 : N2VQShIns
782 !strconcat(OpcodeStr, "64"), v2i64, ShOp>;
783 }
784
785 //===----------------------------------------------------------------------===//
786 // Instruction Definitions.
787 //===----------------------------------------------------------------------===//
788
789 // Vector Add Operations.
790
791 // VADD : Vector Add (integer and floating-point)
792 defm VADD : N3V_QHSD<0, 0, 0b1000, 0, "vadd.i", add, 1>;
793 def VADDfd : N3VD<0, 0, 0b00, 0b1101, 0, "vadd.f32", v2f32, v2f32, fadd, 1>;
794 def VADDfq : N3VQ<0, 0, 0b00, 0b1101, 0, "vadd.f32", v4f32, v4f32, fadd, 1>;
795 // VADDL : Vector Add Long (Q = D + D)
796 defm VADDLs : N3VLInt_QHS<0,1,0b0000,0, "vaddl.s", int_arm_neon_vaddls, 1>;
797 defm VADDLu : N3VLInt_QHS<1,1,0b0000,0, "vaddl.u", int_arm_neon_vaddlu, 1>;
798 // VADDW : Vector Add Wide (Q = Q + D)
799 defm VADDWs : N3VWInt_QHS<0,1,0b0001,0, "vaddw.s", int_arm_neon_vaddws, 0>;
800 defm VADDWu : N3VWInt_QHS<1,1,0b0001,0, "vaddw.u", int_arm_neon_vaddwu, 0>;
801 // VHADD : Vector Halving Add
802 defm VHADDs : N3VInt_QHS<0,0,0b0000,0, "vhadd.s", int_arm_neon_vhadds, 1>;
803 defm VHADDu : N3VInt_QHS<1,0,0b0000,0, "vhadd.u", int_arm_neon_vhaddu, 1>;
804 // VRHADD : Vector Rounding Halving Add
805 defm VRHADDs : N3VInt_QHS<0,0,0b0001,0, "vrhadd.s", int_arm_neon_vrhadds, 1>;
806 defm VRHADDu : N3VInt_QHS<1,0,0b0001,0, "vrhadd.u", int_arm_neon_vrhaddu, 1>;
807 // VQADD : Vector Saturating Add
808 defm VQADDs : N3VInt_QHSD<0,0,0b0000,1, "vqadd.s", int_arm_neon_vqadds, 1>;
809 defm VQADDu : N3VInt_QHSD<1,0,0b0000,1, "vqadd.u", int_arm_neon_vqaddu, 1>;
810 // VADDHN : Vector Add and Narrow Returning High Half (D = Q + Q)
811 defm VADDHN : N3VNInt_HSD<0,1,0b0100,0, "vaddhn.i", int_arm_neon_vaddhn, 1>;
812 // VRADDHN : Vector Rounding Add and Narrow Returning High Half (D = Q + Q)
813 defm VRADDHN : N3VNInt_HSD<1,1,0b0100,0, "vraddhn.i", int_arm_neon_vraddhn, 1>;
814
815 // Vector Multiply Operations.
816
817 // VMUL : Vector Multiply (integer, polynomial and floating-point)
818 defm VMUL : N3V_QHS<0, 0, 0b1001, 1, "vmul.i", mul, 1>;
819 def VMULpd : N3VDInt<1, 0, 0b00, 0b1001, 1, "vmul.p8", v8i8, v8i8,
820 int_arm_neon_vmulp, 1>;
821 def VMULpq : N3VQInt<1, 0, 0b00, 0b1001, 1, "vmul.p8", v16i8, v16i8,
822 int_arm_neon_vmulp, 1>;
823 def VMULfd : N3VD<1, 0, 0b00, 0b1101, 1, "vmul.f32", v2f32, v2f32, fmul, 1>;
824 def VMULfq : N3VQ<1, 0, 0b00, 0b1101, 1, "vmul.f32", v4f32, v4f32, fmul, 1>;
825 // VQDMULH : Vector Saturating Doubling Multiply Returning High Half
826 defm VQDMULH : N3VInt_HS<0,0,0b1011,0, "vqdmulh.s", int_arm_neon_vqdmulh, 1>;
827 // VQRDMULH : Vector Rounding Saturating Doubling Multiply Returning High Half
828 defm VQRDMULH : N3VInt_HS<1,0,0b1011,0, "vqrdmulh.s", int_arm_neon_vqrdmulh, 1>;
829 // VMULL : Vector Multiply Long (integer and polynomial) (Q = D * D)
830 defm VMULLs : N3VLInt_QHS<0,1,0b1100,0, "vmull.s", int_arm_neon_vmulls, 1>;
831 defm VMULLu : N3VLInt_QHS<1,1,0b1100,0, "vmull.u", int_arm_neon_vmullu, 1>;
832 def VMULLp : N3VLInt<0, 1, 0b00, 0b1110, 0, "vmull.p8", v8i16, v8i8,
833 int_arm_neon_vmullp, 1>;
834 // VQDMULL : Vector Saturating Doubling Multiply Long (Q = D * D)
835 defm VQDMULL : N3VLInt_HS<0,1,0b1101,0, "vqdmull.s", int_arm_neon_vqdmull, 1>;
836
837 // Vector Multiply-Accumulate and Multiply-Subtract Operations.
838
839 // VMLA : Vector Multiply Accumulate (integer and floating-point)
840 defm VMLA : N3VMulOp_QHS<0, 0, 0b1001, 0, "vmla.i", add>;
841 def VMLAfd : N3VDMulOp<0, 0, 0b00, 0b1101, 1, "vmla.f32", v2f32, fmul, fadd>;
842 def VMLAfq : N3VQMulOp<0, 0, 0b00, 0b1101, 1, "vmla.f32", v4f32, fmul, fadd>;
843 // VMLAL : Vector Multiply Accumulate Long (Q += D * D)
844 defm VMLALs : N3VLInt3_QHS<0,1,0b1000,0, "vmlal.s", int_arm_neon_vmlals>;
845 defm VMLALu : N3VLInt3_QHS<1,1,0b1000,0, "vmlal.u", int_arm_neon_vmlalu>;
846 // VQDMLAL : Vector Saturating Doubling Multiply Accumulate Long (Q += D * D)
847 defm VQDMLAL : N3VLInt3_HS<0, 1, 0b1001, 0, "vqdmlal.s", int_arm_neon_vqdmlal>;
848 // VMLS : Vector Multiply Subtract (integer and floating-point)
849 defm VMLS : N3VMulOp_QHS<0, 0, 0b1001, 0, "vmls.i", sub>;
850 def VMLSfd : N3VDMulOp<0, 0, 0b10, 0b1101, 1, "vmls.f32", v2f32, fmul, fsub>;
851 def VMLSfq : N3VQMulOp<0, 0, 0b10, 0b1101, 1, "vmls.f32", v4f32, fmul, fsub>;
852 // VMLSL : Vector Multiply Subtract Long (Q -= D * D)
853 defm VMLSLs : N3VLInt3_QHS<0,1,0b1010,0, "vmlsl.s", int_arm_neon_vmlsls>;
854 defm VMLSLu : N3VLInt3_QHS<1,1,0b1010,0, "vmlsl.u", int_arm_neon_vmlslu>;
855 // VQDMLSL : Vector Saturating Doubling Multiply Subtract Long (Q -= D * D)
856 defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, "vqdmlsl.s", int_arm_neon_vqdmlsl>;
857
858 // Vector Subtract Operations.
859
860 // VSUB : Vector Subtract (integer and floating-point)
861 defm VSUB : N3V_QHSD<1, 0, 0b1000, 0, "vsub.i", sub, 0>;
862 def VSUBfd : N3VD<0, 0, 0b10, 0b1101, 0, "vsub.f32", v2f32, v2f32, fsub, 0>;
863 def VSUBfq : N3VQ<0, 0, 0b10, 0b1101, 0, "vsub.f32", v4f32, v4f32, fsub, 0>;
864 // VSUBL : Vector Subtract Long (Q = D - D)
865 defm VSUBLs : N3VLInt_QHS<0,1,0b0010,0, "vsubl.s", int_arm_neon_vsubls, 1>;
866 defm VSUBLu : N3VLInt_QHS<1,1,0b0010,0, "vsubl.u", int_arm_neon_vsublu, 1>;
867 // VSUBW : Vector Subtract Wide (Q = Q - D)
868 defm VSUBWs : N3VWInt_QHS<0,1,0b0011,0, "vsubw.s", int_arm_neon_vsubws, 0>;
869 defm VSUBWu : N3VWInt_QHS<1,1,0b0011,0, "vsubw.u", int_arm_neon_vsubwu, 0>;
870 // VHSUB : Vector Halving Subtract
871 defm VHSUBs : N3VInt_QHS<0, 0, 0b0010, 0, "vhsub.s", int_arm_neon_vhsubs, 0>;
872 defm VHSUBu : N3VInt_QHS<1, 0, 0b0010, 0, "vhsub.u", int_arm_neon_vhsubu, 0>;
873 // VQSUB : Vector Saturating Subtract
874 defm VQSUBs : N3VInt_QHSD<0, 0, 0b0010, 1, "vqsub.s", int_arm_neon_vqsubs, 0>;
875 defm VQSUBu : N3VInt_QHSD<1, 0, 0b0010, 1, "vqsub.u", int_arm_neon_vqsubu, 0>;
876 // VSUBHN : Vector Subtract and Narrow Returning High Half (D = Q - Q)
877 defm VSUBHN : N3VNInt_HSD<0,1,0b0110,0, "vsubhn.i", int_arm_neon_vsubhn, 0>;
878 // VRSUBHN : Vector Rounding Subtract and Narrow Returning High Half (D=Q-Q)
879 defm VRSUBHN : N3VNInt_HSD<1,1,0b0110,0, "vrsubhn.i", int_arm_neon_vrsubhn, 0>;
880
881 // Vector Comparisons.
882
883 // VCEQ : Vector Compare Equal
884 defm VCEQ : N3V_QHS<1, 0, 0b1000, 1, "vceq.i", NEONvceq, 1>;
885 def VCEQfd : N3VD<0,0,0b00,0b1110,0, "vceq.f32", v2i32, v2f32, NEONvceq, 1>;
886 def VCEQfq : N3VQ<0,0,0b00,0b1110,0, "vceq.f32", v4i32, v4f32, NEONvceq, 1>;
887 // VCGE : Vector Compare Greater Than or Equal
888 defm VCGEs : N3V_QHS<0, 0, 0b0011, 1, "vcge.s", NEONvcge, 0>;
889 defm VCGEu : N3V_QHS<1, 0, 0b0011, 1, "vcge.u", NEONvcgeu, 0>;
890 def VCGEfd : N3VD<1,0,0b00,0b1110,0, "vcge.f32", v2i32, v2f32, NEONvcge, 0>;
891 def VCGEfq : N3VQ<1,0,0b00,0b1110,0, "vcge.f32", v4i32, v4f32, NEONvcge, 0>;
892 // VCGT : Vector Compare Greater Than
893 defm VCGTs : N3V_QHS<0, 0, 0b0011, 0, "vcgt.s", NEONvcgt, 0>;
894 defm VCGTu : N3V_QHS<1, 0, 0b0011, 0, "vcgt.u", NEONvcgtu, 0>;
895 def VCGTfd : N3VD<1,0,0b10,0b1110,0, "vcgt.f32", v2i32, v2f32, NEONvcgt, 0>;
896 def VCGTfq : N3VQ<1,0,0b10,0b1110,0, "vcgt.f32", v4i32, v4f32, NEONvcgt, 0>;
897 // VACGE : Vector Absolute Compare Greater Than or Equal (aka VCAGE)
898 def VACGEd : N3VDInt<1, 0, 0b00, 0b1110, 1, "vacge.f32", v2i32, v2f32,
899 int_arm_neon_vacged, 0>;
900 def VACGEq : N3VQInt<1, 0, 0b00, 0b1110, 1, "vacge.f32", v4i32, v4f32,
901 int_arm_neon_vacgeq, 0>;
902 // VACGT : Vector Absolute Compare Greater Than (aka VCAGT)
903 def VACGTd : N3VDInt<1, 0, 0b10, 0b1110, 1, "vacgt.f32", v2i32, v2f32,
904 int_arm_neon_vacgtd, 0>;
905 def VACGTq : N3VQInt<1, 0, 0b10, 0b1110, 1, "vacgt.f32", v4i32, v4f32,
906 int_arm_neon_vacgtq, 0>;
907 // VTST : Vector Test Bits
908 defm VTST : N3V_QHS<0, 0, 0b1000, 1, "vtst.i", NEONvtst, 1>;
909
910 // Vector Bitwise Operations.
911
912 // VAND : Vector Bitwise AND
913 def VANDd : N3VD<0, 0, 0b00, 0b0001, 1, "vand", v2i32, v2i32, and, 1>;
914 def VANDq : N3VQ<0, 0, 0b00, 0b0001, 1, "vand", v4i32, v4i32, and, 1>;
915
916 // VEOR : Vector Bitwise Exclusive OR
917 def VEORd : N3VD<1, 0, 0b00, 0b0001, 1, "veor", v2i32, v2i32, xor, 1>;
918 def VEORq : N3VQ<1, 0, 0b00, 0b0001, 1, "veor", v4i32, v4i32, xor, 1>;
919
920 // VORR : Vector Bitwise OR
921 def VORRd : N3VD<0, 0, 0b10, 0b0001, 1, "vorr", v2i32, v2i32, or, 1>;
922 def VORRq : N3VQ<0, 0, 0b10, 0b0001, 1, "vorr", v4i32, v4i32, or, 1>;
923
924 // VBIC : Vector Bitwise Bit Clear (AND NOT)
925 def VBICd : N3V<0, 0, 0b01, 0b0001, 0, 1, (outs DPR:$dst),
926 (ins DPR:$src1, DPR:$src2), "vbic\t$dst, $src1, $src2", "",
927 [(set DPR:$dst, (v2i32 (and DPR:$src1,(vnot DPR:$src2))))]>;
928 def VBICq : N3V<0, 0, 0b01, 0b0001, 1, 1, (outs QPR:$dst),
929 (ins QPR:$src1, QPR:$src2), "vbic\t$dst, $src1, $src2", "",
930 [(set QPR:$dst, (v4i32 (and QPR:$src1,(vnot QPR:$src2))))]>;
931
932 // VORN : Vector Bitwise OR NOT
933 def VORNd : N3V<0, 0, 0b11, 0b0001, 0, 1, (outs DPR:$dst),
934 (ins DPR:$src1, DPR:$src2), "vorn\t$dst, $src1, $src2", "",
935 [(set DPR:$dst, (v2i32 (or DPR:$src1, (vnot DPR:$src2))))]>;
936 def VORNq : N3V<0, 0, 0b11, 0b0001, 1, 1, (outs QPR:$dst),
937 (ins QPR:$src1, QPR:$src2), "vorn\t$dst, $src1, $src2", "",
938 [(set QPR:$dst, (v4i32 (or QPR:$src1, (vnot QPR:$src2))))]>;
939
940 // VMVN : Vector Bitwise NOT
941 def VMVNd : N2V<0b11, 0b11, 0b00, 0b00, 0b01011, 0, 0,
942 (outs DPR:$dst), (ins DPR:$src), "vmvn\t$dst, $src", "",
943 [(set DPR:$dst, (v2i32 (vnot DPR:$src)))]>;
944 def VMVNq : N2V<0b11, 0b11, 0b00, 0b00, 0b01011, 1, 0,
945 (outs QPR:$dst), (ins QPR:$src), "vmvn\t$dst, $src", "",
946 [(set QPR:$dst, (v4i32 (vnot QPR:$src)))]>;
947 def : Pat<(v2i32 (vnot_conv DPR:$src)), (VMVNd DPR:$src)>;
948 def : Pat<(v4i32 (vnot_conv QPR:$src)), (VMVNq QPR:$src)>;
949
950 // VBSL : Vector Bitwise Select
951 def VBSLd : N3V<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$dst),
952 (ins DPR:$src1, DPR:$src2, DPR:$src3),
953 "vbsl\t$dst, $src2, $src3", "$src1 = $dst",
954 [(set DPR:$dst,
955 (v2i32 (or (and DPR:$src2, DPR:$src1),
956 (and DPR:$src3, (vnot DPR:$src1)))))]>;
957 def VBSLq : N3V<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$dst),
958 (ins QPR:$src1, QPR:$src2, QPR:$src3),
959 "vbsl\t$dst, $src2, $src3", "$src1 = $dst",
960 [(set QPR:$dst,
961 (v4i32 (or (and QPR:$src2, QPR:$src1),
962 (and QPR:$src3, (vnot QPR:$src1)))))]>;
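For reference, the VBSL pattern above computes dst = (src2 & src1) | (src3 & ~src1) for every bit, with $src1 acting as the select mask. A minimal scalar C sketch of that formula (one 32-bit lane; the names are illustrative, not taken from this patch):

    #include <stdint.h>

    /* Scalar model of the VBSL pattern above: each result bit comes from
       src2 where the mask bit is 1 and from src3 where it is 0. */
    static uint32_t bsl_lane(uint32_t mask, uint32_t src2, uint32_t src3) {
      return (src2 & mask) | (src3 & ~mask);
    }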
963
964 // VBIF : Vector Bitwise Insert if False
965 // like VBSL but with: "vbif\t$dst, $src3, $src1", "$src2 = $dst",
966 // VBIT : Vector Bitwise Insert if True
967 // like VBSL but with: "vbit\t$dst, $src2, $src1", "$src3 = $dst",
968 // These are not yet implemented. The TwoAddress pass will not go looking
969 // for equivalent operations with different register constraints; it just
970 // inserts copies.
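Going by the operand bindings in the comments above, VBIT and VBIF would compute the same select as VBSL with the accumulator tied to a different input. A hedged scalar sketch, derived only by renaming the VBSL operands (d = tied destination, n = inserted source, m = mask):

    #include <stdint.h>

    /* vbit d, n, m : take n where m is 1, keep d where m is 0
       (VBSL with d playing the $src3 role). */
    static uint32_t bit_lane(uint32_t d, uint32_t n, uint32_t m) {
      return (n & m) | (d & ~m);
    }

    /* vbif d, n, m : take n where m is 0, keep d where m is 1
       (VBSL with d playing the $src2 role). */
    static uint32_t bif_lane(uint32_t d, uint32_t n, uint32_t m) {
      return (n & ~m) | (d & m);
    }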
971
972 // Vector Absolute Differences.
973
974 // VABD : Vector Absolute Difference
975 defm VABDs : N3VInt_QHS<0, 0, 0b0111, 0, "vabd.s", int_arm_neon_vabds, 0>;
976 defm VABDu : N3VInt_QHS<1, 0, 0b0111, 0, "vabd.u", int_arm_neon_vabdu, 0>;
977 def VABDfd : N3VDInt<1, 0, 0b10, 0b1101, 0, "vabd.f32", v2f32, v2f32,
978 int_arm_neon_vabdf, 0>;
979 def VABDfq : N3VQInt<1, 0, 0b10, 0b1101, 0, "vabd.f32", v4f32, v4f32,
980 int_arm_neon_vabdf, 0>;
981
982 // VABDL : Vector Absolute Difference Long (Q = | D - D |)
983 defm VABDLs : N3VLInt_QHS<0,1,0b0111,0, "vabdl.s", int_arm_neon_vabdls, 0>;
984 defm VABDLu : N3VLInt_QHS<1,1,0b0111,0, "vabdl.u", int_arm_neon_vabdlu, 0>;
985
986 // VABA : Vector Absolute Difference and Accumulate
987 defm VABAs : N3VInt3_QHS<0,1,0b0101,0, "vaba.s", int_arm_neon_vabas>;
988 defm VABAu : N3VInt3_QHS<1,1,0b0101,0, "vaba.u", int_arm_neon_vabau>;
989
990 // VABAL : Vector Absolute Difference and Accumulate Long (Q += | D - D |)
991 defm VABALs : N3VLInt3_QHS<0,1,0b0101,0, "vabal.s", int_arm_neon_vabals>;
992 defm VABALu : N3VLInt3_QHS<1,1,0b0101,0, "vabal.u", int_arm_neon_vabalu>;
993
994 // Vector Maximum and Minimum.
995
996 // VMAX : Vector Maximum
997 defm VMAXs : N3VInt_QHS<0, 0, 0b0110, 0, "vmax.s", int_arm_neon_vmaxs, 1>;
998 defm VMAXu : N3VInt_QHS<1, 0, 0b0110, 0, "vmax.u", int_arm_neon_vmaxu, 1>;
999 def VMAXfd : N3VDInt<0, 0, 0b00, 0b1111, 0, "vmax.f32", v2f32, v2f32,
1000 int_arm_neon_vmaxf, 1>;
1001 def VMAXfq : N3VQInt<0, 0, 0b00, 0b1111, 0, "vmax.f32", v4f32, v4f32,
1002 int_arm_neon_vmaxf, 1>;
1003
1004 // VMIN : Vector Minimum
1005 defm VMINs : N3VInt_QHS<0, 0, 0b0110, 1, "vmin.s", int_arm_neon_vmins, 1>;
1006 defm VMINu : N3VInt_QHS<1, 0, 0b0110, 1, "vmin.u", int_arm_neon_vminu, 1>;
1007 def VMINfd : N3VDInt<0, 0, 0b10, 0b1111, 0, "vmin.f32", v2f32, v2f32,
1008 int_arm_neon_vminf, 1>;
1009 def VMINfq : N3VQInt<0, 0, 0b10, 0b1111, 0, "vmin.f32", v4f32, v4f32,
1010 int_arm_neon_vminf, 1>;
1011
1012 // Vector Pairwise Operations.
1013
1014 // VPADD : Vector Pairwise Add
1015 def VPADDi8 : N3VDInt<0, 0, 0b00, 0b1011, 1, "vpadd.i8", v8i8, v8i8,
1016 int_arm_neon_vpaddi, 0>;
1017 def VPADDi16 : N3VDInt<0, 0, 0b01, 0b1011, 1, "vpadd.i16", v4i16, v4i16,
1018 int_arm_neon_vpaddi, 0>;
1019 def VPADDi32 : N3VDInt<0, 0, 0b10, 0b1011, 1, "vpadd.i32", v2i32, v2i32,
1020 int_arm_neon_vpaddi, 0>;
1021 def VPADDf : N3VDInt<1, 0, 0b00, 0b1101, 0, "vpadd.f32", v2f32, v2f32,
1022 int_arm_neon_vpaddf, 0>;
1023
1024 // VPADDL : Vector Pairwise Add Long
1025 defm VPADDLs : N2VPLInt_QHS<0b11, 0b11, 0b00, 0b00100, 0, "vpaddl.s",
1026 int_arm_neon_vpaddls>;
1027 defm VPADDLu : N2VPLInt_QHS<0b11, 0b11, 0b00, 0b00101, 0, "vpaddl.u",
1028 int_arm_neon_vpaddlu>;
1029
1030 // VPADAL : Vector Pairwise Add and Accumulate Long
1031 defm VPADALs : N2VPLInt2_QHS<0b11, 0b11, 0b00, 0b00100, 0, "vpadal.s",
1032 int_arm_neon_vpadals>;
1033 defm VPADALu : N2VPLInt2_QHS<0b11, 0b11, 0b00, 0b00101, 0, "vpadal.u",
1034 int_arm_neon_vpadalu>;
1035
1036 // VPMAX : Vector Pairwise Maximum
1037 def VPMAXs8 : N3VDInt<0, 0, 0b00, 0b1010, 0, "vpmax.s8", v8i8, v8i8,
1038 int_arm_neon_vpmaxs, 0>;
1039 def VPMAXs16 : N3VDInt<0, 0, 0b01, 0b1010, 0, "vpmax.s16", v4i16, v4i16,
1040 int_arm_neon_vpmaxs, 0>;
1041 def VPMAXs32 : N3VDInt<0, 0, 0b10, 0b1010, 0, "vpmax.s32", v2i32, v2i32,
1042 int_arm_neon_vpmaxs, 0>;
1043 def VPMAXu8 : N3VDInt<1, 0, 0b00, 0b1010, 0, "vpmax.u8", v8i8, v8i8,
1044 int_arm_neon_vpmaxu, 0>;
1045 def VPMAXu16 : N3VDInt<1, 0, 0b01, 0b1010, 0, "vpmax.u16", v4i16, v4i16,
1046 int_arm_neon_vpmaxu, 0>;
1047 def VPMAXu32 : N3VDInt<1, 0, 0b10, 0b1010, 0, "vpmax.u32", v2i32, v2i32,
1048 int_arm_neon_vpmaxu, 0>;
1049 def VPMAXf : N3VDInt<1, 0, 0b00, 0b1111, 0, "vpmax.f32", v2f32, v2f32,
1050 int_arm_neon_vpmaxf, 0>;
1051
1052 // VPMIN : Vector Pairwise Minimum
1053 def VPMINs8 : N3VDInt<0, 0, 0b00, 0b1010, 1, "vpmin.s8", v8i8, v8i8,
1054 int_arm_neon_vpmins, 0>;
1055 def VPMINs16 : N3VDInt<0, 0, 0b01, 0b1010, 1, "vpmin.s16", v4i16, v4i16,
1056 int_arm_neon_vpmins, 0>;
1057 def VPMINs32 : N3VDInt<0, 0, 0b10, 0b1010, 1, "vpmin.s32", v2i32, v2i32,
1058 int_arm_neon_vpmins, 0>;
1059 def VPMINu8 : N3VDInt<1, 0, 0b00, 0b1010, 1, "vpmin.u8", v8i8, v8i8,
1060 int_arm_neon_vpminu, 0>;
1061 def VPMINu16 : N3VDInt<1, 0, 0b01, 0b1010, 1, "vpmin.u16", v4i16, v4i16,
1062 int_arm_neon_vpminu, 0>;
1063 def VPMINu32 : N3VDInt<1, 0, 0b10, 0b1010, 1, "vpmin.u32", v2i32, v2i32,
1064 int_arm_neon_vpminu, 0>;
1065 def VPMINf : N3VDInt<1, 0, 0b10, 0b1111, 0, "vpmin.f32", v2f32, v2f32,
1066 int_arm_neon_vpminf, 0>;
1067
1068 // Vector Reciprocal and Reciprocal Square Root Estimate and Step.
1069
1070 // VRECPE : Vector Reciprocal Estimate
1071 def VRECPEd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0, "vrecpe.u32",
1072 v2i32, v2i32, int_arm_neon_vrecpe>;
1073 def VRECPEq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0, "vrecpe.u32",
1074 v4i32, v4i32, int_arm_neon_vrecpe>;
1075 def VRECPEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0, "vrecpe.f32",
1076 v2f32, v2f32, int_arm_neon_vrecpef>;
1077 def VRECPEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0, "vrecpe.f32",
1078 v4f32, v4f32, int_arm_neon_vrecpef>;
1079
1080 // VRECPS : Vector Reciprocal Step
1081 def VRECPSfd : N3VDInt<0, 0, 0b00, 0b1111, 1, "vrecps.f32", v2f32, v2f32,
1082 int_arm_neon_vrecps, 1>;
1083 def VRECPSfq : N3VQInt<0, 0, 0b00, 0b1111, 1, "vrecps.f32", v4f32, v4f32,
1084 int_arm_neon_vrecps, 1>;
1085
1086 // VRSQRTE : Vector Reciprocal Square Root Estimate
1087 def VRSQRTEd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0, "vrsqrte.u32",
1088 v2i32, v2i32, int_arm_neon_vrsqrte>;
1089 def VRSQRTEq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0, "vrsqrte.u32",
1090 v4i32, v4i32, int_arm_neon_vrsqrte>;
1091 def VRSQRTEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0, "vrsqrte.f32",
1092 v2f32, v2f32, int_arm_neon_vrsqrtef>;
1093 def VRSQRTEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0, "vrsqrte.f32",
1094 v4f32, v4f32, int_arm_neon_vrsqrtef>;
1095
1096 // VRSQRTS : Vector Reciprocal Square Root Step
1097 def VRSQRTSfd : N3VDInt<0, 0, 0b10, 0b1111, 1, "vrsqrts.f32", v2f32, v2f32,
1098 int_arm_neon_vrsqrts, 1>;
1099 def VRSQRTSfq : N3VQInt<0, 0, 0b10, 0b1111, 1, "vrsqrts.f32", v4f32, v4f32,
1100 int_arm_neon_vrsqrts, 1>;
1101
1102 // Vector Shifts.
1103
1104 // VSHL : Vector Shift
1105 defm VSHLs : N3VInt_QHSD<0, 0, 0b0100, 0, "vshl.s", int_arm_neon_vshifts, 0>;
1106 defm VSHLu : N3VInt_QHSD<1, 0, 0b0100, 0, "vshl.u", int_arm_neon_vshiftu, 0>;
1107 // VSHL : Vector Shift Left (Immediate)
1108 defm VSHLi : N2VSh_QHSD<0, 1, 0b0111, 1, "vshl.i", NEONvshl>;
1109 // VSHR : Vector Shift Right (Immediate)
1110 defm VSHRs : N2VSh_QHSD<0, 1, 0b0000, 1, "vshr.s", NEONvshrs>;
1111 defm VSHRu : N2VSh_QHSD<1, 1, 0b0000, 1, "vshr.u", NEONvshru>;
1112
1113 // VSHLL : Vector Shift Left Long
1114 def VSHLLs8 : N2VLSh<0, 1, 0b001000, 0b1010, 0, 0, 1, "vshll.s8",
1115 v8i16, v8i8, NEONvshlls>;
1116 def VSHLLs16 : N2VLSh<0, 1, 0b010000, 0b1010, 0, 0, 1, "vshll.s16",
1117 v4i32, v4i16, NEONvshlls>;
1118 def VSHLLs32 : N2VLSh<0, 1, 0b100000, 0b1010, 0, 0, 1, "vshll.s32",
1119 v2i64, v2i32, NEONvshlls>;
1120 def VSHLLu8 : N2VLSh<1, 1, 0b001000, 0b1010, 0, 0, 1, "vshll.u8",
1121 v8i16, v8i8, NEONvshllu>;
1122 def VSHLLu16 : N2VLSh<1, 1, 0b010000, 0b1010, 0, 0, 1, "vshll.u16",
1123 v4i32, v4i16, NEONvshllu>;
1124 def VSHLLu32 : N2VLSh<1, 1, 0b100000, 0b1010, 0, 0, 1, "vshll.u32",
1125 v2i64, v2i32, NEONvshllu>;
1126
1127 // VSHLL : Vector Shift Left Long (with maximum shift count)
1128 def VSHLLi8 : N2VLSh<1, 1, 0b110010, 0b0011, 0, 0, 0, "vshll.i8",
1129 v8i16, v8i8, NEONvshlli>;
1130 def VSHLLi16 : N2VLSh<1, 1, 0b110110, 0b0011, 0, 0, 0, "vshll.i16",
1131 v4i32, v4i16, NEONvshlli>;
1132 def VSHLLi32 : N2VLSh<1, 1, 0b111010, 0b0011, 0, 0, 0, "vshll.i32",
1133 v2i64, v2i32, NEONvshlli>;
1134
1135 // VSHRN : Vector Shift Right and Narrow
1136 def VSHRN16 : N2VNSh<0, 1, 0b001000, 0b1000, 0, 0, 1, "vshrn.i16",
1137 v8i8, v8i16, NEONvshrn>;
1138 def VSHRN32 : N2VNSh<0, 1, 0b010000, 0b1000, 0, 0, 1, "vshrn.i32",
1139 v4i16, v4i32, NEONvshrn>;
1140 def VSHRN64 : N2VNSh<0, 1, 0b100000, 0b1000, 0, 0, 1, "vshrn.i64",
1141 v2i32, v2i64, NEONvshrn>;
1142
1143 // VRSHL : Vector Rounding Shift
1144 defm VRSHLs : N3VInt_QHSD<0,0,0b0101,0, "vrshl.s", int_arm_neon_vrshifts, 0>;
1145 defm VRSHLu : N3VInt_QHSD<1,0,0b0101,0, "vrshl.u", int_arm_neon_vrshiftu, 0>;
1146 // VRSHR : Vector Rounding Shift Right
1147 defm VRSHRs : N2VSh_QHSD<0, 1, 0b0010, 1, "vrshr.s", NEONvrshrs>;
1148 defm VRSHRu : N2VSh_QHSD<1, 1, 0b0010, 1, "vrshr.u", NEONvrshru>;
1149
1150 // VRSHRN : Vector Rounding Shift Right and Narrow
1151 def VRSHRN16 : N2VNSh<0, 1, 0b001000, 0b1000, 0, 1, 1, "vrshrn.i16",
1152 v8i8, v8i16, NEONvrshrn>;
1153 def VRSHRN32 : N2VNSh<0, 1, 0b010000, 0b1000, 0, 1, 1, "vrshrn.i32",
1154 v4i16, v4i32, NEONvrshrn>;
1155 def VRSHRN64 : N2VNSh<0, 1, 0b100000, 0b1000, 0, 1, 1, "vrshrn.i64",
1156 v2i32, v2i64, NEONvrshrn>;
1157
1158 // VQSHL : Vector Saturating Shift
1159 defm VQSHLs : N3VInt_QHSD<0,0,0b0100,1, "vqshl.s", int_arm_neon_vqshifts, 0>;
1160 defm VQSHLu : N3VInt_QHSD<1,0,0b0100,1, "vqshl.u", int_arm_neon_vqshiftu, 0>;
1161 // VQSHL : Vector Saturating Shift Left (Immediate)
1162 defm VQSHLsi : N2VSh_QHSD<0, 1, 0b0111, 1, "vqshl.s", NEONvqshls>;
1163 defm VQSHLui : N2VSh_QHSD<1, 1, 0b0111, 1, "vqshl.u", NEONvqshlu>;
1164 // VQSHLU : Vector Saturating Shift Left (Immediate, Unsigned)
1165 defm VQSHLsu : N2VSh_QHSD<1, 1, 0b0110, 1, "vqshlu.s", NEONvqshlsu>;
1166
1167 // VQSHRN : Vector Saturating Shift Right and Narrow
1168 def VQSHRNs16 : N2VNSh<0, 1, 0b001000, 0b1001, 0, 0, 1, "vqshrn.s16",
1169 v8i8, v8i16, NEONvqshrns>;
1170 def VQSHRNs32 : N2VNSh<0, 1, 0b010000, 0b1001, 0, 0, 1, "vqshrn.s32",
1171 v4i16, v4i32, NEONvqshrns>;
1172 def VQSHRNs64 : N2VNSh<0, 1, 0b100000, 0b1001, 0, 0, 1, "vqshrn.s64",
1173 v2i32, v2i64, NEONvqshrns>;
1174 def VQSHRNu16 : N2VNSh<1, 1, 0b001000, 0b1001, 0, 0, 1, "vqshrn.u16",
1175 v8i8, v8i16, NEONvqshrnu>;
1176 def VQSHRNu32 : N2VNSh<1, 1, 0b010000, 0b1001, 0, 0, 1, "vqshrn.u32",
1177 v4i16, v4i32, NEONvqshrnu>;
1178 def VQSHRNu64 : N2VNSh<1, 1, 0b100000, 0b1001, 0, 0, 1, "vqshrn.u64",
1179 v2i32, v2i64, NEONvqshrnu>;
1180
1181 // VQSHRUN : Vector Saturating Shift Right and Narrow (Unsigned)
1182 def VQSHRUN16 : N2VNSh<1, 1, 0b001000, 0b1000, 0, 0, 1, "vqshrun.s16",
1183 v8i8, v8i16, NEONvqshrnsu>;
1184 def VQSHRUN32 : N2VNSh<1, 1, 0b010000, 0b1000, 0, 0, 1, "vqshrun.s32",
1185 v4i16, v4i32, NEONvqshrnsu>;
1186 def VQSHRUN64 : N2VNSh<1, 1, 0b100000, 0b1000, 0, 0, 1, "vqshrun.s64",
1187 v2i32, v2i64, NEONvqshrnsu>;
1188
1189 // VQRSHL : Vector Saturating Rounding Shift
1190 defm VQRSHLs : N3VInt_QHSD<0, 0, 0b0101, 1, "vqrshl.s",
1191 int_arm_neon_vqrshifts, 0>;
1192 defm VQRSHLu : N3VInt_QHSD<1, 0, 0b0101, 1, "vqrshl.u",
1193 int_arm_neon_vqrshiftu, 0>;
1194
1195 // VQRSHRN : Vector Saturating Rounding Shift Right and Narrow
1196 def VQRSHRNs16: N2VNSh<0, 1, 0b001000, 0b1001, 0, 1, 1, "vqrshrn.s16",
1197 v8i8, v8i16, NEONvqrshrns>;
1198 def VQRSHRNs32: N2VNSh<0, 1, 0b010000, 0b1001, 0, 1, 1, "vqrshrn.s32",
1199 v4i16, v4i32, NEONvqrshrns>;
1200 def VQRSHRNs64: N2VNSh<0, 1, 0b100000, 0b1001, 0, 1, 1, "vqrshrn.s64",
1201 v2i32, v2i64, NEONvqrshrns>;
1202 def VQRSHRNu16: N2VNSh<1, 1, 0b001000, 0b1001, 0, 1, 1, "vqrshrn.u16",
1203 v8i8, v8i16, NEONvqrshrnu>;
1204 def VQRSHRNu32: N2VNSh<1, 1, 0b010000, 0b1001, 0, 1, 1, "vqrshrn.u32",
1205 v4i16, v4i32, NEONvqrshrnu>;
1206 def VQRSHRNu64: N2VNSh<1, 1, 0b100000, 0b1001, 0, 1, 1, "vqrshrn.u64",
1207 v2i32, v2i64, NEONvqrshrnu>;
1208
1209 // VQRSHRUN : Vector Saturating Rounding Shift Right and Narrow (Unsigned)
1210 def VQRSHRUN16: N2VNSh<1, 1, 0b001000, 0b1000, 0, 1, 1, "vqrshrun.s16",
1211 v8i8, v8i16, NEONvqrshrnsu>;
1212 def VQRSHRUN32: N2VNSh<1, 1, 0b010000, 0b1000, 0, 1, 1, "vqrshrun.s32",
1213 v4i16, v4i32, NEONvqrshrnsu>;
1214 def VQRSHRUN64: N2VNSh<1, 1, 0b100000, 0b1000, 0, 1, 1, "vqrshrun.s64",
1215 v2i32, v2i64, NEONvqrshrnsu>;
1216
1217 // VSRA : Vector Shift Right and Accumulate
1218 defm VSRAs : N2VShAdd_QHSD<0, 1, 0b0001, 1, "vsra.s", NEONvshrs>;
1219 defm VSRAu : N2VShAdd_QHSD<1, 1, 0b0001, 1, "vsra.u", NEONvshru>;
1220 // VRSRA : Vector Rounding Shift Right and Accumulate
1221 defm VRSRAs : N2VShAdd_QHSD<0, 1, 0b0011, 1, "vrsra.s", NEONvrshrs>;
1222 defm VRSRAu : N2VShAdd_QHSD<1, 1, 0b0011, 1, "vrsra.u", NEONvrshru>;
1223
1224 // VSLI : Vector Shift Left and Insert
1225 defm VSLI : N2VShIns_QHSD<1, 1, 0b0101, 1, "vsli.", NEONvsli>;
1226 // VSRI : Vector Shift Right and Insert
1227 defm VSRI : N2VShIns_QHSD<1, 1, 0b0100, 1, "vsri.", NEONvsri>;
1228
1229 // Vector Absolute and Saturating Absolute.
1230
1231 // VABS : Vector Absolute Value
1232 defm VABS : N2VInt_QHS<0b11, 0b11, 0b01, 0b00110, 0, "vabs.s",
1233 int_arm_neon_vabs>;
1234 def VABSfd : N2VDInt<0b11, 0b11, 0b10, 0b01, 0b01110, 0, "vabs.f32",
1235 v2f32, v2f32, int_arm_neon_vabsf>;
1236 def VABSfq : N2VQInt<0b11, 0b11, 0b10, 0b01, 0b01110, 0, "vabs.f32",
1237 v4f32, v4f32, int_arm_neon_vabsf>;
1238
1239 // VQABS : Vector Saturating Absolute Value
1240 defm VQABS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01110, 0, "vqabs.s",
1241 int_arm_neon_vqabs>;
1242
1243 // Vector Negate.
1244
1245 def vneg : PatFrag<(ops node:$in), (sub immAllZerosV, node:$in)>;
1246 def vneg_conv : PatFrag<(ops node:$in), (sub immAllZerosV_bc, node:$in)>;
1247
1248 class VNEGD<bits<2> size, string OpcodeStr, ValueType Ty>
1249 : N2V<0b11, 0b11, size, 0b01, 0b00111, 0, 0, (outs DPR:$dst), (ins DPR:$src),
1250 !strconcat(OpcodeStr, "\t$dst, $src"), "",
1251 [(set DPR:$dst, (Ty (vneg DPR:$src)))]>;
1252 class VNEGQ<bits<2> size, string OpcodeStr, ValueType Ty>
1253 : N2V<0b11, 0b11, size, 0b01, 0b00111, 1, 0, (outs QPR:$dst), (ins QPR:$src),
1254 !strconcat(OpcodeStr, "\t$dst, $src"), "",
1255 [(set QPR:$dst, (Ty (vneg QPR:$src)))]>;
1256
1257 // VNEG : Vector Negate
1258 def VNEGs8d : VNEGD<0b00, "vneg.s8", v8i8>;
1259 def VNEGs16d : VNEGD<0b01, "vneg.s16", v4i16>;
1260 def VNEGs32d : VNEGD<0b10, "vneg.s32", v2i32>;
1261 def VNEGs8q : VNEGQ<0b00, "vneg.s8", v16i8>;
1262 def VNEGs16q : VNEGQ<0b01, "vneg.s16", v8i16>;
1263 def VNEGs32q : VNEGQ<0b10, "vneg.s32", v4i32>;
1264
1265 // VNEG : Vector Negate (floating-point)
1266 def VNEGf32d : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 0, 0,
1267 (outs DPR:$dst), (ins DPR:$src), "vneg.f32\t$dst, $src", "",
1268 [(set DPR:$dst, (v2f32 (fneg DPR:$src)))]>;
1269 def VNEGf32q : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 1, 0,
1270 (outs QPR:$dst), (ins QPR:$src), "vneg.f32\t$dst, $src", "",
1271 [(set QPR:$dst, (v4f32 (fneg QPR:$src)))]>;
1272
1273 def : Pat<(v8i8 (vneg_conv DPR:$src)), (VNEGs8d DPR:$src)>;
1274 def : Pat<(v4i16 (vneg_conv DPR:$src)), (VNEGs16d DPR:$src)>;
1275 def : Pat<(v2i32 (vneg_conv DPR:$src)), (VNEGs32d DPR:$src)>;
1276 def : Pat<(v16i8 (vneg_conv QPR:$src)), (VNEGs8q QPR:$src)>;
1277 def : Pat<(v8i16 (vneg_conv QPR:$src)), (VNEGs16q QPR:$src)>;
1278 def : Pat<(v4i32 (vneg_conv QPR:$src)), (VNEGs32q QPR:$src)>;
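The vneg PatFrag above matches integer negation expressed as a subtraction from an all-zeros vector, which is the form a source-level negation takes by the time it reaches instruction selection. A small illustrative C loop (scalar, only to show the equivalence):

    #include <stdint.h>

    /* Lane-wise negation of a v8i8-sized array; -x and (0 - x) are the
       same operation, matching the (sub immAllZerosV, x) fragment above. */
    static void neg8(int8_t dst[8], const int8_t src[8]) {
      for (int i = 0; i < 8; ++i)
        dst[i] = (int8_t)(0 - src[i]);
    }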
1279
1280 // VQNEG : Vector Saturating Negate
1281 defm VQNEG : N2VInt_QHS<0b11, 0b11, 0b00, 0b01111, 0, "vqneg.s",
1282 int_arm_neon_vqneg>;
1283
1284 // Vector Bit Counting Operations.
1285
1286 // VCLS : Vector Count Leading Sign Bits
1287 defm VCLS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01000, 0, "vcls.s",
1288 int_arm_neon_vcls>;
1289 // VCLZ : Vector Count Leading Zeros
1290 defm VCLZ : N2VInt_QHS<0b11, 0b11, 0b00, 0b01001, 0, "vclz.i",
1291 int_arm_neon_vclz>;
1292 // VCNT : Vector Count One Bits
1293 def VCNTd : N2VDInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0, "vcnt.8",
1294 v8i8, v8i8, int_arm_neon_vcnt>;
1295 def VCNTq : N2VQInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0, "vcnt.8",
1296 v16i8, v16i8, int_arm_neon_vcnt>;
1297
1298 // Vector Move Operations.
1299
1300 // VMOV : Vector Move (Register)
1301
1302 def VMOVD : N3V<0, 0, 0b10, 0b0001, 0, 1, (outs DPR:$dst), (ins DPR:$src),
1303 "vmov\t$dst, $src", "", []>;
1304 def VMOVQ : N3V<0, 0, 0b10, 0b0001, 1, 1, (outs QPR:$dst), (ins QPR:$src),
1305 "vmov\t$dst, $src", "", []>;
1306
1307 // VMOV : Vector Move (Immediate)
1308
1309 // VMOV_get_imm8 xform function: convert build_vector to VMOV.i8 imm.
1310 def VMOV_get_imm8 : SDNodeXForm<build_vector, [{
1311 return ARM::getVMOVImm(N, 1, *CurDAG);
1312 }]>;
1313 def vmovImm8 : PatLeaf<(build_vector), [{
1314 return ARM::getVMOVImm(N, 1, *CurDAG).getNode() != 0;
1315 }], VMOV_get_imm8>;
1316
1317 // VMOV_get_imm16 xform function: convert build_vector to VMOV.i16 imm.
1318 def VMOV_get_imm16 : SDNodeXForm<build_vector, [{
1319 return ARM::getVMOVImm(N, 2, *CurDAG);
1320 }]>;
1321 def vmovImm16 : PatLeaf<(build_vector), [{
1322 return ARM::getVMOVImm(N, 2, *CurDAG).getNode() != 0;
1323 }], VMOV_get_imm16>;
1324
1325 // VMOV_get_imm32 xform function: convert build_vector to VMOV.i32 imm.
1326 def VMOV_get_imm32 : SDNodeXForm<build_vector, [{
1327 return ARM::getVMOVImm(N, 4, *CurDAG);
1328 }]>;
1329 def vmovImm32 : PatLeaf<(build_vector), [{
1330 return ARM::getVMOVImm(N, 4, *CurDAG).getNode() != 0;
1331 }], VMOV_get_imm32>;
1332
1333 // VMOV_get_imm64 xform function: convert build_vector to VMOV.i64 imm.
1334 def VMOV_get_imm64 : SDNodeXForm<build_vector, [{
1335 return ARM::getVMOVImm(N, 8, *CurDAG);
1336 }]>;
1337 def vmovImm64 : PatLeaf<(build_vector), [{
1338 return ARM::getVMOVImm(N, 8, *CurDAG).getNode() != 0;
1339 }], VMOV_get_imm64>;
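The vmovImm* PatLeafs above accept a build_vector only when ARM::getVMOVImm can encode its elements as a VMOV immediate of the given element width (1, 2, 4 or 8 bytes). At the source level that corresponds to constant splats such as the following; this is only a hedged illustration of the kind of input involved, and whether it is vectorized is up to the rest of the compiler:

    #include <stdint.h>

    /* A constant byte splat; if vectorized, it becomes a build_vector of
       identical i8 elements, the shape vmovImm8 above checks for. */
    static void fill_with_0x55(uint8_t dst[8]) {
      for (int i = 0; i < 8; ++i)
        dst[i] = 0x55;
    }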
1340
1341 // Note: Some of the cmode bits in the following VMOV instructions need to
1342 // be encoded based on the immed values.
1343
1344 def VMOVv8i8 : N1ModImm<1, 0b000, 0b1110, 0, 0, 0, 1, (outs DPR:$dst),
1345 (ins i8imm:$SIMM), "vmov.i8\t$dst, $SIMM", "",
1346 [(set DPR:$dst, (v8i8 vmovImm8:$SIMM))]>;
1347 def VMOVv16i8 : N1ModImm<1, 0b000, 0b1110, 0, 1, 0, 1, (outs QPR:$dst),
1348 (ins i8imm:$SIMM), "vmov.i8\t$dst, $SIMM", "",