llvm.org GIT mirror llvm / c85bb78
ARM: add patterns for vqdmlal with separate vqdmull and vqadds The vqdmlal and vqdmlsl instructions are really just a fused pair consisting of a vqdmull.sN and a vqadd.sN (or a vqsub.sN in the case of vqdmlsl). This adds patterns to LLVM so that we can switch Clang's CodeGen over to generating these instead of the special vqdmlal intrinsics. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@189480 91177308-0d34-0410-b5e6-96231b3b80d8 Tim Northover 6 years ago
2 changed file(s) with 128 addition(s) and 0 deletion(s). Raw diff Collapse all Expand all
41424142 "vqdmlal", "s", int_arm_neon_vqdmlal>;
// Lane-operand ("sl") forms of vqdmlal, instantiated from N3VLInt3SL_HS and
// tied to the int_arm_neon_vqdmlal intrinsic.
41434143 defm VQDMLALsl: N3VLInt3SL_HS<0, 0b0011, "vqdmlal", "s", int_arm_neon_vqdmlal>;
41444144
// Select the fused VQDMLALv4i32 when a v4i32 vqadds takes, as its second
// operand, the vqdmull of two v4i16 D registers. $src1 (the first vqadds
// operand) becomes the instruction's accumulator input.
4145 def : Pat<(v4i32 (int_arm_neon_vqadds (v4i32 QPR:$src1),
4146 (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
4147 (v4i16 DPR:$Vm))))),
4148 (VQDMLALv4i32 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
// Select the fused VQDMLALv2i64 when a v2i64 vqadds takes, as its second
// operand, the vqdmull of two v2i32 D registers. $src1 (the first vqadds
// operand) becomes the instruction's accumulator input.
4149 def : Pat<(v2i64 (int_arm_neon_vqadds (v2i64 QPR:$src1),
4150 (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
4151 (v2i32 DPR:$Vm))))),
4152 (VQDMLALv2i64 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
// Lane form of the vqadds + vqdmull fusion for v4i16 inputs: the second
// vqdmull operand is a NEONvduplane of a DPR_8 register at $lane, so the
// pair is selected as VQDMLALslv4i16 with that register and lane index.
4153 def : Pat<(v4i32 (int_arm_neon_vqadds (v4i32 QPR:$src1),
4154 (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
4155 (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm),
4156 imm:$lane)))))),
4157 (VQDMLALslv4i16 QPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane)>;
// Lane form of the vqadds + vqdmull fusion for v2i32 inputs: the second
// vqdmull operand is a NEONvduplane of a DPR_VFP2 register at $lane, so the
// pair is selected as VQDMLALslv2i32 with that register and lane index.
4158 def : Pat<(v2i64 (int_arm_neon_vqadds (v2i64 QPR:$src1),
4159 (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
4160 (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm),
4161 imm:$lane)))))),
4162 (VQDMLALslv2i32 QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane)>;
4163
41454164 // VMLS : Vector Multiply Subtract (integer and floating-point)
// Integer forms: instantiated from N3VMulOp_QHS with the "vmls" mnemonic, the
// "i" type suffix and the generic `sub` node as the combining operation.
41464165 defm VMLS : N3VMulOp_QHS<1, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
41474166 IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>;
// vqdmlsl definitions tied to the int_arm_neon_vqdmlsl intrinsic: the
// register forms (N3VLInt3_HS) followed by the lane-operand "sl" forms
// (N3VLInt3SL_HS).
41984217 defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D,
41994218 "vqdmlsl", "s", int_arm_neon_vqdmlsl>;
42004219 defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b111, "vqdmlsl", "s", int_arm_neon_vqdmlsl>;
4220
// Select the fused VQDMLSLv4i32 when a v4i32 vqsubs subtracts, as its second
// operand, the vqdmull of two v4i16 D registers from $src1.
4221 def : Pat<(v4i32 (int_arm_neon_vqsubs (v4i32 QPR:$src1),
4222 (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
4223 (v4i16 DPR:$Vm))))),
4224 (VQDMLSLv4i32 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
// Select the fused VQDMLSLv2i64 when a v2i64 vqsubs subtracts, as its second
// operand, the vqdmull of two v2i32 D registers from $src1.
4225 def : Pat<(v2i64 (int_arm_neon_vqsubs (v2i64 QPR:$src1),
4226 (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
4227 (v2i32 DPR:$Vm))))),
4228 (VQDMLSLv2i64 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
// Lane form of the vqsubs + vqdmull fusion for v4i16 inputs: the second
// vqdmull operand is a NEONvduplane of a DPR_8 register at $lane, so the
// pair is selected as VQDMLSLslv4i16 with that register and lane index.
4229 def : Pat<(v4i32 (int_arm_neon_vqsubs (v4i32 QPR:$src1),
4230 (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
4231 (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm),
4232 imm:$lane)))))),
4233 (VQDMLSLslv4i16 QPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane)>;
// Lane form of the vqsubs + vqdmull fusion for v2i32 inputs: the second
// vqdmull operand is a NEONvduplane of a DPR_VFP2 register at $lane, so the
// pair is selected as VQDMLSLslv2i32 with that register and lane index.
4234 def : Pat<(v2i64 (int_arm_neon_vqsubs (v2i64 QPR:$src1),
4235 (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
4236 (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm),
4237 imm:$lane)))))),
4238 (VQDMLSLslv2i32 QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane)>;
42014239
42024240 // Fused Vector Multiply-Accumulate and Fused Multiply-Subtract Operations.
42034241 def VFMAfd : N3VDMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACD, "vfma", "f32",
237237 declare <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
238238 declare <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
239239
; Computes vqadds(load %A, vqdmull(load %B, load %C)) with two separate
; intrinsic calls on v4i16 inputs; the FileCheck line inside the body expects
; a vqdmlal.s16 in the generated code.
240 define <4 x i32> @vqdmlals16_natural(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
241 ;CHECK-LABEL: vqdmlals16_natural:
242 ;CHECK: vqdmlal.s16
243 %tmp1 = load <4 x i32>* %A
244 %tmp2 = load <4 x i16>* %B
245 %tmp3 = load <4 x i16>* %C
246 %tmp4 = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %tmp2, <4 x i16> %tmp3)
247 %tmp5 = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp4)
248 ret <4 x i32> %tmp5
249 }
250
; Computes vqadds(load %A, vqdmull(load %B, load %C)) with two separate
; intrinsic calls on v2i32 inputs; the FileCheck line inside the body expects
; a vqdmlal.s32 in the generated code.
251 define <2 x i64> @vqdmlals32_natural(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
252 ;CHECK-LABEL: vqdmlals32_natural:
253 ;CHECK: vqdmlal.s32
254 %tmp1 = load <2 x i64>* %A
255 %tmp2 = load <2 x i32>* %B
256 %tmp3 = load <2 x i32>* %C
257 %tmp4 = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %tmp2, <2 x i32> %tmp3)
258 %tmp5 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp4)
259 ret <2 x i64> %tmp5
260 }
261
; Lane variant of the vqadds + vqdmull fusion for v4i16 inputs.
; %arg2_int16x4_t is splatted from lane 1 by the shufflevector (every mask
; element is 1, matching the d3[1] operand the FileCheck line expects), then
; multiplied with vqdmull and accumulated with vqadds. The expected output is
; a single by-lane vqdmlal.s16.
262 define arm_aapcs_vfpcc <4 x i32> @test_vqdmlal_lanes16_natural(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
263 entry:
264 ; CHECK-LABEL: test_vqdmlal_lanes16_natural:
265 ; CHECK: vqdmlal.s16 q0, d2, d3[1]
266 %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
267 %1 = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg1_int16x4_t, <4 x i16> %0)
268 %2 = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %1)
269 ret <4 x i32> %2
270 }
271
; Lane variant of the vqadds + vqdmull fusion for v2i32 inputs.
; %arg2_int32x2_t is splatted from lane 1 by the shufflevector (both mask
; elements are 1, matching the d3[1] operand the FileCheck line expects), then
; multiplied with vqdmull and accumulated with vqadds. The expected output is
; a single by-lane vqdmlal.s32.
272 define arm_aapcs_vfpcc <2 x i64> @test_vqdmlal_lanes32_natural(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
273 entry:
274 ; CHECK-LABEL: test_vqdmlal_lanes32_natural:
275 ; CHECK: vqdmlal.s32 q0, d2, d3[1]
276 %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
277 %1 = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg1_int32x2_t, <2 x i32> %0)
278 %2 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i64> %1)
279 ret <2 x i64> %2
280 }
281
282 declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
283 declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
284
240285 define <4 x i32> @vqdmlsls16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
241286 ;CHECK-LABEL: vqdmlsls16:
242287 ;CHECK: vqdmlsl.s16
277322
278323 declare <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
279324 declare <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
325
; Computes vqsubs(load %A, vqdmull(load %B, load %C)) with two separate
; intrinsic calls on v4i16 inputs; the FileCheck line inside the body expects
; a vqdmlsl.s16 in the generated code.
326 define <4 x i32> @vqdmlsls16_natural(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
327 ;CHECK-LABEL: vqdmlsls16_natural:
328 ;CHECK: vqdmlsl.s16
329 %tmp1 = load <4 x i32>* %A
330 %tmp2 = load <4 x i16>* %B
331 %tmp3 = load <4 x i16>* %C
332 %tmp4 = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %tmp2, <4 x i16> %tmp3)
333 %tmp5 = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp4)
334 ret <4 x i32> %tmp5
335 }
336
; Computes vqsubs(load %A, vqdmull(load %B, load %C)) with two separate
; intrinsic calls on v2i32 inputs; the FileCheck line inside the body expects
; a vqdmlsl.s32 in the generated code.
337 define <2 x i64> @vqdmlsls32_natural(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
338 ;CHECK-LABEL: vqdmlsls32_natural:
339 ;CHECK: vqdmlsl.s32
340 %tmp1 = load <2 x i64>* %A
341 %tmp2 = load <2 x i32>* %B
342 %tmp3 = load <2 x i32>* %C
343 %tmp4 = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %tmp2, <2 x i32> %tmp3)
344 %tmp5 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp4)
345 ret <2 x i64> %tmp5
346 }
347
; Lane variant of the vqsubs + vqdmull fusion for v4i16 inputs.
; %arg2_int16x4_t is splatted from lane 1 by the shufflevector (every mask
; element is 1, matching the d3[1] operand the FileCheck line expects), then
; multiplied with vqdmull and subtracted from %arg0 with vqsubs. The expected
; output is a single by-lane vqdmlsl.s16.
348 define arm_aapcs_vfpcc <4 x i32> @test_vqdmlsl_lanes16_natural(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
349 entry:
350 ; CHECK-LABEL: test_vqdmlsl_lanes16_natural:
351 ; CHECK: vqdmlsl.s16 q0, d2, d3[1]
352 %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
353 %1 = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg1_int16x4_t, <4 x i16> %0)
354 %2 = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %1)
355 ret <4 x i32> %2
356 }
357
; Lane variant of the vqsubs + vqdmull fusion for v2i32 inputs.
; %arg2_int32x2_t is splatted from lane 1 by the shufflevector (both mask
; elements are 1, matching the d3[1] operand the FileCheck line expects), then
; multiplied with vqdmull and subtracted from %arg0 with vqsubs. The expected
; output is a single by-lane vqdmlsl.s32.
358 define arm_aapcs_vfpcc <2 x i64> @test_vqdmlsl_lanes32_natural(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
359 entry:
360 ; CHECK-LABEL: test_vqdmlsl_lanes32_natural:
361 ; CHECK: vqdmlsl.s32 q0, d2, d3[1]
362 %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
363 %1 = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg1_int32x2_t, <2 x i32> %0)
364 %2 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i64> %1)
365 ret <2 x i64> %2
366 }
367
368 declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
369 declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>) nounwind readnone