[ARM] More aggressive matching for vpadd and vpaddl.

The new matchers work after legalization to make them simpler, and to
avoid blocking other optimizations.

Differential Revision: https://reviews.llvm.org/D27779

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@291693 91177308-0d34-0410-b5e6-96231b3b80d8

Author: Eli Friedman
3 changed files with 343 additions and 27 deletions.
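To illustrate the pattern the new matchers recognize (distilled from the new tests below; the value names are illustrative), a 64-bit add whose operands are the even and odd lanes of the same vector now selects to a single vpadd:

  %even = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %odd  = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %sum  = add <4 x i16> %odd, %even   ; previously vuzp.16 + vadd.i16, now vpadd.i16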
File: lib/Target/ARM/ARMISelLowering.cpp

   return SDValue();
 }
 
-// AddCombineToVPADDL- For pair-wise add on neon, use the vpaddl instruction
-// (only after legalization).
-static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1,
+static bool IsVUZPShuffleNode(SDNode *N) {
+  // VUZP shuffle node.
+  if (N->getOpcode() == ARMISD::VUZP)
+    return true;
+
+  // "VUZP" on i32 is an alias for VTRN.
+  if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32)
+    return true;
+
+  return false;
+}
+
+static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *Subtarget) {
-
+  // Look for ADD(VUZP.0, VUZP.1).
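+  // The two ADD operands must be the two distinct results (VUZP.0 and
+  // VUZP.1) of one and the same shuffle node.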
+  if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() ||
+      N0 == N1)
+    return SDValue();
+
+  // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD.
+  if (!N->getValueType(0).is64BitVector())
+    return SDValue();
+
+  // Generate vpadd.
+  SelectionDAG &DAG = DCI.DAG;
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  SDLoc dl(N);
+  SDNode *Unzip = N0.getNode();
+  EVT VT = N->getValueType(0);
+
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl,
+                                TLI.getPointerTy(DAG.getDataLayout())));
+  Ops.push_back(Unzip->getOperand(0));
+  Ops.push_back(Unzip->getOperand(1));
+
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
+}
+
+static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1,
+                                      TargetLowering::DAGCombinerInfo &DCI,
+                                      const ARMSubtarget *Subtarget) {
+  // Check for two extended operands.
+  if (!(N0.getOpcode() == ISD::SIGN_EXTEND &&
+        N1.getOpcode() == ISD::SIGN_EXTEND) &&
+      !(N0.getOpcode() == ISD::ZERO_EXTEND &&
+        N1.getOpcode() == ISD::ZERO_EXTEND))
+    return SDValue();
+
+  SDValue N00 = N0.getOperand(0);
+  SDValue N10 = N1.getOperand(0);
+
+  // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1))
+  if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() ||
+      N00 == N10)
+    return SDValue();
+
+  // We only recognize Q register paddl here; this can't be reached until
+  // after type legalization.
+  if (!N00.getValueType().is64BitVector() ||
+      !N0.getValueType().is128BitVector())
+    return SDValue();
+
+  // Generate vpaddl.
+  SelectionDAG &DAG = DCI.DAG;
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+
+  SmallVector<SDValue, 8> Ops;
+  // Form vpaddl.sN or vpaddl.uN depending on the kind of extension.
+  unsigned Opcode;
+  if (N0.getOpcode() == ISD::SIGN_EXTEND)
+    Opcode = Intrinsic::arm_neon_vpaddls;
+  else
+    Opcode = Intrinsic::arm_neon_vpaddlu;
+  Ops.push_back(DAG.getConstant(Opcode, dl,
+                                TLI.getPointerTy(DAG.getDataLayout())));
+  EVT ElemTy = N00.getValueType().getVectorElementType();
+  unsigned NumElts = VT.getVectorNumElements();
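+  // vpaddl sums adjacent element pairs within a single input vector, so
+  // rebuilding the original (pre-VUZP) vector with CONCAT_VECTORS yields
+  // exactly the pairs whose sums we want.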
+  EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2);
+  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT,
+                               N00.getOperand(0), N00.getOperand(1));
+  Ops.push_back(Concat);
+
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
+}
+
+// FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
+// an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
+// much easier to match.
+static SDValue
+AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1,
+                               TargetLowering::DAGCombinerInfo &DCI,
+                               const ARMSubtarget *Subtarget) {
   // Only perform optimization if after legalize, and if NEON is available. We
   // also expect both operands to be BUILD_VECTORs.
   if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
...

     } else
       return SDValue();
   }
+
+  // Don't generate vpaddl+vmovn; we'll match it to vpadd later.
+  if (Vec.getValueType().getVectorElementType() == VT.getVectorElementType())
+    return SDValue();
 
   // Create VPADDL node.
   SelectionDAG &DAG = DCI.DAG;
...

 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
                                              TargetLowering::DAGCombinerInfo &DCI,
                                              const ARMSubtarget *Subtarget) {
+  // Attempt to create vpadd for this add.
+  if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget))
+    return Result;
 
   // Attempt to create vpaddl for this add.
-  if (SDValue Result = AddCombineToVPADDL(N, N0, N1, DCI, Subtarget))
+  if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget))
+    return Result;
+  if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI,
+                                                      Subtarget))
     return Result;
 
   // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
File: test/CodeGen/ARM/vpadd.ll

 }
 
 ; Combine vuzp+vadd->vpadd.
-; FIXME: Implement this optimization
-define void @addCombineToVPADD(<16 x i8> *%cbcr, <8 x i8> *%X) nounwind ssp {
-; CHECK-LABEL: addCombineToVPADD:
-; CHECK:       @ BB#0:
-; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
-; CHECK-NEXT:    vorr d18, d17, d17
-; CHECK-NEXT:    vuzp.8 d16, d18
-; CHECK-NEXT:    vadd.i8 d16, d18, d16
+define void @addCombineToVPADD_i8(<16 x i8> *%cbcr, <8 x i8> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADD_i8:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vpadd.i8 d16, d16, d17
 ; CHECK-NEXT:    vstr d16, [r1]
 ; CHECK-NEXT:    mov pc, lr
   %tmp = load <16 x i8>, <16 x i8>* %cbcr
...
   ret void
 }
 
+; Combine vuzp+vadd->vpadd.
+define void @addCombineToVPADD_i16(<8 x i16> *%cbcr, <4 x i16> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADD_i16:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vpadd.i16 d16, d16, d17
+; CHECK-NEXT:    vstr d16, [r1]
+; CHECK-NEXT:    mov pc, lr
+  %tmp = load <8 x i16>, <8 x i16>* %cbcr
+  %tmp1 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %tmp3 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %add = add <4 x i16> %tmp3, %tmp1
+  store <4 x i16> %add, <4 x i16>* %X, align 8
+  ret void
+}
+
+; Combine vtrn+vadd->vpadd.
+define void @addCombineToVPADD_i32(<4 x i32> *%cbcr, <2 x i32> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADD_i32:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vpadd.i32 d16, d16, d17
+; CHECK-NEXT:    vstr d16, [r1]
+; CHECK-NEXT:    mov pc, lr
+  %tmp = load <4 x i32>, <4 x i32>* %cbcr
+  %tmp1 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
+  %tmp3 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
+  %add = add <2 x i32> %tmp3, %tmp1
+  store <2 x i32> %add, <2 x i32>* %X, align 8
+  ret void
+}
+
 ; Combine vuzp+vaddl->vpaddl
-; FIXME: Implement this optimization.
-define void @addCombineToVPADDL_sext(<16 x i8> *%cbcr, <8 x i16> *%X) nounwind ssp {
-; CHECK-LABEL: addCombineToVPADDL_sext:
-; CHECK:       @ BB#0:
-; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
-; CHECK-NEXT:    vorr d18, d17, d17
-; CHECK-NEXT:    vuzp.8 d16, d18
-; CHECK-NEXT:    vaddl.s8 q8, d18, d16
+define void @addCombineToVPADDLq_s8(<16 x i8> *%cbcr, <8 x i16> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADDLq_s8:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vpaddl.s8 q8, q8
 ; CHECK-NEXT:    vst1.64 {d16, d17}, [r1]
 ; CHECK-NEXT:    mov pc, lr
   %tmp = load <16 x i8>, <16 x i8>* %cbcr
...
   ret void
 }
 
-; Legalization produces a EXTRACT_VECTOR_ELT DAG node which performs an extend from
-; i16 to i32. In this case the input for the formed VPADDL needs to be a vector of i16s.
-define <2 x i16> @fromExtendingExtractVectorElt(<4 x i16> %in) {
-; CHECK-LABEL: fromExtendingExtractVectorElt:
+; Combine vuzp+vaddl->vpaddl
+; FIXME: Legalization butchers the shuffles.
+define void @addCombineToVPADDL_s8(<16 x i8> *%cbcr, <4 x i16> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADDL_s8:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vmov.i16 d18, #0x8
+; CHECK-NEXT:    vneg.s16 d18, d18
+; CHECK-NEXT:    vext.8 d19, d16, d16, #1
+; CHECK-NEXT:    vshl.i16 d16, d16, #8
+; CHECK-NEXT:    vshl.i16 d17, d19, #8
+; CHECK-NEXT:    vshl.s16 d16, d16, d18
+; CHECK-NEXT:    vshl.s16 d17, d17, d18
+; CHECK-NEXT:    vadd.i16 d16, d17, d16
+; CHECK-NEXT:    vstr d16, [r1]
+; CHECK-NEXT:    mov pc, lr
+  %tmp = load <16 x i8>, <16 x i8>* %cbcr
+  %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %tmp4 = sext <4 x i8> %tmp3 to <4 x i16>
+  %tmp5 = sext <4 x i8> %tmp1 to <4 x i16>
+  %add = add <4 x i16> %tmp4, %tmp5
+  store <4 x i16> %add, <4 x i16>* %X, align 8
+  ret void
+}
+
+; Combine vuzp+vaddl->vpaddl
+define void @addCombineToVPADDLq_u8(<16 x i8> *%cbcr, <8 x i16> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADDLq_u8:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vpaddl.u8 q8, q8
+; CHECK-NEXT:    vst1.64 {d16, d17}, [r1]
+; CHECK-NEXT:    mov pc, lr
+  %tmp = load <16 x i8>, <16 x i8>* %cbcr
+  %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
+  %tmp5 = zext <8 x i8> %tmp1 to <8 x i16>
+  %add = add <8 x i16> %tmp4, %tmp5
+  store <8 x i16> %add, <8 x i16>* %X, align 8
+  ret void
+}
+
+; In theory, it's possible to match this to vpaddl, but rearranging the
+; shuffle is awkward, so this doesn't match at the moment.
+define void @addCombineToVPADDLq_u8_early_zext(<16 x i8> *%cbcr, <8 x i16> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADDLq_u8_early_zext:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vmovl.u8 q9, d17
+; CHECK-NEXT:    vmovl.u8 q8, d16
+; CHECK-NEXT:    vuzp.16 q8, q9
+; CHECK-NEXT:    vadd.i16 q8, q8, q9
+; CHECK-NEXT:    vst1.64 {d16, d17}, [r1]
+; CHECK-NEXT:    mov pc, lr
+  %tmp = load <16 x i8>, <16 x i8>* %cbcr
+  %tmp1 = zext <16 x i8> %tmp to <16 x i16>
+  %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %add = add <8 x i16> %tmp2, %tmp3
+  store <8 x i16> %add, <8 x i16>* %X, align 8
+  ret void
+}
+
+; Combine vuzp+vaddl->vpaddl
+; FIXME: Legalization butchers the shuffle.
+define void @addCombineToVPADDL_u8(<16 x i8> *%cbcr, <4 x i16> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADDL_u8:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vext.8 d18, d16, d16, #1
+; CHECK-NEXT:    vbic.i16 d16, #0xff00
+; CHECK-NEXT:    vbic.i16 d18, #0xff00
+; CHECK-NEXT:    vadd.i16 d16, d18, d16
+; CHECK-NEXT:    vstr d16, [r1]
+; CHECK-NEXT:    mov pc, lr
+  %tmp = load <16 x i8>, <16 x i8>* %cbcr
+  %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %tmp4 = zext <4 x i8> %tmp3 to <4 x i16>
+  %tmp5 = zext <4 x i8> %tmp1 to <4 x i16>
+  %add = add <4 x i16> %tmp4, %tmp5
+  store <4 x i16> %add, <4 x i16>* %X, align 8
+  ret void
+}
+
+; Matching to vpaddl.8 requires matching shuffle(zext()).
+define void @addCombineToVPADDL_u8_early_zext(<16 x i8> *%cbcr, <4 x i16> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADDL_u8_early_zext:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vmovl.u8 q8, d16
+; CHECK-NEXT:    vpadd.i16 d16, d16, d17
+; CHECK-NEXT:    vstr d16, [r1]
+; CHECK-NEXT:    mov pc, lr
+  %tmp = load <16 x i8>, <16 x i8>* %cbcr
+  %tmp1 = zext <16 x i8> %tmp to <16 x i16>
+  %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %add = add <4 x i16> %tmp2, %tmp3
+  store <4 x i16> %add, <4 x i16>* %X, align 8
+  ret void
+}
+
+; Combine vuzp+vaddl->vpaddl
+define void @addCombineToVPADDLq_s16(<8 x i16> *%cbcr, <4 x i32> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADDLq_s16:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vpaddl.s16 q8, q8
+; CHECK-NEXT:    vst1.64 {d16, d17}, [r1]
+; CHECK-NEXT:    mov pc, lr
+  %tmp = load <8 x i16>, <8 x i16>* %cbcr
+  %tmp1 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %tmp3 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %tmp4 = sext <4 x i16> %tmp3 to <4 x i32>
+  %tmp5 = sext <4 x i16> %tmp1 to <4 x i32>
+  %add = add <4 x i32> %tmp4, %tmp5
+  store <4 x i32> %add, <4 x i32>* %X, align 8
+  ret void
+}
+
+; Combine vuzp+vaddl->vpaddl
+define void @addCombineToVPADDLq_u16(<8 x i16> *%cbcr, <4 x i32> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADDLq_u16:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vpaddl.u16 q8, q8
+; CHECK-NEXT:    vst1.64 {d16, d17}, [r1]
+; CHECK-NEXT:    mov pc, lr
+  %tmp = load <8 x i16>, <8 x i16>* %cbcr
+  %tmp1 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %tmp3 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
+  %tmp5 = zext <4 x i16> %tmp1 to <4 x i32>
+  %add = add <4 x i32> %tmp4, %tmp5
+  store <4 x i32> %add, <4 x i32>* %X, align 8
+  ret void
+}
+
+; Combine vtrn+vaddl->vpaddl
+define void @addCombineToVPADDLq_s32(<4 x i32> *%cbcr, <2 x i64> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADDLq_s32:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vpaddl.s32 q8, q8
+; CHECK-NEXT:    vst1.64 {d16, d17}, [r1]
+; CHECK-NEXT:    mov pc, lr
+  %tmp = load <4 x i32>, <4 x i32>* %cbcr
+  %tmp1 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
+  %tmp3 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
+  %tmp4 = sext <2 x i32> %tmp3 to <2 x i64>
+  %tmp5 = sext <2 x i32> %tmp1 to <2 x i64>
+  %add = add <2 x i64> %tmp4, %tmp5
+  store <2 x i64> %add, <2 x i64>* %X, align 8
+  ret void
+}
+
+; Combine vtrn+vaddl->vpaddl
+define void @addCombineToVPADDLq_u32(<4 x i32> *%cbcr, <2 x i64> *%X) nounwind ssp {
+; CHECK-LABEL: addCombineToVPADDLq_u32:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vpaddl.u32 q8, q8
+; CHECK-NEXT:    vst1.64 {d16, d17}, [r1]
+; CHECK-NEXT:    mov pc, lr
+  %tmp = load <4 x i32>, <4 x i32>* %cbcr
+  %tmp1 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
+  %tmp3 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
+  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
+  %tmp5 = zext <2 x i32> %tmp1 to <2 x i64>
+  %add = add <2 x i64> %tmp4, %tmp5
+  store <2 x i64> %add, <2 x i64>* %X, align 8
+  ret void
+}
+
+; Legalization promotes the <4 x i8> to <4 x i16>.
+define <4 x i8> @fromExtendingExtractVectorElt_i8(<8 x i8> %in) {
+; CHECK-LABEL: fromExtendingExtractVectorElt_i8:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vpaddl.s8 d16, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
+  %tmp1 = shufflevector <8 x i8> %in, <8 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %tmp2 = shufflevector <8 x i8> %in, <8 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %x = add <4 x i8> %tmp2, %tmp1
+  ret <4 x i8> %x
+}
+
+; Legalization promotes the <2 x i16> to <2 x i32>.
+define <2 x i16> @fromExtendingExtractVectorElt_i16(<4 x i16> %in) {
+; CHECK-LABEL: fromExtendingExtractVectorElt_i16:
 ; CHECK:       @ BB#0:
 ; CHECK-NEXT:    vmov d16, r0, r1
 ; CHECK-NEXT:    vpaddl.s16 d16, d16
File: test/CodeGen/ARM/vtrn.ll

 ; CHECK-NEXT:    vldr d16, [r1]
 ; CHECK-NEXT:    vldr d17, [r0]
 ; CHECK-NEXT:    vtrn.32 d17, d16
-; CHECK-NEXT:    vadd.i32 d16, d17, d16
+; CHECK-NEXT:    vmul.i32 d16, d17, d16
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    mov pc, lr
   %tmp1 = load <2 x i32>, <2 x i32>* %A
   %tmp2 = load <2 x i32>, <2 x i32>* %B
   %tmp3 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> <i32 0, i32 2>
   %tmp4 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 3>
-  %tmp5 = add <2 x i32> %tmp3, %tmp4
+  %tmp5 = mul <2 x i32> %tmp3, %tmp4
   ret <2 x i32> %tmp5
 }
 
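Note: the RUN lines driving these FileCheck tests sit above the hunks shown here. A typical invocation for such NEON codegen tests (an assumption; the actual RUN line is outside this diff) is:

  ; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s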