llvm.org GIT mirror llvm / 57d6a5e
- Move all MOVSS and MOVSD patterns close to their definitions
- Duplicate some store patterns to their AVX forms!
- Caught a bug while restricting the patterns by subtarget; fixed it and updated a testcase to check it properly

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@138851 91177308-0d34-0410-b5e6-96231b3b80d8

Bruno Cardoso Lopes, 8 years ago
3 changed files with 248 additions and 152 deletions.
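Before the raw diff, a distilled view of the bug fix the commit message mentions. This is a sketch reassembled from the X86ISelLowering.cpp hunk below, not standalone code: HasSSE2, NumElems, SVOp, VT, V1, V2, dl, DAG and getTargetShuffleNode all come from the surrounding lowering function, and the reading of the old guard is mine.

```cpp
// Reassembled from the hunk below (not a standalone function). As I read
// the old guard,
//   if ((HasSSE2 && NumElems == 2) || !X86::isMOVLMask(SVOp))
// its second arm could produce an X86ISD::MOVSD node even on a target
// without SSE2 -- the bug the strengthened PR2484 testcase now catches.
// The rewrite keys the whole choice on SSE2 availability:
if (HasSSE2) {
  if (NumElems == 2)   // two-element shuffles map to movsd
    return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
  // remaining movl-style shuffles map to movss
  return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
}
```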
lib/Target/X86/X86ISelLowering.cpp:

     // this is horrible, but will stay like this until we move all shuffle
     // matching to x86 specific nodes. Note that for the 1st condition all
     // types are matched with movsd.
-    if ((HasSSE2 && NumElems == 2) || !X86::isMOVLMask(SVOp))
-      return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
-    else if (HasSSE2)
+    if (HasSSE2) {
+      if (NumElems == 2)
+        return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
       return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
-
+    }

     assert(VT != MVT::v4i32 && "unsupported shuffle type");

lib/Target/X86/X86InstrSSE.td:

                  (SUBREG_TO_REG (i64 0), (AVX_SET0PI), sub_xmm)>;

 //===----------------------------------------------------------------------===//
-// SSE 1 & 2 - Move Instructions
+// SSE 1 & 2 - Move FP Scalar Instructions
+//
+// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
+// register copies because it's a partial register update; FsMOVAPSrr/FsMOVAPDrr
+// is used instead. Register-to-register movss/movsd is not modeled as an
+// INSERT_SUBREG because INSERT_SUBREG requires that the insert be implementable
+// in terms of a copy, and just mentioned, we don't use movss/movsd for copies.
 //===----------------------------------------------------------------------===//

 class sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt, string asm> :
...
          !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
          [(set RC:$dst, (mem_pat addr:$src))]>;

-// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
-// register copies because it's a partial register update; FsMOVAPSrr/FsMOVAPDrr
-// is used instead. Register-to-register movss/movsd is not modeled as an
-// INSERT_SUBREG because INSERT_SUBREG requires that the insert be implementable
-// in terms of a copy, and just mentioned, we don't use movss/movsd for copies.
+// AVX
 def VMOVSSrr : sse12_move_rr<FR32, X86Movss, v4f32,
                 "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XS, VEX_4V;
 def VMOVSDrr : sse12_move_rr<FR64, X86Movsd, v2f64,
                 "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XD, VEX_4V;

 let canFoldAsLoad = 1, isReMaterializable = 1 in {
   def VMOVSSrm : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS, VEX;
-
   let AddedComplexity = 20 in
     def VMOVSDrm : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD, VEX;
 }

+def VMOVSSmr : SI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src),
+                  "movss\t{$src, $dst|$dst, $src}",
+                  [(store FR32:$src, addr:$dst)]>, XS, VEX;
+def VMOVSDmr : SI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src),
+                  "movsd\t{$src, $dst|$dst, $src}",
+                  [(store FR64:$src, addr:$dst)]>, XD, VEX;
+
+// SSE1 & 2
 let Constraints = "$src1 = $dst" in {
   def MOVSSrr : sse12_move_rr<FR32, X86Movss, v4f32,
                 "movss\t{$src2, $dst|$dst, $src2}">, XS;
...
   def MOVSDrm : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD;
 }

-let AddedComplexity = 15 in {
-// Extract the low 32-bit value from one vector and insert it into another.
-def : Pat<(v4f32 (movl VR128:$src1, VR128:$src2)),
-          (MOVSSrr (v4f32 VR128:$src1),
-                   (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>;
-// Extract the low 64-bit value from one vector and insert it into another.
-def : Pat<(v2f64 (movl VR128:$src1, VR128:$src2)),
-          (MOVSDrr (v2f64 VR128:$src1),
-                   (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>;
-}
-
-let AddedComplexity = 20 in {
+def MOVSSmr : SSI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src),
+                  "movss\t{$src, $dst|$dst, $src}",
+                  [(store FR32:$src, addr:$dst)]>;
+def MOVSDmr : SDI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src),
+                  "movsd\t{$src, $dst|$dst, $src}",
+                  [(store FR64:$src, addr:$dst)]>;
+
+// Patterns
 let Predicates = [HasSSE1] in {
+  let AddedComplexity = 15 in {
+  // Extract the low 32-bit value from one vector and insert it into another.
+  def : Pat<(v4f32 (movl VR128:$src1, VR128:$src2)),
+            (MOVSSrr (v4f32 VR128:$src1),
+                     (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>;
+  def : Pat<(v4i32 (movl VR128:$src1, VR128:$src2)),
+            (MOVSSrr (v4i32 VR128:$src1),
+                     (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>;
+
+  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
+  // MOVSS to the lower bits.
+  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
+            (MOVSSrr (v4f32 (V_SET0PS)), FR32:$src)>;
+  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+            (MOVSSrr (v4f32 (V_SET0PS)),
+                     (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)))>;
+  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+            (MOVSSrr (v4i32 (V_SET0PI)),
+                     (EXTRACT_SUBREG (v4i32 VR128:$src), sub_ss))>;
+  }
+
+  let AddedComplexity = 20 in {
   // MOVSSrm zeros the high parts of the register; represent this
   // with SUBREG_TO_REG.
   def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
...
             (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>;
   def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
             (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>;
-}
+  }
+
+  // Extract and store.
+  def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
+                   addr:$dst),
+            (MOVSSmr addr:$dst,
+                     (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
+
+  // Shuffle with MOVSS
+  def : Pat<(v4f32 (X86Movss VR128:$src1, (scalar_to_vector FR32:$src2))),
+            (MOVSSrr VR128:$src1, FR32:$src2)>;
+  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
+            (MOVSSrr (v4i32 VR128:$src1),
+                     (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>;
+  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
+            (MOVSSrr (v4f32 VR128:$src1),
+                     (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>;
+}
+
 let Predicates = [HasSSE2] in {
+  let AddedComplexity = 15 in {
+  // Extract the low 64-bit value from one vector and insert it into another.
+  def : Pat<(v2f64 (movl VR128:$src1, VR128:$src2)),
+            (MOVSDrr (v2f64 VR128:$src1),
+                     (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>;
+  def : Pat<(v2i64 (movl VR128:$src1, VR128:$src2)),
+            (MOVSDrr (v2i64 VR128:$src1),
+                     (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>;
+
+  // vector_shuffle v1, v2 <4, 5, 2, 3> using movsd
+  def : Pat<(v4f32 (movlp VR128:$src1, VR128:$src2)),
+            (MOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>;
+  def : Pat<(v4i32 (movlp VR128:$src1, VR128:$src2)),
+            (MOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>;
+
+  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
+  // MOVSD to the lower bits.
+  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
+            (MOVSDrr (v2f64 (V_SET0PS)), FR64:$src)>;
+  }
+
+  let AddedComplexity = 20 in {
   // MOVSDrm zeros the high parts of the register; represent this
   // with SUBREG_TO_REG.
   def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
...
             (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
   def : Pat<(v2f64 (X86vzload addr:$src)),
             (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
-}
-}
-
-let AddedComplexity = 20, Predicates = [HasAVX] in {
-  // MOVSSrm zeros the high parts of the register; represent this
-  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
-  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
-            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>;
-  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
-            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>;
-  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
-            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>;
-  // MOVSDrm zeros the high parts of the register; represent this
-  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
-  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
-            (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>;
-  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
-            (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>;
-  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
-            (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>;
-  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
-            (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>;
-  def : Pat<(v2f64 (X86vzload addr:$src)),
-            (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>;
-  // Represent the same patterns above but in the form they appear for
-  // 256-bit types
-  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
-                   (v4f32 (scalar_to_vector (loadf32 addr:$src))), (i32 0)))),
-            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>;
-  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
-                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (i32 0)))),
-            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_sd)>;
-}
-
-// Store scalar value to memory.
-def MOVSSmr : SSI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src),
-                  "movss\t{$src, $dst|$dst, $src}",
-                  [(store FR32:$src, addr:$dst)]>;
-def MOVSDmr : SDI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src),
-                  "movsd\t{$src, $dst|$dst, $src}",
-                  [(store FR64:$src, addr:$dst)]>;
-
-def VMOVSSmr : SI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src),
-                  "movss\t{$src, $dst|$dst, $src}",
-                  [(store FR32:$src, addr:$dst)]>, XS, VEX;
-def VMOVSDmr : SI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src),
-                  "movsd\t{$src, $dst|$dst, $src}",
-                  [(store FR64:$src, addr:$dst)]>, XD, VEX;
-
-// Extract and store.
-def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
-                 addr:$dst),
-          (MOVSSmr addr:$dst,
-                   (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
-def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
-                 addr:$dst),
-          (MOVSDmr addr:$dst,
-                   (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>;
-
-// Move Aligned/Unaligned floating point values
+  }
+
+  // Extract and store.
+  def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
+                   addr:$dst),
+            (MOVSDmr addr:$dst,
+                     (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>;
+
+  // Shuffle with MOVSD
+  def : Pat<(v2f64 (X86Movsd VR128:$src1, (scalar_to_vector FR64:$src2))),
+            (MOVSDrr VR128:$src1, FR64:$src2)>;
+  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
+            (MOVSDrr (v2i64 VR128:$src1),
+                     (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>;
+  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
+            (MOVSDrr (v2f64 VR128:$src1),
+                     (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>;
+  def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
+            (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),sub_sd))>;
+  def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
+            (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2),sub_sd))>;
+
+  // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
+  // is during lowering, where it's not possible to recognize the fold cause
+  // it has two uses through a bitcast. One use disappears at isel time and the
+  // fold opportunity reappears.
+  def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
+            (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),sub_sd))>;
+  def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
+            (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2),sub_sd))>;
+}
+
+let Predicates = [HasAVX] in {
+  let AddedComplexity = 15 in {
+  // Extract the low 32-bit value from one vector and insert it into another.
+  def : Pat<(v4f32 (movl VR128:$src1, VR128:$src2)),
+            (VMOVSSrr (v4f32 VR128:$src1),
+                      (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>;
+  def : Pat<(v4i32 (movl VR128:$src1, VR128:$src2)),
+            (VMOVSSrr (v4i32 VR128:$src1),
+                      (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>;
+
+  // Extract the low 64-bit value from one vector and insert it into another.
+  def : Pat<(v2f64 (movl VR128:$src1, VR128:$src2)),
+            (VMOVSDrr (v2f64 VR128:$src1),
+                      (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>;
+  def : Pat<(v2i64 (movl VR128:$src1, VR128:$src2)),
+            (VMOVSDrr (v2i64 VR128:$src1),
+                      (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>;
+
+  // vector_shuffle v1, v2 <4, 5, 2, 3> using movsd
+  def : Pat<(v4f32 (movlp VR128:$src1, VR128:$src2)),
+            (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>;
+  def : Pat<(v4i32 (movlp VR128:$src1, VR128:$src2)),
+            (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>;
+
+  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
+  // MOVS{S,D} to the lower bits.
+  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
+            (VMOVSSrr (v4f32 (V_SET0PS)), FR32:$src)>;
+  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+            (VMOVSSrr (v4f32 (V_SET0PS)),
+                      (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)))>;
+  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+            (VMOVSSrr (v4i32 (V_SET0PI)),
+                      (EXTRACT_SUBREG (v4i32 VR128:$src), sub_ss))>;
+  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
+            (VMOVSDrr (v2f64 (V_SET0PS)), FR64:$src)>;
+  }
+
+  let AddedComplexity = 20 in {
+  // MOVSSrm zeros the high parts of the register; represent this
+  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
+  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
+            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>;
+  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>;
+  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
+            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>;
+
+  // MOVSDrm zeros the high parts of the register; represent this
+  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
+  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
+            (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>;
+  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
+            (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>;
+  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
+            (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>;
+  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
+            (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>;
+  def : Pat<(v2f64 (X86vzload addr:$src)),
+            (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>;
+
+  // Represent the same patterns above but in the form they appear for
+  // 256-bit types
+  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
+                   (v4f32 (scalar_to_vector (loadf32 addr:$src))), (i32 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>;
+  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
+                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (i32 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_sd)>;
+  }
+
+  // Extract and store.
+  def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
+                   addr:$dst),
+            (VMOVSSmr addr:$dst,
+                      (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
+  def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
+                   addr:$dst),
+            (VMOVSDmr addr:$dst,
+                      (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>;
+
+  // Shuffle with VMOVSS
+  def : Pat<(v4f32 (X86Movss VR128:$src1, (scalar_to_vector FR32:$src2))),
+            (VMOVSSrr VR128:$src1, FR32:$src2)>;
+  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
+            (VMOVSSrr (v4i32 VR128:$src1),
+                      (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>;
+  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
+            (VMOVSSrr (v4f32 VR128:$src1),
+                      (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>;
+
+  // Shuffle with VMOVSD
+  def : Pat<(v2f64 (X86Movsd VR128:$src1, (scalar_to_vector FR64:$src2))),
+            (VMOVSDrr VR128:$src1, FR64:$src2)>;
+  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
+            (VMOVSDrr (v2i64 VR128:$src1),
+                      (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>;
+  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
+            (VMOVSDrr (v2f64 VR128:$src1),
+                      (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>;
+  def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
+            (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),
+                                    sub_sd))>;
+  def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
+            (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2),
+                                    sub_sd))>;
+
+  // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
+  // is during lowering, where it's not possible to recognize the fold cause
+  // it has two uses through a bitcast. One use disappears at isel time and the
+  // fold opportunity reappears.
+  def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
+            (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),
+                                    sub_sd))>;
+  def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
+            (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2),
+                                    sub_sd))>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
+//===----------------------------------------------------------------------===//
+
 multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
                             X86MemOperand x86memop, PatFrag ld_frag,
                             string asm, Domain d,
...
 def : Pat<(fextend (loadf32 addr:$src)),
           (CVTSS2SDrm addr:$src)>;

-// Move scalar to XMM zero-extended
-// movd to XMM register zero-extends
-let AddedComplexity = 15 in {
-// Zeroing a VR128 then do a MOVS{S|D} to the lower bits.
-def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
-          (MOVSDrr (v2f64 (V_SET0PS)), FR64:$src)>;
-def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
-          (MOVSSrr (v4f32 (V_SET0PS)), FR32:$src)>;
-def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
-          (MOVSSrr (v4f32 (V_SET0PS)),
-                   (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)))>;
-def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
-          (MOVSSrr (v4i32 (V_SET0PI)),
-                   (EXTRACT_SUBREG (v4i32 VR128:$src), sub_ss))>;
-}
-
 // Splat v2f64 / v2i64
 let AddedComplexity = 10 in {
 def : Pat<(splat_lo (v2i64 VR128:$src), (undef)),
...
           (MOVLPSmr addr:$src1, VR128:$src2)>;
 def : Pat<(store (v2i64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
           (MOVLPDmr addr:$src1, VR128:$src2)>;
-
-let AddedComplexity = 15 in {
-// Setting the lowest element in the vector.
-def : Pat<(v4i32 (movl VR128:$src1, VR128:$src2)),
-          (MOVSSrr (v4i32 VR128:$src1),
-                   (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>;
-def : Pat<(v2i64 (movl VR128:$src1, VR128:$src2)),
-          (MOVSDrr (v2i64 VR128:$src1),
-                   (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>;
-
-// vector_shuffle v1, v2 <4, 5, 2, 3> using movsd
-def : Pat<(v4f32 (movlp VR128:$src1, VR128:$src2)),
-          (MOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>,
-          Requires<[HasSSE2]>;
-def : Pat<(v4i32 (movlp VR128:$src1, VR128:$src2)),
-          (MOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>,
-          Requires<[HasSSE2]>;
-}

 // Set lowest element and zero upper elements.
 def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
...
                        (scalar_to_vector (loadf64 addr:$src2)))),
           (MOVHPDrm VR128:$src1, addr:$src2)>;

-// Shuffle with MOVSS
-def : Pat<(v4f32 (X86Movss VR128:$src1, (scalar_to_vector FR32:$src2))),
-          (MOVSSrr VR128:$src1, FR32:$src2)>;
-def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
-          (MOVSSrr (v4i32 VR128:$src1),
-                   (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>;
-def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
-          (MOVSSrr (v4f32 VR128:$src1),
-                   (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>;
-
-// Shuffle with MOVSD
-def : Pat<(v2f64 (X86Movsd VR128:$src1, (scalar_to_vector FR64:$src2))),
-          (MOVSDrr VR128:$src1, FR64:$src2)>;
-def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
-          (MOVSDrr (v2i64 VR128:$src1),
-                   (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>;
-def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
-          (MOVSDrr (v2f64 VR128:$src1),
-                   (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>;
-def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
-          (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_sd))>;
-def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
-          (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_sd))>;
-
 // Shuffle with MOVLPS
 def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
           (MOVLPSrm VR128:$src1, addr:$src2)>;
...
 def : Pat<(X86Movlps VR128:$src1,
           (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
           (MOVLPSrm VR128:$src1, addr:$src2)>;
-// FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
-// is during lowering, where it's not possible to recognize the load fold cause
-// it has two uses through a bitcast. One use disappears at isel time and the
-// fold opportunity reappears.
-def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
-          (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_sd))>;
-
-def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
-          (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_sd))>;

 // Shuffle with MOVLPD
 def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
test/CodeGen/X86 testcase for PR2484:

-; RUN: llc < %s -march=x86 -mattr=+sse,-sse2
+; RUN: llc < %s -march=x86 -mattr=+sse,-sse2 | FileCheck %s
 ; PR2484

 define <4 x float> @f4523(<4 x float> %a,<4 x float> %b) nounwind {
 entry:
+; CHECK: shufps $-28, %xmm
 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4,i32
 5,i32 2,i32 3>
 ret <4 x float> %shuffle
 }
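A side note on the strengthened CHECK line: `$-28` is the shufps immediate printed as a signed byte, i.e. 0xE4. The snippet below is a reference model of SHUFPS semantics as documented for SSE (the function name, the sample values, and the choice of which operand plays the destination register are all illustrative, not from the commit); it shows why that immediate realizes the `<4, 5, 2, 3>` shuffle once movsd is off the table on an SSE1-only target.

```cpp
#include <array>
#include <cstdio>

// Reference model of SHUFPS dst, src, imm8: each 2-bit immediate field
// selects an element; the low two results come from dst, the high two
// from src.
std::array<float, 4> shufps(std::array<float, 4> dst,
                            std::array<float, 4> src, unsigned imm8) {
  return {dst[imm8 & 3], dst[(imm8 >> 2) & 3],
          src[(imm8 >> 4) & 3], src[(imm8 >> 6) & 3]};
}

int main() {
  std::array<float, 4> a{0, 1, 2, 3}, b{4, 5, 6, 7};
  // $-28 as an unsigned byte is 0xE4 = 0b11'10'01'00: selectors 0,1,2,3.
  // With b in the destination register and a as the source this yields
  // {b[0], b[1], a[2], a[3]} -- the <4,5,2,3> shuffle of concat(a, b)
  // that @f4523 asks for.
  std::array<float, 4> r = shufps(b, a, 0xE4);
  std::printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]); // prints: 4 5 2 3
}
```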