llvm.org GIT mirror llvm / b872fbb
[X86][DAG] Switch X86 Target to post-legalized store merge Move store merge to happen after intrinsic lowering to allow lowered stores to be merged. Some regressions occur in MergeConsecutiveStores due to a missing insert_subvector combine; these are addressed in a follow-up patch. Reviewers: craig.topper, efriedma, RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D34559 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@310710 91177308-0d34-0410-b5e6-96231b3b80d8 Nirav Dave 2 years ago
16 changed file(s) with 183 addition(s) and 243 deletion(s). Raw diff Collapse all Expand all
27222722 bool foldBooleans, DAGCombinerInfo &DCI,
27232723 const SDLoc &dl) const;
27242724
2725 // For targets that wrap addresses in a target-specific node, return the
// unwrapped (underlying) address so generic DAG analysis — e.g.
// BaseIndexOffset::match — can inspect it. The default assumes no
// wrapping and returns N unchanged; targets override as needed.
2726 virtual SDValue unwrapAddress(SDValue N) const { return N; }
2727
27252728 /// Returns true (and the GlobalValue and the offset) if the node is a
27262729 /// GlobalAddress + offset.
27272730 virtual bool
1313 #include "llvm/CodeGen/MachineFrameInfo.h"
1414 #include "llvm/CodeGen/SelectionDAG.h"
1515 #include "llvm/CodeGen/SelectionDAGNodes.h"
16 #include "llvm/Target/TargetLowering.h"
1617
1718 namespace llvm {
1819
5455 /// Parses tree in Ptr for base, index, offset addresses.
5556 BaseIndexOffset BaseIndexOffset::match(SDValue Ptr, const SelectionDAG &DAG) {
5657 // (((B + I*M) + c)) + c ...
57 SDValue Base = Ptr;
58 SDValue Base = DAG.getTargetLoweringInfo().unwrapAddress(Ptr);
5859 SDValue Index = SDValue();
5960 int64_t Offset = 0;
6061 bool IsIndexSignExt = false;
2703327033 return 1;
2703427034 }
2703527035
// Strip an X86 address-wrapper node (X86ISD::Wrapper or
// X86ISD::WrapperRIP) so callers such as BaseIndexOffset::match see the
// wrapped address operand directly; any other node is returned as-is.
27036 SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
27037 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
27038 return N->getOperand(0);
27039 return N;
27040 }
27041
2703627042 /// Returns true (and the GlobalValue and the offset) if the node is a
2703727043 /// GlobalAddress + offset.
2703827044 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
811811 /// This method returns the name of a target specific DAG node.
812812 const char *getTargetNodeName(unsigned Opcode) const override;
813813
814 bool mergeStoresAfterLegalization() const override { return true; }
815
814816 bool isCheapToSpeculateCttz() const override;
815817
816818 bool isCheapToSpeculateCtlz() const override;
865867 const APInt &DemandedElts,
866868 const SelectionDAG &DAG,
867869 unsigned Depth) const override;
870
871 SDValue unwrapAddress(SDValue N) const override;
868872
869873 bool isGAPlusOffset(SDNode *N, const GlobalValue* &GA,
870874 int64_t &Offset) const override;
491491 store float %vecext7, float* %arrayidx7, align 4
492492 ret void
493493
494 ; CHECK-LABEL: merge_vec_element_store
495 ; CHECK: vmovups
496 ; CHECK-NEXT: vzeroupper
497 ; CHECK-NEXT: retq
494 ; CHECK: vextractf128 $1, %ymm0, %xmm1
495 ; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
496 ; CHECK: retq
497
498 ; This is what should be generated:
499 ; FIXME-LABEL: merge_vec_element_store
500 ; FIXME: vmovups
501 ; FIXME-NEXT: vzeroupper
502 ; FIXME-NEXT: retq
498503 }
499504
500505 ; PR21711 - Merge vector stores into wider vector stores.
514519 store <4 x float> %shuffle3, <4 x float>* %idx3, align 16
515520 ret void
516521
517 ; CHECK-LABEL: merge_vec_extract_stores
518 ; CHECK: vmovups %ymm0, 48(%rdi)
519 ; CHECK-NEXT: vmovups %ymm1, 80(%rdi)
520 ; CHECK-NEXT: vzeroupper
521 ; CHECK-NEXT: retq
522 ; These vblendpd are obviously redundant.
523 ; CHECK: vblendpd $12, %ymm0, %ymm0, %ymm0 # ymm0 = ymm0[0,1,2,3]
524 ; CHECK: vmovupd %ymm0, 48(%rdi)
525 ; CHECK: vblendpd $12, %ymm1, %ymm1, %ymm0 # ymm0 = ymm1[0,1,2,3]
526 ; CHECK: vmovupd %ymm0, 80(%rdi)
527
528 ; This is what should be generated:
529 ; FIXME-LABEL: merge_vec_extract_stores
530 ; FIXME: vmovups %ymm0, 48(%rdi)
531 ; FIXME-NEXT: vmovups %ymm1, 80(%rdi)
532 ; FIXME-NEXT: vzeroupper
533 ; FIXME-NEXT: retq
522534 }
523535
524536 ; Merging vector stores when sourced from vector loads.
556568 }
557569
558570 ; This is a minimized test based on real code that was failing.
559 ; We could merge stores (and loads) like this...
560
571 ; This should now be merged.
561572 define void @merge_vec_element_and_scalar_load([6 x i64]* %array) {
562573 %idx0 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 0
563574 %idx1 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 1
574585 ret void
575586
576587 ; CHECK-LABEL: merge_vec_element_and_scalar_load
577 ; CHECK: movq (%rdi), %rax
578 ; CHECK-NEXT: movq 8(%rdi), %rcx
579 ; CHECK-NEXT: movq %rax, 32(%rdi)
580 ; CHECK-NEXT: movq %rcx, 40(%rdi)
588 ; CHECK: vmovups (%rdi), %xmm0
589 ; CHECK-NEXT: vmovups %xmm0, 32(%rdi)
581590 ; CHECK-NEXT: retq
582591 }
583592
3030 ret %0 %3
3131 }
3232
33
3334 define fastcc %1 @ReturnBigStruct2() nounwind readnone {
3435 ; X86-LABEL: ReturnBigStruct2:
3536 ; X86: # BB#0: # %entry
3637 ; X86-NEXT: movl $48, 4(%ecx)
3738 ; X86-NEXT: movb $1, 2(%ecx)
38 ; X86-NEXT: movb $1, 1(%ecx)
39 ; X86-NEXT: movb $0, (%ecx)
39 ; X86-NEXT: movw $256, (%ecx) # imm = 0x100
4040 ; X86-NEXT: movl %ecx, %eax
4141 ; X86-NEXT: retl
4242 ;
4444 ; X64: # BB#0: # %entry
4545 ; X64-NEXT: movl $48, 4(%rdi)
4646 ; X64-NEXT: movb $1, 2(%rdi)
47 ; X64-NEXT: movb $1, 1(%rdi)
48 ; X64-NEXT: movb $0, (%rdi)
47 ; X64-NEXT: movw $256, (%rdi) # imm = 0x100
4948 ; X64-NEXT: movq %rdi, %rax
5049 ; X64-NEXT: retq
5150 entry:
1111 ;
1212 ; SLOW-LABEL: foo:
1313 ; SLOW: # BB#0:
14 ; SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
15 ; SLOW-NEXT: vpextrq $1, %xmm1, 24(%rdi)
16 ; SLOW-NEXT: vmovq %xmm1, 16(%rdi)
17 ; SLOW-NEXT: vpextrq $1, %xmm0, 8(%rdi)
18 ; SLOW-NEXT: vmovq %xmm0, (%rdi)
14 ; SLOW-NEXT: vextractf128 $1, %ymm0, 16(%rdi)
15 ; SLOW-NEXT: vmovups %xmm0, (%rdi)
1916 ; SLOW-NEXT: movq %rdi, %rax
2017 ; SLOW-NEXT: vzeroupper
2118 ; SLOW-NEXT: retq
1414 ;
1515 ; CHECK-LABEL: PR22524:
1616 ; CHECK: # BB#0: # %entry
17 ; CHECK-NEXT: movl $0, 4(%rdi)
1817 ; CHECK-NEXT: xorl %eax, %eax
1918 ; CHECK-NEXT: movd %eax, %xmm0
2019 ; CHECK-NEXT: xorps %xmm1, %xmm1
2120 ; CHECK-NEXT: mulss %xmm0, %xmm1
22 ; CHECK-NEXT: movl $0, (%rdi)
21 ; CHECK-NEXT: movq $0, (%rdi)
2322 ; CHECK-NEXT: movss %xmm1, 4(%rdi)
2423 ; CHECK-NEXT: retq
2524 entry:
509509 }
510510
511511 define void @extract_f128_0(fp128* nocapture %dst, <2 x fp128> %foo) nounwind {
512 ; X32-LABEL: extract_f128_0:
513 ; X32: # BB#0:
514 ; X32-NEXT: pushl %edi
515 ; X32-NEXT: pushl %esi
516 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
517 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
518 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
519 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
520 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
521 ; X32-NEXT: movl %esi, 12(%edi)
522 ; X32-NEXT: movl %edx, 8(%edi)
523 ; X32-NEXT: movl %ecx, 4(%edi)
524 ; X32-NEXT: movl %eax, (%edi)
525 ; X32-NEXT: popl %esi
526 ; X32-NEXT: popl %edi
527 ; X32-NEXT: retl
512 ; SSE-X32-LABEL: extract_f128_0:
513 ; SSE-X32: # BB#0:
514 ; SSE-X32-NEXT: pushl %edi
515 ; SSE-X32-NEXT: pushl %esi
516 ; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
517 ; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
518 ; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %edx
519 ; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %esi
520 ; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %edi
521 ; SSE-X32-NEXT: movl %esi, 12(%edi)
522 ; SSE-X32-NEXT: movl %edx, 8(%edi)
523 ; SSE-X32-NEXT: movl %ecx, 4(%edi)
524 ; SSE-X32-NEXT: movl %eax, (%edi)
525 ; SSE-X32-NEXT: popl %esi
526 ; SSE-X32-NEXT: popl %edi
527 ; SSE-X32-NEXT: retl
528528 ;
529529 ; SSE2-X64-LABEL: extract_f128_0:
530530 ; SSE2-X64: # BB#0:
538538 ; SSE41-X64-NEXT: movq %rsi, (%rdi)
539539 ; SSE41-X64-NEXT: retq
540540 ;
541 ; AVX-X32-LABEL: extract_f128_0:
542 ; AVX-X32: # BB#0:
543 ; AVX-X32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0
544 ; AVX-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
545 ; AVX-X32-NEXT: vmovups %xmm0, (%eax)
546 ; AVX-X32-NEXT: retl
547 ;
541548 ; AVX-X64-LABEL: extract_f128_0:
542549 ; AVX-X64: # BB#0:
543550 ; AVX-X64-NEXT: movq %rdx, 8(%rdi)
554561 }
555562
556563 define void @extract_f128_1(fp128* nocapture %dst, <2 x fp128> %foo) nounwind {
557 ; X32-LABEL: extract_f128_1:
558 ; X32: # BB#0:
559 ; X32-NEXT: pushl %edi
560 ; X32-NEXT: pushl %esi
561 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
562 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
563 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
564 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
565 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
566 ; X32-NEXT: movl %esi, 12(%edi)
567 ; X32-NEXT: movl %edx, 8(%edi)
568 ; X32-NEXT: movl %ecx, 4(%edi)
569 ; X32-NEXT: movl %eax, (%edi)
570 ; X32-NEXT: popl %esi
571 ; X32-NEXT: popl %edi
572 ; X32-NEXT: retl
564 ; SSE-X32-LABEL: extract_f128_1:
565 ; SSE-X32: # BB#0:
566 ; SSE-X32-NEXT: pushl %edi
567 ; SSE-X32-NEXT: pushl %esi
568 ; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
569 ; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
570 ; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %edx
571 ; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %esi
572 ; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %edi
573 ; SSE-X32-NEXT: movl %esi, 12(%edi)
574 ; SSE-X32-NEXT: movl %edx, 8(%edi)
575 ; SSE-X32-NEXT: movl %ecx, 4(%edi)
576 ; SSE-X32-NEXT: movl %eax, (%edi)
577 ; SSE-X32-NEXT: popl %esi
578 ; SSE-X32-NEXT: popl %edi
579 ; SSE-X32-NEXT: retl
573580 ;
574581 ; SSE2-X64-LABEL: extract_f128_1:
575582 ; SSE2-X64: # BB#0:
582589 ; SSE41-X64-NEXT: movq %r8, 8(%rdi)
583590 ; SSE41-X64-NEXT: movq %rcx, (%rdi)
584591 ; SSE41-X64-NEXT: retq
592 ;
593 ; AVX-X32-LABEL: extract_f128_1:
594 ; AVX-X32: # BB#0:
595 ; AVX-X32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0
596 ; AVX-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
597 ; AVX-X32-NEXT: vmovups %xmm0, (%eax)
598 ; AVX-X32-NEXT: retl
585599 ;
586600 ; AVX-X64-LABEL: extract_f128_1:
587601 ; AVX-X64: # BB#0:
5252 ret <2 x i256> %Shuff
5353
5454 ; X64-LABEL: test_zext1
55 ; X64: movq $0
56 ; X64-NEXT: movq $0
55 ; X64: xorps %xmm0, %xmm0
56 ; X64: movaps %xmm0
57 ; X64: movaps %xmm0
58 ; X64: movaps %xmm0
5759 ; X64-NEXT: movq $0
5860 ; X64-NEXT: movq $254
5961
7476 ret <2 x i256> %Shuff
7577
7678 ; X64-LABEL: test_zext2
77 ; X64: movq $0
78 ; X64-NEXT: movq $0
79 ; X64: xorps %xmm0, %xmm0
80 ; X64-NEXT: movaps %xmm0
81 ; X64-NEXT: movaps %xmm0
82 ; X64-NEXT: movaps %xmm0
7983 ; X64-NEXT: movq $-1
8084 ; X64-NEXT: movq $-2
8185
2525 ;
2626 ; X64-LABEL: test_shl:
2727 ; X64: # BB#0:
28 ; X64-NEXT: movq $0, 56(%rdi)
29 ; X64-NEXT: movq $0, 48(%rdi)
30 ; X64-NEXT: movq $0, 40(%rdi)
31 ; X64-NEXT: movq $0, 32(%rdi)
32 ; X64-NEXT: movq $0, 24(%rdi)
33 ; X64-NEXT: movq $0, 16(%rdi)
34 ; X64-NEXT: movq $0, 8(%rdi)
35 ; X64-NEXT: movq $0, (%rdi)
28 ; X64-NEXT: xorps %xmm0, %xmm0
29 ; X64-NEXT: movaps %xmm0, 48(%rdi)
30 ; X64-NEXT: movaps %xmm0, 32(%rdi)
31 ; X64-NEXT: movaps %xmm0, 16(%rdi)
32 ; X64-NEXT: movaps %xmm0, (%rdi)
3633 ; X64-NEXT: movq %rdi, %rax
3734 ; X64-NEXT: retq
3835 %Amt = insertelement <2 x i256> undef, i256 -1, i32 0
6461 ;
6562 ; X64-LABEL: test_srl:
6663 ; X64: # BB#0:
67 ; X64-NEXT: movq $0, 56(%rdi)
68 ; X64-NEXT: movq $0, 48(%rdi)
69 ; X64-NEXT: movq $0, 40(%rdi)
70 ; X64-NEXT: movq $0, 32(%rdi)
71 ; X64-NEXT: movq $0, 24(%rdi)
72 ; X64-NEXT: movq $0, 16(%rdi)
73 ; X64-NEXT: movq $0, 8(%rdi)
74 ; X64-NEXT: movq $0, (%rdi)
64 ; X64-NEXT: xorps %xmm0, %xmm0
65 ; X64-NEXT: movaps %xmm0, 48(%rdi)
66 ; X64-NEXT: movaps %xmm0, 32(%rdi)
67 ; X64-NEXT: movaps %xmm0, 16(%rdi)
68 ; X64-NEXT: movaps %xmm0, (%rdi)
7569 ; X64-NEXT: movq %rdi, %rax
7670 ; X64-NEXT: retq
7771 %Amt = insertelement <2 x i256> undef, i256 -1, i32 0
525525 ;
526526 ; X32-SSE1-LABEL: merge_8i16_i16_23u567u9:
527527 ; X32-SSE1: # BB#0:
528 ; X32-SSE1-NEXT: pushl %ebp
528 ; X32-SSE1-NEXT: pushl %edi
529529 ; X32-SSE1-NEXT: .Lcfi6:
530530 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
531 ; X32-SSE1-NEXT: pushl %ebx
531 ; X32-SSE1-NEXT: pushl %esi
532532 ; X32-SSE1-NEXT: .Lcfi7:
533533 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
534 ; X32-SSE1-NEXT: pushl %edi
535534 ; X32-SSE1-NEXT: .Lcfi8:
536 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 16
537 ; X32-SSE1-NEXT: pushl %esi
535 ; X32-SSE1-NEXT: .cfi_offset %esi, -12
538536 ; X32-SSE1-NEXT: .Lcfi9:
539 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 20
540 ; X32-SSE1-NEXT: .Lcfi10:
541 ; X32-SSE1-NEXT: .cfi_offset %esi, -20
542 ; X32-SSE1-NEXT: .Lcfi11:
543 ; X32-SSE1-NEXT: .cfi_offset %edi, -16
544 ; X32-SSE1-NEXT: .Lcfi12:
545 ; X32-SSE1-NEXT: .cfi_offset %ebx, -12
546 ; X32-SSE1-NEXT: .Lcfi13:
547 ; X32-SSE1-NEXT: .cfi_offset %ebp, -8
548 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
549 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
550 ; X32-SSE1-NEXT: movzwl 4(%ecx), %edx
551 ; X32-SSE1-NEXT: movzwl 6(%ecx), %esi
552 ; X32-SSE1-NEXT: movzwl 10(%ecx), %edi
553 ; X32-SSE1-NEXT: movzwl 12(%ecx), %ebx
554 ; X32-SSE1-NEXT: movzwl 14(%ecx), %ebp
537 ; X32-SSE1-NEXT: .cfi_offset %edi, -8
538 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
539 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
540 ; X32-SSE1-NEXT: movl 4(%ecx), %edx
541 ; X32-SSE1-NEXT: movl 10(%ecx), %esi
542 ; X32-SSE1-NEXT: movzwl 14(%ecx), %edi
555543 ; X32-SSE1-NEXT: movzwl 18(%ecx), %ecx
556 ; X32-SSE1-NEXT: movw %bp, 10(%eax)
557 ; X32-SSE1-NEXT: movw %bx, 8(%eax)
544 ; X32-SSE1-NEXT: movw %di, 10(%eax)
558545 ; X32-SSE1-NEXT: movw %cx, 14(%eax)
559 ; X32-SSE1-NEXT: movw %si, 2(%eax)
560 ; X32-SSE1-NEXT: movw %dx, (%eax)
561 ; X32-SSE1-NEXT: movw %di, 6(%eax)
546 ; X32-SSE1-NEXT: movl %edx, (%eax)
547 ; X32-SSE1-NEXT: movl %esi, 6(%eax)
562548 ; X32-SSE1-NEXT: popl %esi
563549 ; X32-SSE1-NEXT: popl %edi
564 ; X32-SSE1-NEXT: popl %ebx
565 ; X32-SSE1-NEXT: popl %ebp
566550 ; X32-SSE1-NEXT: retl $4
567551 ;
568552 ; X32-SSE41-LABEL: merge_8i16_i16_23u567u9:
606590 ; X32-SSE1: # BB#0:
607591 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
608592 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
609 ; X32-SSE1-NEXT: movzwl 6(%ecx), %edx
610 ; X32-SSE1-NEXT: movzwl 8(%ecx), %ecx
611 ; X32-SSE1-NEXT: movw %cx, 2(%eax)
612 ; X32-SSE1-NEXT: movw %dx, (%eax)
593 ; X32-SSE1-NEXT: movl 6(%ecx), %ecx
594 ; X32-SSE1-NEXT: movl %ecx, (%eax)
613595 ; X32-SSE1-NEXT: retl $4
614596 ;
615597 ; X32-SSE41-LABEL: merge_8i16_i16_34uuuuuu:
639621 ;
640622 ; X32-SSE1-LABEL: merge_8i16_i16_45u7zzzz:
641623 ; X32-SSE1: # BB#0:
642 ; X32-SSE1-NEXT: pushl %esi
643 ; X32-SSE1-NEXT: .Lcfi14:
644 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
645 ; X32-SSE1-NEXT: .Lcfi15:
646 ; X32-SSE1-NEXT: .cfi_offset %esi, -8
647 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
648 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
649 ; X32-SSE1-NEXT: movzwl 8(%ecx), %edx
650 ; X32-SSE1-NEXT: movzwl 10(%ecx), %esi
624 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
625 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
626 ; X32-SSE1-NEXT: movl 8(%ecx), %edx
651627 ; X32-SSE1-NEXT: movzwl 14(%ecx), %ecx
652 ; X32-SSE1-NEXT: movw %si, 2(%eax)
653 ; X32-SSE1-NEXT: movw %dx, (%eax)
628 ; X32-SSE1-NEXT: movl %edx, (%eax)
654629 ; X32-SSE1-NEXT: movw %cx, 6(%eax)
655 ; X32-SSE1-NEXT: movw $0, 14(%eax)
656 ; X32-SSE1-NEXT: movw $0, 12(%eax)
657 ; X32-SSE1-NEXT: movw $0, 10(%eax)
658 ; X32-SSE1-NEXT: movw $0, 8(%eax)
659 ; X32-SSE1-NEXT: popl %esi
630 ; X32-SSE1-NEXT: movl $0, 12(%eax)
631 ; X32-SSE1-NEXT: movl $0, 8(%eax)
660632 ; X32-SSE1-NEXT: retl $4
661633 ;
662634 ; X32-SSE41-LABEL: merge_8i16_i16_45u7zzzz:
693665 ;
694666 ; X32-SSE1-LABEL: merge_16i8_i8_01u3456789ABCDuF:
695667 ; X32-SSE1: # BB#0:
668 ; X32-SSE1-NEXT: pushl %ebp
669 ; X32-SSE1-NEXT: .Lcfi10:
670 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
696671 ; X32-SSE1-NEXT: pushl %ebx
672 ; X32-SSE1-NEXT: .Lcfi11:
673 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
674 ; X32-SSE1-NEXT: pushl %edi
675 ; X32-SSE1-NEXT: .Lcfi12:
676 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 16
677 ; X32-SSE1-NEXT: pushl %esi
678 ; X32-SSE1-NEXT: .Lcfi13:
679 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 20
680 ; X32-SSE1-NEXT: .Lcfi14:
681 ; X32-SSE1-NEXT: .cfi_offset %esi, -20
682 ; X32-SSE1-NEXT: .Lcfi15:
683 ; X32-SSE1-NEXT: .cfi_offset %edi, -16
697684 ; X32-SSE1-NEXT: .Lcfi16:
698 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
699 ; X32-SSE1-NEXT: subl $12, %esp
685 ; X32-SSE1-NEXT: .cfi_offset %ebx, -12
700686 ; X32-SSE1-NEXT: .Lcfi17:
701 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 20
702 ; X32-SSE1-NEXT: .Lcfi18:
703 ; X32-SSE1-NEXT: .cfi_offset %ebx, -8
704 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
705 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
706 ; X32-SSE1-NEXT: movb (%ecx), %dl
707 ; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
708 ; X32-SSE1-NEXT: movb 1(%ecx), %dl
709 ; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
710 ; X32-SSE1-NEXT: movb 3(%ecx), %dl
711 ; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
712 ; X32-SSE1-NEXT: movb 4(%ecx), %dl
713 ; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
714 ; X32-SSE1-NEXT: movb 5(%ecx), %dl
715 ; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
716 ; X32-SSE1-NEXT: movb 6(%ecx), %dl
717 ; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
718 ; X32-SSE1-NEXT: movb 7(%ecx), %dl
719 ; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
720 ; X32-SSE1-NEXT: movb 8(%ecx), %dl
721 ; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
722 ; X32-SSE1-NEXT: movb 9(%ecx), %dl
723 ; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
724 ; X32-SSE1-NEXT: movb 10(%ecx), %bh
725 ; X32-SSE1-NEXT: movb 11(%ecx), %bl
726 ; X32-SSE1-NEXT: movb 12(%ecx), %dh
687 ; X32-SSE1-NEXT: .cfi_offset %ebp, -8
688 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
689 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
690 ; X32-SSE1-NEXT: movzwl (%ecx), %ebp
691 ; X32-SSE1-NEXT: movl 3(%ecx), %esi
692 ; X32-SSE1-NEXT: movl 7(%ecx), %edi
693 ; X32-SSE1-NEXT: movzwl 11(%ecx), %ebx
727694 ; X32-SSE1-NEXT: movb 13(%ecx), %dl
728695 ; X32-SSE1-NEXT: movb 15(%ecx), %cl
729696 ; X32-SSE1-NEXT: movb %dl, 13(%eax)
730 ; X32-SSE1-NEXT: movb %dh, 12(%eax)
731697 ; X32-SSE1-NEXT: movb %cl, 15(%eax)
732 ; X32-SSE1-NEXT: movb %bl, 11(%eax)
733 ; X32-SSE1-NEXT: movb %bh, 10(%eax)
734 ; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
735 ; X32-SSE1-NEXT: movb %cl, 9(%eax)
736 ; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
737 ; X32-SSE1-NEXT: movb %cl, 8(%eax)
738 ; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
739 ; X32-SSE1-NEXT: movb %cl, 7(%eax)
740 ; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
741 ; X32-SSE1-NEXT: movb %cl, 6(%eax)
742 ; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
743 ; X32-SSE1-NEXT: movb %cl, 5(%eax)
744 ; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
745 ; X32-SSE1-NEXT: movb %cl, 4(%eax)
746 ; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
747 ; X32-SSE1-NEXT: movb %cl, 1(%eax)
748 ; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
749 ; X32-SSE1-NEXT: movb %cl, (%eax)
750 ; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
751 ; X32-SSE1-NEXT: movb %cl, 3(%eax)
752 ; X32-SSE1-NEXT: addl $12, %esp
698 ; X32-SSE1-NEXT: movw %bx, 11(%eax)
699 ; X32-SSE1-NEXT: movl %edi, 7(%eax)
700 ; X32-SSE1-NEXT: movw %bp, (%eax)
701 ; X32-SSE1-NEXT: movl %esi, 3(%eax)
702 ; X32-SSE1-NEXT: popl %esi
703 ; X32-SSE1-NEXT: popl %edi
753704 ; X32-SSE1-NEXT: popl %ebx
705 ; X32-SSE1-NEXT: popl %ebp
754706 ; X32-SSE1-NEXT: retl $4
755707 ;
756708 ; X32-SSE41-LABEL: merge_16i8_i8_01u3456789ABCDuF:
818770 ; X32-SSE1: # BB#0:
819771 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
820772 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
821 ; X32-SSE1-NEXT: movb (%ecx), %dl
822 ; X32-SSE1-NEXT: movb 1(%ecx), %dh
773 ; X32-SSE1-NEXT: movzwl (%ecx), %edx
823774 ; X32-SSE1-NEXT: movb 3(%ecx), %cl
824 ; X32-SSE1-NEXT: movb %dh, 1(%eax)
825 ; X32-SSE1-NEXT: movb %dl, (%eax)
775 ; X32-SSE1-NEXT: movw %dx, (%eax)
826776 ; X32-SSE1-NEXT: movb %cl, 3(%eax)
827777 ; X32-SSE1-NEXT: movb $0, 15(%eax)
828 ; X32-SSE1-NEXT: movb $0, 14(%eax)
829 ; X32-SSE1-NEXT: movb $0, 13(%eax)
830 ; X32-SSE1-NEXT: movb $0, 7(%eax)
831 ; X32-SSE1-NEXT: movb $0, 6(%eax)
778 ; X32-SSE1-NEXT: movw $0, 13(%eax)
779 ; X32-SSE1-NEXT: movw $0, 6(%eax)
832780 ; X32-SSE1-NEXT: retl $4
833781 ;
834782 ; X32-SSE41-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
866814 ;
867815 ; X32-SSE1-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
868816 ; X32-SSE1: # BB#0:
869 ; X32-SSE1-NEXT: pushl %ebx
870 ; X32-SSE1-NEXT: .Lcfi19:
871 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
872 ; X32-SSE1-NEXT: pushl %eax
873 ; X32-SSE1-NEXT: .Lcfi20:
874 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
875 ; X32-SSE1-NEXT: .Lcfi21:
876 ; X32-SSE1-NEXT: .cfi_offset %ebx, -8
877 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
878 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
879 ; X32-SSE1-NEXT: movb (%ecx), %dl
880 ; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
881 ; X32-SSE1-NEXT: movb 1(%ecx), %dh
882 ; X32-SSE1-NEXT: movb 2(%ecx), %bl
883 ; X32-SSE1-NEXT: movb 3(%ecx), %bh
884 ; X32-SSE1-NEXT: movb 6(%ecx), %dl
885 ; X32-SSE1-NEXT: movb 7(%ecx), %cl
886 ; X32-SSE1-NEXT: movb %cl, 7(%eax)
887 ; X32-SSE1-NEXT: movb %dl, 6(%eax)
888 ; X32-SSE1-NEXT: movb %bh, 3(%eax)
889 ; X32-SSE1-NEXT: movb %bl, 2(%eax)
890 ; X32-SSE1-NEXT: movb %dh, 1(%eax)
891 ; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
892 ; X32-SSE1-NEXT: movb %cl, (%eax)
817 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
818 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
819 ; X32-SSE1-NEXT: movl (%ecx), %edx
820 ; X32-SSE1-NEXT: movzwl 6(%ecx), %ecx
821 ; X32-SSE1-NEXT: movw %cx, 6(%eax)
822 ; X32-SSE1-NEXT: movl %edx, (%eax)
893823 ; X32-SSE1-NEXT: movb $0, 15(%eax)
894 ; X32-SSE1-NEXT: movb $0, 14(%eax)
895 ; X32-SSE1-NEXT: movb $0, 13(%eax)
896 ; X32-SSE1-NEXT: addl $4, %esp
897 ; X32-SSE1-NEXT: popl %ebx
824 ; X32-SSE1-NEXT: movw $0, 13(%eax)
898825 ; X32-SSE1-NEXT: retl $4
899826 ;
900827 ; X32-SSE41-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
989916 ; X32-SSE1-LABEL: merge_2i64_i64_12_volatile:
990917 ; X32-SSE1: # BB#0:
991918 ; X32-SSE1-NEXT: pushl %edi
992 ; X32-SSE1-NEXT: .Lcfi22:
919 ; X32-SSE1-NEXT: .Lcfi18:
993920 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
994921 ; X32-SSE1-NEXT: pushl %esi
995 ; X32-SSE1-NEXT: .Lcfi23:
922 ; X32-SSE1-NEXT: .Lcfi19:
996923 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
997 ; X32-SSE1-NEXT: .Lcfi24:
924 ; X32-SSE1-NEXT: .Lcfi20:
998925 ; X32-SSE1-NEXT: .cfi_offset %esi, -12
999 ; X32-SSE1-NEXT: .Lcfi25:
926 ; X32-SSE1-NEXT: .Lcfi21:
1000927 ; X32-SSE1-NEXT: .cfi_offset %edi, -8
1001928 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
1002929 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
1212 ; X86-NEXT: movb [[HI1]], 3([[BASEREG]])
1313 ; X86-NEXT: retq
1414
15 ; DBGDAG-LABEL: Optimized lowered selection DAG: BB#0 'merge_store_partial_overlap_load:'
15 ; DBGDAG-LABEL: Optimized legalized selection DAG: BB#0 'merge_store_partial_overlap_load:'
1616 ; DBGDAG: [[ENTRYTOKEN:t[0-9]+]]: ch = EntryToken
1717 ; DBGDAG-DAG: [[BASEPTR:t[0-9]+]]: i64,ch = CopyFromReg [[ENTRYTOKEN]],
1818 ; DBGDAG-DAG: [[ADDPTR:t[0-9]+]]: i64 = add [[BASEPTR]], Constant:i64<2>
2626
2727 ; DBGDAG: X86ISD::RET_FLAG t{{[0-9]+}},
2828
29 ; DBGDAG: Type-legalized selection DAG: BB#0 'merge_store_partial_overlap_load:'
29 ; DBGDAG-LABEL: Instruction selection begins
3030 define void @merge_store_partial_overlap_load([4 x i8]* %tmp) {
3131 %tmp8 = getelementptr inbounds [4 x i8], [4 x i8]* %tmp, i32 0, i8 0
3232 %tmp10 = getelementptr inbounds [4 x i8], [4 x i8]* %tmp, i32 0, i8 1
44 define <16 x i8> @PR27973() {
55 ; CHECK-LABEL: PR27973:
66 ; CHECK: # BB#0:
7 ; CHECK-NEXT: movb $0, 15(%rdi)
8 ; CHECK-NEXT: movb $0, 14(%rdi)
9 ; CHECK-NEXT: movb $0, 13(%rdi)
10 ; CHECK-NEXT: movb $0, 12(%rdi)
11 ; CHECK-NEXT: movb $0, 11(%rdi)
12 ; CHECK-NEXT: movb $0, 10(%rdi)
13 ; CHECK-NEXT: movb $0, 9(%rdi)
14 ; CHECK-NEXT: movb $0, 8(%rdi)
15 ; CHECK-NEXT: movb $0, 7(%rdi)
16 ; CHECK-NEXT: movb $0, 6(%rdi)
17 ; CHECK-NEXT: movb $0, 5(%rdi)
18 ; CHECK-NEXT: movb $0, 4(%rdi)
19 ; CHECK-NEXT: movb $0, 3(%rdi)
20 ; CHECK-NEXT: movb $0, 2(%rdi)
21 ; CHECK-NEXT: movb $0, 1(%rdi)
22 ; CHECK-NEXT: movb $0, (%rdi)
7 ; CHECK-NEXT: movq $0, 8(%rdi)
8 ; CHECK-NEXT: movq $0, (%rdi)
239 ; CHECK-NEXT: movq %rdi, %rax
2410 ; CHECK-NEXT: retq
2511 %t0 = zext <16 x i8> zeroinitializer to <16 x i32>
1313 ;
1414 ; CHECK-DAG: movq {{.*}}, 192(%rsp)
1515 ; CHECK-DAG: movq {{.*}}, 184(%rsp)
16 ; CHECK-DAG: movl {{.*}}, 180(%rsp)
17 ; CHECK-DAG: movl {{.*}}, 176(%rsp)
16 ; CHECK-DAG: movq {{.*}}, 176(%rsp)
1817 %ap3 = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %ap, i64 0, i64 0; <%struct.__va_list_tag*> [#uses=1]
1918 call void @bar(%struct.__va_list_tag* %ap3) nounwind
2019 call void @llvm.va_end(i8* %ap12)
1212 ;; the same result in memory in the end.
1313
1414 ; CHECK-LABEL: redundant_stores_merging:
15 ; CHECK: movabsq $528280977409, %rax
16 ; CHECK: movq %rax, e+4(%rip)
17 ; CHECK: movl $456, e+8(%rip)
15 ; CHECK: movabsq $1958505086977, %rax
16 ; CHECK: movq %rax, e+4(%rip)
1817 define void @redundant_stores_merging() {
1918 entry:
2019 store i32 1, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 1), align 4
2524
2625 ;; This variant tests PR25154.
2726 ; CHECK-LABEL: redundant_stores_merging_reverse:
28 ; CHECK: movabsq $528280977409, %rax
29 ; CHECK: movq %rax, e+4(%rip)
30 ; CHECK: movl $456, e+8(%rip)
27 ; CHECK: movabsq $1958505086977, %rax
28 ; CHECK: movq %rax, e+4(%rip)
3129 define void @redundant_stores_merging_reverse() {
3230 entry:
3331 store i32 123, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 2), align 4