llvm.org GIT mirror llvm / 8f57a78
[DAG] Rewrite areNonVolatileConsecutiveLoads to use BaseIndexOffset As discussed in D34087, rewrite areNonVolatileConsecutiveLoads using generic checks. Also, propagate missing local handling from there to BaseIndexOffset checks. Tests of note: * test/CodeGen/X86/build-vector* - Improved. * test/CodeGen/BPF/undef.ll - Improved store alignment allows an additional store merge * test/CodeGen/X86/clear_upper_vector_element_bits.ll - This is a case we already do not handle well. Here, the DAG is improved, but scheduling causes a code size degradation. Reviewers: RKSimon, craig.topper, spatel, andreadb, filcab Subscribers: nemanjai, llvm-commits Differential Revision: https://reviews.llvm.org/D34472 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@306819 91177308-0d34-0410-b5e6-96231b3b80d8 Nirav Dave 3 years ago
10 changed file(s) with 185 addition(s) and 256 deletion(s). Raw diff Collapse all Expand all
3333 #include "llvm/CodeGen/MachineMemOperand.h"
3434 #include "llvm/CodeGen/MachineValueType.h"
3535 #include "llvm/CodeGen/RuntimeLibcalls.h"
36 #include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
3637 #include "llvm/CodeGen/SelectionDAGNodes.h"
3738 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
3839 #include "llvm/CodeGen/ValueTypes.h"
76297630
76307631 SDValue Loc = LD->getOperand(1);
76317632 SDValue BaseLoc = Base->getOperand(1);
7632 if (Loc.getOpcode() == ISD::FrameIndex) {
7633 if (BaseLoc.getOpcode() != ISD::FrameIndex)
7634 return false;
7635 const MachineFrameInfo &MFI = getMachineFunction().getFrameInfo();
7636 int FI = cast&lt;FrameIndexSDNode&gt;(Loc)->getIndex();
7637 int BFI = cast&lt;FrameIndexSDNode&gt;(BaseLoc)->getIndex();
7638 int FS = MFI.getObjectSize(FI);
7639 int BFS = MFI.getObjectSize(BFI);
7640 if (FS != BFS || FS != (int)Bytes) return false;
7641 return MFI.getObjectOffset(FI) == (MFI.getObjectOffset(BFI) + Dist*Bytes);
7642 }
7643
7644 // Handle X + C.
7645 if (isBaseWithConstantOffset(Loc)) {
7646 int64_t LocOffset = cast&lt;ConstantSDNode&gt;(Loc.getOperand(1))->getSExtValue();
7647 if (Loc.getOperand(0) == BaseLoc) {
7648 // If the base location is a simple address with no offset itself, then
7649 // the second load's first add operand should be the base address.
7650 if (LocOffset == Dist * (int)Bytes)
7651 return true;
7652 } else if (isBaseWithConstantOffset(BaseLoc)) {
7653 // The base location itself has an offset, so subtract that value from the
7654 // second load's offset before comparing to distance * size.
7655 int64_t BOffset =
7656 cast&lt;ConstantSDNode&gt;(BaseLoc.getOperand(1))->getSExtValue();
7657 if (Loc.getOperand(0) == BaseLoc.getOperand(0)) {
7658 if ((LocOffset - BOffset) == Dist * (int)Bytes)
7659 return true;
7660 }
7661 }
7662 }
7663 const GlobalValue *GV1 = nullptr;
7664 const GlobalValue *GV2 = nullptr;
7665 int64_t Offset1 = 0;
7666 int64_t Offset2 = 0;
7667 bool isGA1 = TLI->isGAPlusOffset(Loc.getNode(), GV1, Offset1);
7668 bool isGA2 = TLI->isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2);
7669 if (isGA1 && isGA2 && GV1 == GV2)
7670 return Offset1 == (Offset2 + Dist*Bytes);
7633
7634 auto BaseLocDecomp = BaseIndexOffset::match(BaseLoc, *this);
7635 auto LocDecomp = BaseIndexOffset::match(Loc, *this);
7636
7637 int64_t Offset = 0;
7638 if (BaseLocDecomp.equalBaseIndex(LocDecomp, *this, Offset))
7639 return (Dist * Bytes == Offset);
76717640 return false;
76727641 }
76737642
5959 int64_t Offset = 0;
6060 bool IsIndexSignExt = false;
6161
62 // Consume constant adds
63 while (Base->getOpcode() == ISD::ADD &&
64 isa&lt;ConstantSDNode&gt;(Base->getOperand(1))) {
65 int64_t POffset = cast&lt;ConstantSDNode&gt;(Base->getOperand(1))->getSExtValue();
66 Offset += POffset;
67 Base = Base->getOperand(0);
62 // Consume constant adds & ors with appropriate masking.
63 while (Base->getOpcode() == ISD::ADD || Base->getOpcode() == ISD::OR) {
64 if (auto *C = dyn_cast&lt;ConstantSDNode&gt;(Base->getOperand(1))) {
65 // Only consider ORs which act as adds.
66 if (Base->getOpcode() == ISD::OR &&
67 !DAG.MaskedValueIsZero(Base->getOperand(0), C->getAPIntValue()))
68 break;
69 Offset += C->getSExtValue();
70 Base = Base->getOperand(0);
71 continue;
72 }
73 break;
6874 }
6975
7076 if (Base->getOpcode() == ISD::ADD) {
1212
1313 ; Function Attrs: nounwind uwtable
1414 define i32 @ebpf_filter(%struct.__sk_buff* nocapture readnone %ebpf_packet) #0 section "socket1" {
15 ; CHECK: r2 = r10
16 ; CHECK: r2 += -2
17 ; CHECK: r1 = 0
18 ; CHECK: *(u16 *)(r2 + 6) = r1
19 ; CHECK: *(u16 *)(r2 + 4) = r1
20 ; CHECK: *(u16 *)(r2 + 2) = r1
21 ; CHECK: r2 = 6
22 ; CHECK: *(u8 *)(r10 - 7) = r2
23 ; CHECK: r2 = 5
24 ; CHECK: *(u8 *)(r10 - 8) = r2
25 ; CHECK: r2 = 7
26 ; CHECK: *(u8 *)(r10 - 6) = r2
27 ; CHECK: r2 = 8
28 ; CHECK: *(u8 *)(r10 - 5) = r2
29 ; CHECK: r2 = 9
30 ; CHECK: *(u8 *)(r10 - 4) = r2
31 ; CHECK: r2 = 10
32 ; CHECK: *(u8 *)(r10 - 3) = r2
33 ; CHECK: *(u16 *)(r10 + 24) = r1
34 ; CHECK: *(u16 *)(r10 + 22) = r1
35 ; CHECK: *(u16 *)(r10 + 20) = r1
36 ; CHECK: *(u16 *)(r10 + 18) = r1
37 ; CHECK: *(u16 *)(r10 + 16) = r1
38 ; CHECK: *(u16 *)(r10 + 14) = r1
39 ; CHECK: *(u16 *)(r10 + 12) = r1
40 ; CHECK: *(u16 *)(r10 + 10) = r1
41 ; CHECK: *(u16 *)(r10 + 8) = r1
42 ; CHECK: *(u16 *)(r10 + 6) = r1
43 ; CHECK: *(u16 *)(r10 - 2) = r1
44 ; CHECK: *(u16 *)(r10 + 26) = r1
15 ; CHECK: r1 = r10
16 ; CHECK: r1 += -2
17 ; CHECK: r2 = 0
18 ; CHECK: *(u16 *)(r1 + 6) = r2
19 ; CHECK: *(u16 *)(r1 + 4) = r2
20 ; CHECK: *(u16 *)(r1 + 2) = r2
21 ; CHECK: r1 = 134678021
22 ; CHECK: *(u32 *)(r10 - 8) = r1
23 ; CHECK: r1 = 9
24 ; CHECK: *(u8 *)(r10 - 4) = r1
25 ; CHECK: r1 = 10
26 ; CHECK: *(u8 *)(r10 - 3) = r1
27 ; CHECK: *(u16 *)(r10 + 24) = r2
28 ; CHECK: *(u16 *)(r10 + 22) = r2
29 ; CHECK: *(u16 *)(r10 + 20) = r2
30 ; CHECK: *(u16 *)(r10 + 18) = r2
31 ; CHECK: *(u16 *)(r10 + 16) = r2
32 ; CHECK: *(u16 *)(r10 + 14) = r2
33 ; CHECK: *(u16 *)(r10 + 12) = r2
34 ; CHECK: *(u16 *)(r10 + 10) = r2
35 ; CHECK: *(u16 *)(r10 + 8) = r2
36 ; CHECK: *(u16 *)(r10 + 6) = r2
37 ; CHECK: *(u16 *)(r10 - 2) = r2
38 ; CHECK: *(u16 *)(r10 + 26) = r2
4539 ; CHECK: r2 = r10
4640 ; CHECK: r2 += -8
4741 ; CHECK: r1 = ll
6363 %0 = load i16, i16* %retval ; [#uses=1]
6464 ret i16 %0
6565 ; CHECK-LABEL: mov2:
66 ; CHECK: mov.w 0(r1), 4(r1)
67 ; CHECK: mov.w 2(r1), 6(r1)
66 ; CHECK-DAG: mov.w 2(r1), 6(r1)
67 ; CHECK-DAG: mov.w 0(r1), 4(r1)
6868 }
7070 ; X86-MOVBE-NEXT: movl {{[0-9]+}}(%esp), %edi
7171 ; X86-MOVBE-NEXT: movbel %esi, 12(%eax)
7272 ; X86-MOVBE-NEXT: movbel %edi, 8(%eax)
73 ; X86-MOVBE-NEXT: movbel %ecx, 4(%eax)
74 ; X86-MOVBE-NEXT: movbel %edx, (%eax)
73 ; X86-MOVBE-NEXT: movbel %edx, 4(%eax)
74 ; X86-MOVBE-NEXT: movbel %ecx, (%eax)
7575 ; X86-MOVBE-NEXT: popl %esi
7676 ; X86-MOVBE-NEXT: popl %edi
7777 ; X86-MOVBE-NEXT: retl $4
7171 }
7272
7373 define <2 x i64> @test_buildvector_v2i64(i64 %a0, i64 %a1) {
74 ; SSE2-32-LABEL: test_buildvector_v2i64:
75 ; SSE2-32: # BB#0:
76 ; SSE2-32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
77 ; SSE2-32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
78 ; SSE2-32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
79 ; SSE2-32-NEXT: retl
74 ; SSE-32-LABEL: test_buildvector_v2i64:
75 ; SSE-32: # BB#0:
76 ; SSE-32-NEXT: movups {{[0-9]+}}(%esp), %xmm0
77 ; SSE-32-NEXT: retl
8078 ;
8179 ; SSE-64-LABEL: test_buildvector_v2i64:
8280 ; SSE-64: # BB#0:
8583 ; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
8684 ; SSE-64-NEXT: retq
8785 ;
88 ; SSE41-32-LABEL: test_buildvector_v2i64:
89 ; SSE41-32: # BB#0:
90 ; SSE41-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
91 ; SSE41-32-NEXT: pinsrd $1, {{[0-9]+}}(%esp), %xmm0
92 ; SSE41-32-NEXT: pinsrd $2, {{[0-9]+}}(%esp), %xmm0
93 ; SSE41-32-NEXT: pinsrd $3, {{[0-9]+}}(%esp), %xmm0
94 ; SSE41-32-NEXT: retl
95 ;
9686 ; AVX-32-LABEL: test_buildvector_v2i64:
9787 ; AVX-32: # BB#0:
98 ; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
99 ; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
100 ; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
101 ; AVX-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
88 ; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0
10289 ; AVX-32-NEXT: retl
10390 ;
10491 ; AVX-64-LABEL: test_buildvector_v2i64:
5050 }
5151
5252 define <4 x i64> @test_buildvector_v4i64(i64 %a0, i64 %a1, i64 %a2, i64 %a3) {
53 ; AVX1-32-LABEL: test_buildvector_v4i64:
54 ; AVX1-32: # BB#0:
55 ; AVX1-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
56 ; AVX1-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
57 ; AVX1-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
58 ; AVX1-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
59 ; AVX1-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
60 ; AVX1-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
61 ; AVX1-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
62 ; AVX1-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
63 ; AVX1-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
64 ; AVX1-32-NEXT: retl
53 ; AVX-32-LABEL: test_buildvector_v4i64:
54 ; AVX-32: # BB#0:
55 ; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %ymm0
56 ; AVX-32-NEXT: retl
6557 ;
6658 ; AVX1-64-LABEL: test_buildvector_v4i64:
6759 ; AVX1-64: # BB#0:
7365 ; AVX1-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
7466 ; AVX1-64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
7567 ; AVX1-64-NEXT: retq
76 ;
77 ; AVX2-32-LABEL: test_buildvector_v4i64:
78 ; AVX2-32: # BB#0:
79 ; AVX2-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
80 ; AVX2-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
81 ; AVX2-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
82 ; AVX2-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
83 ; AVX2-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
84 ; AVX2-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
85 ; AVX2-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
86 ; AVX2-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
87 ; AVX2-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
88 ; AVX2-32-NEXT: retl
8968 ;
9069 ; AVX2-64-LABEL: test_buildvector_v4i64:
9170 ; AVX2-64: # BB#0:
7878 define <8 x i64> @test_buildvector_v8i64(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, i64 %a7) {
7979 ; AVX-32-LABEL: test_buildvector_v8i64:
8080 ; AVX-32: # BB#0:
81 ; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
82 ; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
83 ; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
84 ; AVX-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
85 ; AVX-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
86 ; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
87 ; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
88 ; AVX-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
89 ; AVX-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
90 ; AVX-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
91 ; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
92 ; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
93 ; AVX-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
94 ; AVX-32-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
95 ; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm2, %xmm2
96 ; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm2, %xmm2
97 ; AVX-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm2, %xmm2
98 ; AVX-32-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
99 ; AVX-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
81 ; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %zmm0
10082 ; AVX-32-NEXT: retl
10183 ;
10284 ; AVX-64-LABEL: test_buildvector_v8i64:
10621062 ;
10631063 ; AVX1-LABEL: _clearupper32xi8b:
10641064 ; AVX1: # BB#0:
1065 ; AVX1-NEXT: pushq %rbp
1066 ; AVX1-NEXT: pushq %r15
10651067 ; AVX1-NEXT: pushq %r14
1068 ; AVX1-NEXT: pushq %r13
1069 ; AVX1-NEXT: pushq %r12
10661070 ; AVX1-NEXT: pushq %rbx
1067 ; AVX1-NEXT: vpextrq $1, %xmm0, -{{[0-9]+}}(%rsp)
1068 ; AVX1-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
1069 ; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %r14
1071 ; AVX1-NEXT: vmovq %xmm0, %rcx
1072 ; AVX1-NEXT: movq %rcx, %r8
1073 ; AVX1-NEXT: movq %rcx, %r9
1074 ; AVX1-NEXT: movq %rcx, %r10
1075 ; AVX1-NEXT: movq %rcx, %r11
1076 ; AVX1-NEXT: movq %rcx, %r14
1077 ; AVX1-NEXT: movq %rcx, %r15
10701078 ; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
1071 ; AVX1-NEXT: movq %rdx, %r8
1072 ; AVX1-NEXT: movq %rdx, %r9
1073 ; AVX1-NEXT: movq %rdx, %r11
1079 ; AVX1-NEXT: movq %rdx, %r12
1080 ; AVX1-NEXT: movq %rdx, %r13
1081 ; AVX1-NEXT: movq %rdx, %rbx
1082 ; AVX1-NEXT: movq %rdx, %rax
1083 ; AVX1-NEXT: movq %rdx, %rdi
10741084 ; AVX1-NEXT: movq %rdx, %rsi
1075 ; AVX1-NEXT: movq %rdx, %rdi
1076 ; AVX1-NEXT: movq %rdx, %rcx
1077 ; AVX1-NEXT: movq %rdx, %rax
1085 ; AVX1-NEXT: movq %rdx, %rbp
10781086 ; AVX1-NEXT: andb $15, %dl
10791087 ; AVX1-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
1080 ; AVX1-NEXT: shrq $56, %rax
1081 ; AVX1-NEXT: andb $15, %al
1082 ; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
1083 ; AVX1-NEXT: movq %r14, %r10
1084 ; AVX1-NEXT: shrq $48, %rcx
1088 ; AVX1-NEXT: movq %rcx, %rdx
10851089 ; AVX1-NEXT: andb $15, %cl
10861090 ; AVX1-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
1087 ; AVX1-NEXT: movq %r14, %rdx
1091 ; AVX1-NEXT: shrq $56, %rbp
1092 ; AVX1-NEXT: andb $15, %bpl
1093 ; AVX1-NEXT: movb %bpl, -{{[0-9]+}}(%rsp)
1094 ; AVX1-NEXT: shrq $48, %rsi
1095 ; AVX1-NEXT: andb $15, %sil
1096 ; AVX1-NEXT: movb %sil, -{{[0-9]+}}(%rsp)
10881097 ; AVX1-NEXT: shrq $40, %rdi
10891098 ; AVX1-NEXT: andb $15, %dil
10901099 ; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
1091 ; AVX1-NEXT: movq %r14, %rax
1092 ; AVX1-NEXT: shrq $32, %rsi
1093 ; AVX1-NEXT: andb $15, %sil
1094 ; AVX1-NEXT: movb %sil, -{{[0-9]+}}(%rsp)
1095 ; AVX1-NEXT: movq %r14, %rcx
1096 ; AVX1-NEXT: shrq $24, %r11
1100 ; AVX1-NEXT: shrq $32, %rax
1101 ; AVX1-NEXT: andb $15, %al
1102 ; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
1103 ; AVX1-NEXT: shrq $24, %rbx
1104 ; AVX1-NEXT: andb $15, %bl
1105 ; AVX1-NEXT: movb %bl, -{{[0-9]+}}(%rsp)
1106 ; AVX1-NEXT: shrq $16, %r13
1107 ; AVX1-NEXT: andb $15, %r13b
1108 ; AVX1-NEXT: movb %r13b, -{{[0-9]+}}(%rsp)
1109 ; AVX1-NEXT: shrq $8, %r12
1110 ; AVX1-NEXT: andb $15, %r12b
1111 ; AVX1-NEXT: movb %r12b, -{{[0-9]+}}(%rsp)
1112 ; AVX1-NEXT: shrq $8, %r8
1113 ; AVX1-NEXT: shrq $16, %r9
1114 ; AVX1-NEXT: shrq $24, %r10
1115 ; AVX1-NEXT: shrq $32, %r11
1116 ; AVX1-NEXT: shrq $40, %r14
1117 ; AVX1-NEXT: shrq $48, %r15
1118 ; AVX1-NEXT: shrq $56, %rdx
1119 ; AVX1-NEXT: andb $15, %dl
1120 ; AVX1-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
1121 ; AVX1-NEXT: andb $15, %r15b
1122 ; AVX1-NEXT: movb %r15b, -{{[0-9]+}}(%rsp)
1123 ; AVX1-NEXT: andb $15, %r14b
1124 ; AVX1-NEXT: movb %r14b, -{{[0-9]+}}(%rsp)
10971125 ; AVX1-NEXT: andb $15, %r11b
10981126 ; AVX1-NEXT: movb %r11b, -{{[0-9]+}}(%rsp)
1099 ; AVX1-NEXT: movq %r14, %rsi
1100 ; AVX1-NEXT: shrq $16, %r9
1127 ; AVX1-NEXT: andb $15, %r10b
1128 ; AVX1-NEXT: movb %r10b, -{{[0-9]+}}(%rsp)
11011129 ; AVX1-NEXT: andb $15, %r9b
11021130 ; AVX1-NEXT: movb %r9b, -{{[0-9]+}}(%rsp)
1103 ; AVX1-NEXT: movq %r14, %rdi
1104 ; AVX1-NEXT: shrq $8, %r8
11051131 ; AVX1-NEXT: andb $15, %r8b
11061132 ; AVX1-NEXT: movb %r8b, -{{[0-9]+}}(%rsp)
1107 ; AVX1-NEXT: movq %r14, %rbx
1108 ; AVX1-NEXT: andb $15, %r14b
1109 ; AVX1-NEXT: movb %r14b, -{{[0-9]+}}(%rsp)
1110 ; AVX1-NEXT: shrq $8, %r10
1111 ; AVX1-NEXT: shrq $16, %rdx
1112 ; AVX1-NEXT: shrq $24, %rax
1113 ; AVX1-NEXT: shrq $32, %rcx
1114 ; AVX1-NEXT: shrq $40, %rsi
1115 ; AVX1-NEXT: shrq $48, %rdi
1116 ; AVX1-NEXT: shrq $56, %rbx
1117 ; AVX1-NEXT: andb $15, %bl
1118 ; AVX1-NEXT: movb %bl, -{{[0-9]+}}(%rsp)
1119 ; AVX1-NEXT: andb $15, %dil
1120 ; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
1121 ; AVX1-NEXT: andb $15, %sil
1122 ; AVX1-NEXT: movb %sil, -{{[0-9]+}}(%rsp)
1123 ; AVX1-NEXT: andb $15, %cl
1124 ; AVX1-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
1125 ; AVX1-NEXT: andb $15, %al
1126 ; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
1127 ; AVX1-NEXT: andb $15, %dl
1128 ; AVX1-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
1129 ; AVX1-NEXT: andb $15, %r10b
1130 ; AVX1-NEXT: movb %r10b, -{{[0-9]+}}(%rsp)
11311133 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
11321134 ; AVX1-NEXT: vmovq %xmm0, %rax
1133 ; AVX1-NEXT: movq %rax, %r8
1135 ; AVX1-NEXT: movq %rax, %rcx
11341136 ; AVX1-NEXT: movq %rax, %rdx
11351137 ; AVX1-NEXT: movq %rax, %rsi
11361138 ; AVX1-NEXT: movq %rax, %rdi
1139 ; AVX1-NEXT: movl %eax, %ebp
11371140 ; AVX1-NEXT: movl %eax, %ebx
1138 ; AVX1-NEXT: movl %eax, %ecx
11391141 ; AVX1-NEXT: vmovd %eax, %xmm1
11401142 ; AVX1-NEXT: shrl $8, %eax
11411143 ; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
1142 ; AVX1-NEXT: shrl $16, %ecx
1143 ; AVX1-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
1144 ; AVX1-NEXT: shrl $24, %ebx
1145 ; AVX1-NEXT: vpinsrb $3, %ebx, %xmm1, %xmm1
1144 ; AVX1-NEXT: shrl $16, %ebx
1145 ; AVX1-NEXT: vpinsrb $2, %ebx, %xmm1, %xmm1
1146 ; AVX1-NEXT: shrl $24, %ebp
1147 ; AVX1-NEXT: vpinsrb $3, %ebp, %xmm1, %xmm1
11461148 ; AVX1-NEXT: shrq $32, %rdi
11471149 ; AVX1-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1
11481150 ; AVX1-NEXT: shrq $40, %rsi
11521154 ; AVX1-NEXT: shrq $48, %rdx
11531155 ; AVX1-NEXT: vpinsrb $6, %edx, %xmm1, %xmm1
11541156 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
1155 ; AVX1-NEXT: shrq $56, %r8
1156 ; AVX1-NEXT: vpinsrb $7, %r8d, %xmm1, %xmm0
1157 ; AVX1-NEXT: shrq $56, %rcx
1158 ; AVX1-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm0
11571159 ; AVX1-NEXT: movl %eax, %ecx
11581160 ; AVX1-NEXT: shrl $8, %ecx
11591161 ; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
12211223 ; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
12221224 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
12231225 ; AVX1-NEXT: popq %rbx
1226 ; AVX1-NEXT: popq %r12
1227 ; AVX1-NEXT: popq %r13
12241228 ; AVX1-NEXT: popq %r14
1229 ; AVX1-NEXT: popq %r15
1230 ; AVX1-NEXT: popq %rbp
12251231 ; AVX1-NEXT: retq
12261232 ;
12271233 ; AVX2-LABEL: _clearupper32xi8b:
12281234 ; AVX2: # BB#0:
1235 ; AVX2-NEXT: pushq %rbp
1236 ; AVX2-NEXT: pushq %r15
12291237 ; AVX2-NEXT: pushq %r14
1238 ; AVX2-NEXT: pushq %r13
1239 ; AVX2-NEXT: pushq %r12
12301240 ; AVX2-NEXT: pushq %rbx
1231 ; AVX2-NEXT: vpextrq $1, %xmm0, -{{[0-9]+}}(%rsp)
1232 ; AVX2-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
1233 ; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %r14
1241 ; AVX2-NEXT: vmovq %xmm0, %rcx
1242 ; AVX2-NEXT: movq %rcx, %r8
1243 ; AVX2-NEXT: movq %rcx, %r9
1244 ; AVX2-NEXT: movq %rcx, %r10
1245 ; AVX2-NEXT: movq %rcx, %r11
1246 ; AVX2-NEXT: movq %rcx, %r14
1247 ; AVX2-NEXT: movq %rcx, %r15
12341248 ; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
1235 ; AVX2-NEXT: movq %rdx, %r8
1236 ; AVX2-NEXT: movq %rdx, %r9
1237 ; AVX2-NEXT: movq %rdx, %r11
1249 ; AVX2-NEXT: movq %rdx, %r12
1250 ; AVX2-NEXT: movq %rdx, %r13
1251 ; AVX2-NEXT: movq %rdx, %rbx
1252 ; AVX2-NEXT: movq %rdx, %rax
1253 ; AVX2-NEXT: movq %rdx, %rdi
12381254 ; AVX2-NEXT: movq %rdx, %rsi
1239 ; AVX2-NEXT: movq %rdx, %rdi
1240 ; AVX2-NEXT: movq %rdx, %rcx
1241 ; AVX2-NEXT: movq %rdx, %rax
1255 ; AVX2-NEXT: movq %rdx, %rbp
12421256 ; AVX2-NEXT: andb $15, %dl
12431257 ; AVX2-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
1244 ; AVX2-NEXT: shrq $56, %rax
1245 ; AVX2-NEXT: andb $15, %al
1246 ; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
1247 ; AVX2-NEXT: movq %r14, %r10
1248 ; AVX2-NEXT: shrq $48, %rcx
1258 ; AVX2-NEXT: movq %rcx, %rdx
12491259 ; AVX2-NEXT: andb $15, %cl
12501260 ; AVX2-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
1251 ; AVX2-NEXT: movq %r14, %rdx
1261 ; AVX2-NEXT: shrq $56, %rbp
1262 ; AVX2-NEXT: andb $15, %bpl
1263 ; AVX2-NEXT: movb %bpl, -{{[0-9]+}}(%rsp)
1264 ; AVX2-NEXT: shrq $48, %rsi
1265 ; AVX2-NEXT: andb $15, %sil
1266 ; AVX2-NEXT: movb %sil, -{{[0-9]+}}(%rsp)
12521267 ; AVX2-NEXT: shrq $40, %rdi
12531268 ; AVX2-NEXT: andb $15, %dil
12541269 ; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
1255 ; AVX2-NEXT: movq %r14, %rax
1256 ; AVX2-NEXT: shrq $32, %rsi
1257 ; AVX2-NEXT: andb $15, %sil
1258 ; AVX2-NEXT: movb %sil, -{{[0-9]+}}(%rsp)
1259 ; AVX2-NEXT: movq %r14, %rcx
1260 ; AVX2-NEXT: shrq $24, %r11
1270 ; AVX2-NEXT: shrq $32, %rax
1271 ; AVX2-NEXT: andb $15, %al
1272 ; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
1273 ; AVX2-NEXT: shrq $24, %rbx
1274 ; AVX2-NEXT: andb $15, %bl
1275 ; AVX2-NEXT: movb %bl, -{{[0-9]+}}(%rsp)
1276 ; AVX2-NEXT: shrq $16, %r13
1277 ; AVX2-NEXT: andb $15, %r13b
1278 ; AVX2-NEXT: movb %r13b, -{{[0-9]+}}(%rsp)
1279 ; AVX2-NEXT: shrq $8, %r12
1280 ; AVX2-NEXT: andb $15, %r12b
1281 ; AVX2-NEXT: movb %r12b, -{{[0-9]+}}(%rsp)
1282 ; AVX2-NEXT: shrq $8, %r8
1283 ; AVX2-NEXT: shrq $16, %r9
1284 ; AVX2-NEXT: shrq $24, %r10
1285 ; AVX2-NEXT: shrq $32, %r11
1286 ; AVX2-NEXT: shrq $40, %r14
1287 ; AVX2-NEXT: shrq $48, %r15
1288 ; AVX2-NEXT: shrq $56, %rdx
1289 ; AVX2-NEXT: andb $15, %dl
1290 ; AVX2-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
1291 ; AVX2-NEXT: andb $15, %r15b
1292 ; AVX2-NEXT: movb %r15b, -{{[0-9]+}}(%rsp)
1293 ; AVX2-NEXT: andb $15, %r14b
1294 ; AVX2-NEXT: movb %r14b, -{{[0-9]+}}(%rsp)
12611295 ; AVX2-NEXT: andb $15, %r11b
12621296 ; AVX2-NEXT: movb %r11b, -{{[0-9]+}}(%rsp)
1263 ; AVX2-NEXT: movq %r14, %rsi
1264 ; AVX2-NEXT: shrq $16, %r9
1297 ; AVX2-NEXT: andb $15, %r10b
1298 ; AVX2-NEXT: movb %r10b, -{{[0-9]+}}(%rsp)
12651299 ; AVX2-NEXT: andb $15, %r9b
12661300 ; AVX2-NEXT: movb %r9b, -{{[0-9]+}}(%rsp)
1267 ; AVX2-NEXT: movq %r14, %rdi
1268 ; AVX2-NEXT: shrq $8, %r8
12691301 ; AVX2-NEXT: andb $15, %r8b
12701302 ; AVX2-NEXT: movb %r8b, -{{[0-9]+}}(%rsp)
1271 ; AVX2-NEXT: movq %r14, %rbx
1272 ; AVX2-NEXT: andb $15, %r14b
1273 ; AVX2-NEXT: movb %r14b, -{{[0-9]+}}(%rsp)
1274 ; AVX2-NEXT: shrq $8, %r10
1275 ; AVX2-NEXT: shrq $16, %rdx
1276 ; AVX2-NEXT: shrq $24, %rax
1277 ; AVX2-NEXT: shrq $32, %rcx
1278 ; AVX2-NEXT: shrq $40, %rsi
1279 ; AVX2-NEXT: shrq $48, %rdi
1280 ; AVX2-NEXT: shrq $56, %rbx
1281 ; AVX2-NEXT: andb $15, %bl
1282 ; AVX2-NEXT: movb %bl, -{{[0-9]+}}(%rsp)
1283 ; AVX2-NEXT: andb $15, %dil
1284 ; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
1285 ; AVX2-NEXT: andb $15, %sil
1286 ; AVX2-NEXT: movb %sil, -{{[0-9]+}}(%rsp)
1287 ; AVX2-NEXT: andb $15, %cl
1288 ; AVX2-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
1289 ; AVX2-NEXT: andb $15, %al
1290 ; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
1291 ; AVX2-NEXT: andb $15, %dl
1292 ; AVX2-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
1293 ; AVX2-NEXT: andb $15, %r10b
1294 ; AVX2-NEXT: movb %r10b, -{{[0-9]+}}(%rsp)
12951303 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
12961304 ; AVX2-NEXT: vmovq %xmm0, %rax
1297 ; AVX2-NEXT: movq %rax, %r8
1305 ; AVX2-NEXT: movq %rax, %rcx
12981306 ; AVX2-NEXT: movq %rax, %rdx
12991307 ; AVX2-NEXT: movq %rax, %rsi
13001308 ; AVX2-NEXT: movq %rax, %rdi
1309 ; AVX2-NEXT: movl %eax, %ebp
13011310 ; AVX2-NEXT: movl %eax, %ebx
1302 ; AVX2-NEXT: movl %eax, %ecx
13031311 ; AVX2-NEXT: vmovd %eax, %xmm1
13041312 ; AVX2-NEXT: shrl $8, %eax
13051313 ; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
1306 ; AVX2-NEXT: shrl $16, %ecx
1307 ; AVX2-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
1308 ; AVX2-NEXT: shrl $24, %ebx
1309 ; AVX2-NEXT: vpinsrb $3, %ebx, %xmm1, %xmm1
1314 ; AVX2-NEXT: shrl $16, %ebx
1315 ; AVX2-NEXT: vpinsrb $2, %ebx, %xmm1, %xmm1
1316 ; AVX2-NEXT: shrl $24, %ebp
1317 ; AVX2-NEXT: vpinsrb $3, %ebp, %xmm1, %xmm1
13101318 ; AVX2-NEXT: shrq $32, %rdi
13111319 ; AVX2-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1
13121320 ; AVX2-NEXT: shrq $40, %rsi
13161324 ; AVX2-NEXT: shrq $48, %rdx
13171325 ; AVX2-NEXT: vpinsrb $6, %edx, %xmm1, %xmm1
13181326 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
1319 ; AVX2-NEXT: shrq $56, %r8
1320 ; AVX2-NEXT: vpinsrb $7, %r8d, %xmm1, %xmm0
1327 ; AVX2-NEXT: shrq $56, %rcx
1328 ; AVX2-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm0
13211329 ; AVX2-NEXT: movl %eax, %ecx
13221330 ; AVX2-NEXT: shrl $8, %ecx
13231331 ; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
13851393 ; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
13861394 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
13871395 ; AVX2-NEXT: popq %rbx
1396 ; AVX2-NEXT: popq %r12
1397 ; AVX2-NEXT: popq %r13
13881398 ; AVX2-NEXT: popq %r14
1399 ; AVX2-NEXT: popq %r15
1400 ; AVX2-NEXT: popq %rbp
13891401 ; AVX2-NEXT: retq
13901402 %x4 = bitcast <32 x i8> %0 to <64 x i4>
13911403 %r0 = insertelement <64 x i4> %x4, i4 zeroinitializer, i32 1
100100 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi
101101 ; CHECK-NEXT: cmpl {{[0-9]+}}(%esp), %edx
102102 ; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %esi
103 ; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
103104 ; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %eax
104 ; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
105105 ; CHECK-NEXT: jge .LBB4_2
106106 ; CHECK-NEXT: # BB#1: # %bb1
107107 ; CHECK-NEXT: movl $1, %eax