llvm.org GIT mirror: llvm / commit d1cee9b
Fix large stack alignment codegen for ARM and Thumb2 targets

This partially fixes PR13007 (ARM CodeGen fails with large stack alignment) for ARM and Thumb2 targets, but not for Thumb1: stack alignment for Thumb1 targets appears never to have been supported at all.

Producing an aligned stack pointer is done by zeroing out the lower bits of the stack pointer, and the BIC instruction was used for this. However, the immediate field of the BIC instruction can only encode an immediate that zeroes out up to a maximum of the 8 lower bits. When a larger alignment is requested, a BIC instruction cannot be used, and llvm was silently producing incorrect code in this case.

This commit fixes code generation for large stack alignments by using the BFC instruction instead, when the BFC instruction is available. When it is not, it uses two instructions, a right shift followed by a left shift, to zero out the lower bits.

The lowering of ARM::Int_eh_sjlj_dispatchsetup still has code that unconditionally uses BIC to realign the stack pointer, so it very likely has the same problem; however, I wasn't able to produce a test case for it. This commit adds an assert so that the compiler will fail the assert instead of silently generating wrong code if that path is ever reached.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@225446 91177308-0d34-0410-b5e6-96231b3b80d8

Kristof Beyls
9 changed file(s) with 264 addition(s) and 38 deletion(s).
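
For orientation before the diff: a minimal standalone C++ sketch of the three aligning strategies the patch chooses between (illustrative only, not code from this commit; the function names are made up). align is assumed to be a power of two:

  #include <cassert>
  #include <cstdint>

  // bic Reg, Reg, #align-1: only usable while the mask align-1 fits the
  // BIC immediate field, i.e. align <= 256 (see the note after the first
  // hunk below).
  uint32_t alignWithBIC(uint32_t reg, uint32_t align) {
    assert(align <= 256 && "mask too wide for a BIC immediate");
    return reg & ~(align - 1);
  }

  // bfc Reg, #0, #n: clears any number n of low bits in a single
  // instruction; available on ARMv6T2/ARMv7 and in Thumb-2.
  uint32_t alignWithBFC(uint32_t reg, unsigned n) {
    return reg & ~((uint32_t(1) << n) - 1);
  }

  // lsr Reg, Reg, #n; lsl Reg, Reg, #n: two-instruction fallback for
  // ARM-mode targets without BFC when the mask is too wide for BIC.
  uint32_t alignWithShifts(uint32_t reg, unsigned n) {
    return (reg >> n) << n;
  }

All three compute the same result for align == 1 << n; the patch picks the cheapest form the target can encode.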
@@ -886,6 +886,9 @@
     unsigned MaxAlign = MFI->getMaxAlignment();
     assert (!AFI->isThumb1OnlyFunction());
     // Emit bic r6, r6, MaxAlign
+    assert(MaxAlign <= 256 && "The BIC instruction cannot encode "
+                              "immediates larger than 256 with all lower "
+                              "bits set.");
     unsigned bicOpc = AFI->isThumbFunction() ?
       ARM::t2BICri : ARM::BICri;
     AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(),
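
For context on the 256 cutoff in the new assert: an ARM modified immediate is an 8-bit payload rotated right by an even amount, and no rotation can widen a run of ones, so a mask with all N low bits set is only encodable for N <= 8, i.e. MaxAlign <= 256. A hedged one-liner capturing that check (bicCanEncodeLowMask is a made-up name, not part of the patch):

  // True iff BIC can realign to MaxAlign: the mask MaxAlign-1 must be a
  // run of at most 8 ones to fit the 8-bit rotated-immediate payload.
  bool bicCanEncodeLowMask(unsigned MaxAlign) {
    return MaxAlign <= 256;
  }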
@@ -208,6 +208,68 @@
     }
   }
 };
+}
+
+/// Emit an instruction sequence that will align the address in
+/// register Reg by zeroing out the lower bits. For versions of the
+/// architecture that support Neon, this must be done in a single
+/// instruction, since skipAlignedDPRCS2Spills assumes it is done in a
+/// single instruction. That function only gets called when optimizing
+/// spilling of D registers on a core with the Neon instruction set
+/// present.
+static void emitAligningInstructions(MachineFunction &MF, ARMFunctionInfo *AFI,
+                                     const TargetInstrInfo &TII,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MBBI,
+                                     DebugLoc DL, const unsigned Reg,
+                                     const unsigned Alignment,
+                                     const bool MustBeSingleInstruction) {
+  const ARMSubtarget &AST = MF.getTarget().getSubtarget<ARMSubtarget>();
+  const bool CanUseBFC = AST.hasV6T2Ops() || AST.hasV7Ops();
+  const unsigned AlignMask = Alignment - 1;
+  const unsigned NrBitsToZero = countTrailingZeros(Alignment);
+  assert(!AFI->isThumb1OnlyFunction() && "Thumb1 not supported");
+  if (!AFI->isThumbFunction()) {
+    // If the BFC instruction is available, use it to zero the lower
+    // bits:
+    //   bfc Reg, #0, log2(Alignment)
+    // Otherwise use BIC, if the mask to zero the required number of bits
+    // can be encoded in the BIC immediate field:
+    //   bic Reg, Reg, Alignment-1
+    // Otherwise, emit:
+    //   lsr Reg, Reg, log2(Alignment)
+    //   lsl Reg, Reg, log2(Alignment)
+    if (CanUseBFC) {
+      AddDefaultPred(BuildMI(MBB, MBBI, DL, TII.get(ARM::BFC), Reg)
+                         .addReg(Reg, RegState::Kill)
+                         .addImm(~AlignMask));
+    } else if (AlignMask <= 255) {
+      AddDefaultCC(
+          AddDefaultPred(BuildMI(MBB, MBBI, DL, TII.get(ARM::BICri), Reg)
+                             .addReg(Reg, RegState::Kill)
+                             .addImm(AlignMask)));
+    } else {
+      assert(!MustBeSingleInstruction &&
+             "Shouldn't call emitAligningInstructions demanding a single "
+             "instruction to be emitted for large stack alignment for a target "
+             "without BFC.");
+      AddDefaultCC(AddDefaultPred(
+          BuildMI(MBB, MBBI, DL, TII.get(ARM::MOVsi), Reg)
+              .addReg(Reg, RegState::Kill)
+              .addImm(ARM_AM::getSORegOpc(ARM_AM::lsr, NrBitsToZero))));
+      AddDefaultCC(AddDefaultPred(
+          BuildMI(MBB, MBBI, DL, TII.get(ARM::MOVsi), Reg)
+              .addReg(Reg, RegState::Kill)
+              .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, NrBitsToZero))));
+    }
+  } else {
+    // Since this is only reached for Thumb-2 targets, the BFC instruction
+    // should always be available.
+    assert(CanUseBFC);
+    AddDefaultPred(BuildMI(MBB, MBBI, DL, TII.get(ARM::t2BFC), Reg)
+                       .addReg(Reg, RegState::Kill)
+                       .addImm(~AlignMask));
+  }
 }
 
 void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
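
As a reading aid for the branch structure just added, here is a hypothetical helper (numAligningInstrsARM is not in the patch) that mirrors how many instructions the ARM-mode path emits:

  unsigned numAligningInstrsARM(bool CanUseBFC, unsigned Alignment) {
    unsigned AlignMask = Alignment - 1;
    if (CanUseBFC)
      return 1;   // bfc Reg, #0, #log2(Alignment)
    if (AlignMask <= 255)
      return 1;   // bic Reg, Reg, #AlignMask
    return 2;     // lsr followed by lsl
  }

For example, numAligningInstrsARM(false, 512) == 2, which is exactly the case PR13007 hit: the old code emitted a single BIC whose immediate could not encode the 0x1FF mask.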
@@ -567,28 +629,24 @@
   // realigned.
   if (!AFI->getNumAlignedDPRCS2Regs() && RegInfo->needsStackRealignment(MF)) {
     unsigned MaxAlign = MFI->getMaxAlignment();
-    assert (!AFI->isThumb1OnlyFunction());
+    assert(!AFI->isThumb1OnlyFunction());
     if (!AFI->isThumbFunction()) {
-      // Emit bic sp, sp, MaxAlign
-      AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl,
-                                          TII.get(ARM::BICri), ARM::SP)
-                                  .addReg(ARM::SP, RegState::Kill)
-                                  .addImm(MaxAlign-1)));
+      emitAligningInstructions(MF, AFI, TII, MBB, MBBI, dl, ARM::SP, MaxAlign,
+                               false);
     } else {
-      // We cannot use sp as source/dest register here, thus we're emitting the
-      // following sequence:
+      // We cannot use sp as source/dest register here, thus we're using r4 to
+      // perform the calculations. We're emitting the following sequence:
       // mov r4, sp
-      // bic r4, r4, MaxAlign
+      // -- use emitAligningInstructions to produce best sequence to zero
+      // -- out lower bits in r4
       // mov sp, r4
       // FIXME: It will be better just to find spare register here.
       AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::R4)
-        .addReg(ARM::SP, RegState::Kill));
-      AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl,
-                                          TII.get(ARM::t2BICri), ARM::R4)
-                                  .addReg(ARM::R4, RegState::Kill)
-                                  .addImm(MaxAlign-1)));
+                         .addReg(ARM::SP, RegState::Kill));
+      emitAligningInstructions(MF, AFI, TII, MBB, MBBI, dl, ARM::R4, MaxAlign,
+                               false);
       AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP)
-        .addReg(ARM::R4, RegState::Kill));
+                         .addReg(ARM::R4, RegState::Kill));
     }
 
     AFI->setShouldRestoreSPFromFP(true);
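
The Thumb branch above round-trips through r4 because, as its comment says, sp cannot be used as the source/dest register there. A small sketch of what the emitted mov/bfc/mov triple computes (assumed semantics, not LLVM code; n = log2(MaxAlign)):

  uint32_t thumb2RealignSP(uint32_t sp, unsigned n) {
    uint32_t r4 = sp;                  // mov r4, sp
    r4 &= ~((uint32_t(1) << n) - 1);   // bfc r4, #0, #n
    return r4;                         // mov sp, r4
  }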
@@ -1083,15 +1141,16 @@
   // The immediate is <= 64, so it doesn't need any special encoding.
   unsigned Opc = isThumb ? ARM::t2SUBri : ARM::SUBri;
   AddDefaultCC(AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(Opc), ARM::R4)
-                              .addReg(ARM::SP)
-                              .addImm(8 * NumAlignedDPRCS2Regs)));
-
-  // bic r4, r4, #align-1
-  Opc = isThumb ? ARM::t2BICri : ARM::BICri;
+                                  .addReg(ARM::SP)
+                                  .addImm(8 * NumAlignedDPRCS2Regs)));
+
   unsigned MaxAlign = MF.getFrameInfo()->getMaxAlignment();
-  AddDefaultCC(AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(Opc), ARM::R4)
-                              .addReg(ARM::R4, RegState::Kill)
-                              .addImm(MaxAlign - 1)));
+  // We must set parameter MustBeSingleInstruction to true, since
+  // skipAlignedDPRCS2Spills expects exactly 3 instructions to perform
+  // stack alignment. Luckily, this can always be done since all ARM
+  // architecture versions that support Neon also support the BFC
+  // instruction.
+  emitAligningInstructions(MF, AFI, TII, MBB, MI, DL, ARM::R4, MaxAlign, true);
 
   // mov sp, r4
   // The stack pointer must be adjusted before spilling anything, otherwise
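
To spell out the invariant behind MustBeSingleInstruction: skipAlignedDPRCS2Spills steps over a fixed-length preamble, so the realignment must not change the instruction count. An illustrative trace of the three-instruction preamble it expects (operands as in the code above):

  // sub r4, sp, #8*NumAlignedDPRCS2Regs   ; 1st skipped instruction
  // bfc r4, #0, #log2(MaxAlign)           ; 2nd; must stay a single
  //                                       ; instruction, hence the flag
  // mov sp, r4                            ; 3rd
  // A two-instruction lsr/lsl pair here would desynchronize the skip
  // count; the assert in emitAligningInstructions guards against that,
  // and since every Neon-capable architecture version also has BFC, the
  // single-instruction form is always available.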
@@ -40,7 +40,7 @@
 define void @test2(<16 x float>* noalias sret %agg.result) nounwind ssp {
 entry:
 ; REALIGN-LABEL: test2
-; REALIGN: bic sp, sp, #63
+; REALIGN: bfc sp, #0, #6
 ; REALIGN: mov r[[R2:[0-9]+]], r[[R1:[0-9]+]]
 ; REALIGN: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]!
 ; REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
@@ -70,7 +70,7 @@
 ; CHECK-IOS-LABEL: check_vfp_fold:
 ; CHECK-IOS: push {r0, r1, r2, r3, r4, r7, lr}
 ; CHECK-IOS: sub.w r4, sp, #16
-; CHECK-IOS: bic r4, r4, #15
+; CHECK-IOS: bfc r4, #0, #4
 ; CHECK-IOS: mov sp, r4
 ; CHECK-IOS: vst1.64 {d8, d9}, [r4:128]
 ; ...
@@ -14,7 +14,7 @@
 ; CHECK-A: push {r0, r1, r2, r3, r10, r11, r12, lr}
 ; CHECK-A: add r11, sp, #20
 ; CHECK-A-NOT: sub sp, sp, #{{[0-9]+}}
-; CHECK-A: bic sp, sp, #7
+; CHECK-A: bfc sp, #0, #3
 ; CHECK-A: bl bar
 ; CHECK-A: sub sp, r11, #20
 ; CHECK-A: pop {r0, r1, r2, r3, r10, r11, r12, lr}
@@ -24,7 +24,7 @@
 ; CHECK-A-THUMB: push.w {r0, r1, r2, r3, r4, r7, r12, lr}
 ; CHECK-A-THUMB: add r7, sp, #20
 ; CHECK-A-THUMB: mov r4, sp
-; CHECK-A-THUMB: bic r4, r4, #7
+; CHECK-A-THUMB: bfc r4, #0, #3
 ; CHECK-A-THUMB: bl bar
 ; CHECK-A-THUMB: sub.w r4, r7, #20
 ; CHECK-A-THUMB: mov sp, r4
@@ -37,7 +37,7 @@
 ; CHECK-M: push.w {r4, r10, r11, lr}
 ; CHECK-M: add.w r11, sp, #8
 ; CHECK-M: mov r4, sp
-; CHECK-M: bic r4, r4, #7
+; CHECK-M: bfc r4, #0, #3
 ; CHECK-M: mov sp, r4
 ; CHECK-M: bl _bar
 ; CHECK-M: sub.w r4, r11, #8
@@ -55,7 +55,7 @@
 ; 32 to get past r0, r1, ..., r7
 ; CHECK-A: add r11, sp, #32
 ; CHECK-A: sub sp, sp, #{{[0-9]+}}
-; CHECK-A: bic sp, sp, #7
+; CHECK-A: bfc sp, #0, #3
 ; [...]
 ; 32 must match above
 ; CHECK-A: sub sp, r11, #32
@@ -74,7 +74,7 @@
 ; CHECK-A: push {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
 ; CHECK-A: add r11, sp, #44
 ; CHECK-A: sub sp, sp, #{{[0-9]+}}
-; CHECK-A: bic sp, sp, #7
+; CHECK-A: bfc sp, #0, #3
 ; [...]
 ; CHECK-A: sub sp, r11, #44
 ; CHECK-A: pop {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
@@ -90,7 +90,7 @@
 ; CHECK-A: push {r0, r1, r2, r3, r10, r11, r12, lr}
 ; CHECK-A: add r11, sp, #20
 ; CHECK-A-NOT: sub sp, sp, #{{[0-9]+}}
-; CHECK-A: bic sp, sp, #7
+; CHECK-A: bfc sp, #0, #3
 ; [...]
 ; CHECK-A: sub sp, r11, #20
 ; CHECK-A: pop {r0, r1, r2, r3, r10, r11, r12, lr}
@@ -105,7 +105,7 @@
 ; CHECK-A: push {r0, r1, r2, r3, r10, r11, r12, lr}
 ; CHECK-A: add r11, sp, #20
 ; CHECK-A-NOT: sub sp, sp, #{{[0-9]+}}
-; CHECK-A: bic sp, sp, #7
+; CHECK-A: bfc sp, #0, #3
 ; [...]
 ; CHECK-A: sub sp, r11, #20
 ; CHECK-A: pop {r0, r1, r2, r3, r10, r11, r12, lr}
@@ -10,7 +10,7 @@
 
 define void @aaa(%quuz* %this, i8* %block) {
 ; CHECK-LABEL: aaa:
-; CHECK: bic {{.*}}, #15
+; CHECK: bfc {{.*}}, #0, #4
 ; CHECK: vst1.64 {{.*}}sp:128
 ; CHECK: vld1.64 {{.*}}sp:128
 entry:
@@ -0,0 +1,164 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=armv4t | FileCheck %s -check-prefix=CHECK-v4A32
+; RUN: llc -verify-machineinstrs < %s -mtriple=armv7a | FileCheck %s -check-prefix=CHECK-v7A32
+; RUN: llc -verify-machineinstrs < %s -mtriple=thumbv7a | FileCheck %s -check-prefix=CHECK-THUMB2
+; FIXME: There are no tests for Thumb1 since dynamic stack alignment is not supported for
+; Thumb1.
+
+define i32 @f_bic_can_be_used_align() nounwind {
+entry:
+; CHECK-LABEL: f_bic_can_be_used_align:
+; CHECK-v7A32: bfc sp, #0, #8
+; CHECK-v4A32: bic sp, sp, #255
+; CHECK-THUMB2: mov r4, sp
+; CHECK-THUMB2-NEXT: bfc r4, #0, #8
+; CHECK-THUMB2-NEXT: mov sp, r4
+  %x = alloca i32, align 256
+  store volatile i32 0, i32* %x, align 256
+  ret i32 0
+}
+
+define i32 @f_too_large_for_bic_align() nounwind {
+entry:
+; CHECK-LABEL: f_too_large_for_bic_align:
+; CHECK-v7A32: bfc sp, #0, #9
+; CHECK-v4A32: lsr sp, sp, #9
+; CHECK-v4A32: lsl sp, sp, #9
+; CHECK-THUMB2: mov r4, sp
+; CHECK-THUMB2-NEXT: bfc r4, #0, #9
+; CHECK-THUMB2-NEXT: mov sp, r4
+  %x = alloca i32, align 512
+  store volatile i32 0, i32* %x, align 512
+  ret i32 0
+}
+
+define i8* @f_alignedDPRCS2Spills(double* %d) #0 {
+entry:
+; CHECK-LABEL: f_alignedDPRCS2Spills:
+; CHECK-v7A32: bfc sp, #0, #12
+; CHECK-v4A32: lsr sp, sp, #12
+; CHECK-v4A32: lsl sp, sp, #12
+; CHECK-THUMB2: bfc r4, #0, #12
+; CHECK-THUMB2-NEXT: mov sp, r4
+  %a = alloca i8, align 4096
+  %0 = load double* %d, align 4
+  %arrayidx1 = getelementptr inbounds double* %d, i32 1
+  %1 = load double* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds double* %d, i32 2
+  %2 = load double* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds double* %d, i32 3
+  %3 = load double* %arrayidx3, align 4
+  %arrayidx4 = getelementptr inbounds double* %d, i32 4
+  %4 = load double* %arrayidx4, align 4
+  %arrayidx5 = getelementptr inbounds double* %d, i32 5
+  %5 = load double* %arrayidx5, align 4
+  %arrayidx6 = getelementptr inbounds double* %d, i32 6
+  %6 = load double* %arrayidx6, align 4
+  %arrayidx7 = getelementptr inbounds double* %d, i32 7
+  %7 = load double* %arrayidx7, align 4
+  %arrayidx8 = getelementptr inbounds double* %d, i32 8
+  %8 = load double* %arrayidx8, align 4
+  %arrayidx9 = getelementptr inbounds double* %d, i32 9
+  %9 = load double* %arrayidx9, align 4
+  %arrayidx10 = getelementptr inbounds double* %d, i32 10
+  %10 = load double* %arrayidx10, align 4
+  %arrayidx11 = getelementptr inbounds double* %d, i32 11
+  %11 = load double* %arrayidx11, align 4
+  %arrayidx12 = getelementptr inbounds double* %d, i32 12
+  %12 = load double* %arrayidx12, align 4
+  %arrayidx13 = getelementptr inbounds double* %d, i32 13
+  %13 = load double* %arrayidx13, align 4
+  %arrayidx14 = getelementptr inbounds double* %d, i32 14
+  %14 = load double* %arrayidx14, align 4
+  %arrayidx15 = getelementptr inbounds double* %d, i32 15
+  %15 = load double* %arrayidx15, align 4
+  %arrayidx16 = getelementptr inbounds double* %d, i32 16
+  %16 = load double* %arrayidx16, align 4
+  %arrayidx17 = getelementptr inbounds double* %d, i32 17
+  %17 = load double* %arrayidx17, align 4
+  %arrayidx18 = getelementptr inbounds double* %d, i32 18
+  %18 = load double* %arrayidx18, align 4
+  %arrayidx19 = getelementptr inbounds double* %d, i32 19
+  %19 = load double* %arrayidx19, align 4
+  %arrayidx20 = getelementptr inbounds double* %d, i32 20
+  %20 = load double* %arrayidx20, align 4
+  %arrayidx21 = getelementptr inbounds double* %d, i32 21
+  %21 = load double* %arrayidx21, align 4
+  %arrayidx22 = getelementptr inbounds double* %d, i32 22
+  %22 = load double* %arrayidx22, align 4
+  %arrayidx23 = getelementptr inbounds double* %d, i32 23
+  %23 = load double* %arrayidx23, align 4
+  %arrayidx24 = getelementptr inbounds double* %d, i32 24
+  %24 = load double* %arrayidx24, align 4
+  %arrayidx25 = getelementptr inbounds double* %d, i32 25
+  %25 = load double* %arrayidx25, align 4
+  %arrayidx26 = getelementptr inbounds double* %d, i32 26
+  %26 = load double* %arrayidx26, align 4
+  %arrayidx27 = getelementptr inbounds double* %d, i32 27
+  %27 = load double* %arrayidx27, align 4
+  %arrayidx28 = getelementptr inbounds double* %d, i32 28
+  %28 = load double* %arrayidx28, align 4
+  %arrayidx29 = getelementptr inbounds double* %d, i32 29
+  %29 = load double* %arrayidx29, align 4
+  %div = fdiv double %29, %28
+  %div30 = fdiv double %div, %27
+  %div31 = fdiv double %div30, %26
+  %div32 = fdiv double %div31, %25
+  %div33 = fdiv double %div32, %24
+  %div34 = fdiv double %div33, %23
+  %div35 = fdiv double %div34, %22
+  %div36 = fdiv double %div35, %21
+  %div37 = fdiv double %div36, %20
+  %div38 = fdiv double %div37, %19
+  %div39 = fdiv double %div38, %18
+  %div40 = fdiv double %div39, %17
+  %div41 = fdiv double %div40, %16
+  %div42 = fdiv double %div41, %15
+  %div43 = fdiv double %div42, %14
+  %div44 = fdiv double %div43, %13
+  %div45 = fdiv double %div44, %12
+  %div46 = fdiv double %div45, %11
+  %div47 = fdiv double %div46, %10
+  %div48 = fdiv double %div47, %9
+  %div49 = fdiv double %div48, %8
+  %div50 = fdiv double %div49, %7
+  %div51 = fdiv double %div50, %6
+  %div52 = fdiv double %div51, %5
+  %div53 = fdiv double %div52, %4
+  %div54 = fdiv double %div53, %3
+  %div55 = fdiv double %div54, %2
+  %div56 = fdiv double %div55, %1
+  %div57 = fdiv double %div56, %0
+  %div58 = fdiv double %0, %1
+  %div59 = fdiv double %div58, %2
+  %div60 = fdiv double %div59, %3
+  %div61 = fdiv double %div60, %4
+  %div62 = fdiv double %div61, %5
+  %div63 = fdiv double %div62, %6
+  %div64 = fdiv double %div63, %7
+  %div65 = fdiv double %div64, %8
+  %div66 = fdiv double %div65, %9
+  %div67 = fdiv double %div66, %10
+  %div68 = fdiv double %div67, %11
+  %div69 = fdiv double %div68, %12
+  %div70 = fdiv double %div69, %13
+  %div71 = fdiv double %div70, %14
+  %div72 = fdiv double %div71, %15
+  %div73 = fdiv double %div72, %16
+  %div74 = fdiv double %div73, %17
+  %div75 = fdiv double %div74, %18
+  %div76 = fdiv double %div75, %19
+  %div77 = fdiv double %div76, %20
+  %div78 = fdiv double %div77, %21
+  %div79 = fdiv double %div78, %22
+  %div80 = fdiv double %div79, %23
+  %div81 = fdiv double %div80, %24
+  %div82 = fdiv double %div81, %25
+  %div83 = fdiv double %div82, %26
+  %div84 = fdiv double %div83, %27
+  %div85 = fdiv double %div84, %28
+  %div86 = fdiv double %div85, %29
+  %mul = fmul double %div57, %div86
+  %conv = fptosi double %mul to i32
+  %add.ptr = getelementptr inbounds i8* %a, i32 %conv
+  ret i8* %add.ptr
+}
@@ -8,7 +8,7 @@
 ;
 ; The caller-saved r4 is used as a scratch register for stack realignment.
 ; CHECK: push {r4, r7, lr}
-; CHECK: bic r4, r4, #7
+; CHECK: bfc r4, #0, #3
 ; CHECK: mov sp, r4
 define void @f(double* nocapture %p) nounwind ssp {
 entry:
@@ -22,7 +22,7 @@
 ; NEON: f
 ; NEON: push {r4, r7, lr}
 ; NEON: sub.w r4, sp, #64
-; NEON: bic r4, r4, #15
+; NEON: bfc r4, #0, #4
 ; Stack pointer must be updated before the spills.
 ; NEON: mov sp, r4
 ; NEON: vst1.64 {d8, d9, d10, d11}, [r4:128]!
@@ -53,7 +53,7 @@
 ; NEON: f7
 ; NEON: push {r4, r7, lr}
 ; NEON: sub.w r4, sp, #56
-; NEON: bic r4, r4, #15
+; NEON: bfc r4, #0, #4
 ; Stack pointer must be updated before the spills.
 ; NEON: mov sp, r4
 ; NEON: vst1.64 {d8, d9, d10, d11}, [r4:128]!
@@ -80,7 +80,7 @@
 ; NEON: push {r4, r7, lr}
 ; NEON: vpush {d12, d13, d14, d15}
 ; NEON: sub.w r4, sp, #24
-; NEON: bic r4, r4, #15
+; NEON: bfc r4, #0, #4
 ; Stack pointer must be updated before the spills.
 ; NEON: mov sp, r4
 ; NEON: vst1.64 {d8, d9}, [r4:128]
@@ -10,7 +10,7 @@
 
 define void @aaa(%quuz* %this, i8* %block) {
 ; CHECK-LABEL: aaa:
-; CHECK: bic r4, r4, #15
+; CHECK: bfc r4, #0, #4
 ; CHECK: vst1.64 {{.*}}[{{.*}}:128]
 ; CHECK: vld1.64 {{.*}}[{{.*}}:128]
 entry: