llvm.org GIT mirror: llvm / 7d24705
Author: Bob Wilson

Change register allocation order for ARM VFP and NEON registers to put the callee-saved registers at the end of the lists. Also prefer to avoid using the low registers that are in register subclasses required by certain instructions, so that those registers will more likely be available when needed. This change makes a huge improvement in spilling in some cases. Thanks to Jakob for helping me realize the problem.

Most of this patch is fixing the testsuite. There are quite a few places where we're checking for specific registers. I changed those to wildcards in places where that doesn't weaken the tests. The spill-q.ll and thumb2-spill-q.ll tests stopped spilling with this change, so I added a bunch of live values to force spills on those tests.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@116055 91177308-0d34-0410-b5e6-96231b3b80d8
20 changed files with 238 additions and 114 deletions.
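To make the reasoning concrete, here is a small standalone C++ model (illustrative only, not LLVM code; the function and its save-counting cost model are hypothetical). An allocator hands out registers from the front of its allocation order, so under the old numeric order a function with a dozen simultaneously live D-register values reaches into the callee-saved D8-D15 range and pays prologue/epilogue saves, while the new order satisfies the same demand entirely from the caller-saved D16-D31 range:

#include <cstdio>
#include <set>
#include <string>
#include <vector>

// Hypothetical cost model: every callee-saved register the allocator
// touches must be saved in the prologue and restored in the epilogue.
static int prologueSaves(const std::vector<std::string> &order,
                         const std::set<std::string> &calleeSaved,
                         size_t liveValues) {
  int saves = 0;
  for (size_t i = 0; i < liveValues && i < order.size(); ++i)
    saves += calleeSaved.count(order[i]) ? 1 : 0;
  return saves;
}

int main() {
  std::set<std::string> calleeSaved;
  for (int i = 8; i <= 15; ++i)
    calleeSaved.insert("D" + std::to_string(i));

  std::vector<std::string> oldOrder, newOrder;
  // Old order: plain numeric D0-D31, so D8-D15 get handed out early.
  for (int i = 0; i <= 31; ++i)
    oldOrder.push_back("D" + std::to_string(i));
  // New order: caller-saved D16-D31 first, callee-saved D8-D15 last.
  for (int i = 16; i <= 31; ++i)
    newOrder.push_back("D" + std::to_string(i));
  for (int i = 0; i <= 15; ++i)
    newOrder.push_back("D" + std::to_string(i));

  // Twelve simultaneously live D-register values.
  std::printf("old order: %d saves\n", prologueSaves(oldOrder, calleeSaved, 12));
  std::printf("new order: %d saves\n", prologueSaves(newOrder, calleeSaved, 12));
  return 0;
}

Under those assumptions the model prints 4 saves for the old order (it touches D8-D11) and 0 for the new one. The subclass half of the change works the same way: the low registers that DPR_VFP2 (D0-D15) and DPR_8 (D0-D7) operands can encode are pushed toward the back of the list, so they are more likely to still be free when an instruction that requires them needs one.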
@@ -386,16 +386,18 @@
     ARM::D4, ARM::D5, ARM::D6, ARM::D7,
     ARM::D8, ARM::D9, ARM::D10, ARM::D11,
     ARM::D12, ARM::D13, ARM::D14, ARM::D15 };
-  // VFP3
+  // VFP3: D8-D15 are callee saved and should be allocated last.
+  // Save other low registers for use as DPR_VFP2 and DPR_8 classes.
   static const unsigned ARM_DPR_VFP3[] = {
+    ARM::D16, ARM::D17, ARM::D18, ARM::D19,
+    ARM::D20, ARM::D21, ARM::D22, ARM::D23,
+    ARM::D24, ARM::D25, ARM::D26, ARM::D27,
+    ARM::D28, ARM::D29, ARM::D30, ARM::D31,
     ARM::D0, ARM::D1, ARM::D2, ARM::D3,
     ARM::D4, ARM::D5, ARM::D6, ARM::D7,
     ARM::D8, ARM::D9, ARM::D10, ARM::D11,
-    ARM::D12, ARM::D13, ARM::D14, ARM::D15,
-    ARM::D16, ARM::D17, ARM::D18, ARM::D19,
-    ARM::D20, ARM::D21, ARM::D22, ARM::D23,
-    ARM::D24, ARM::D25, ARM::D26, ARM::D27,
-    ARM::D28, ARM::D29, ARM::D30, ARM::D31 };
+    ARM::D12, ARM::D13, ARM::D14, ARM::D15 };
+
   DPRClass::iterator
   DPRClass::allocation_order_begin(const MachineFunction &MF) const {
     const TargetMachine &TM = MF.getTarget();
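TableGen splices the MethodBodies above into the generated DPRClass implementation, and the register allocator then walks the half-open range [allocation_order_begin(MF), allocation_order_end(MF)) front to back, which is what gives the ordering its effect. A minimal sketch of that consumer side, assuming a simple first-fit policy (an illustration, not LLVM's actual allocation loop):

#include <set>

// Hypothetical first-fit query over an allocation-order array such as
// ARM_DPR_VFP3: return the first register not already in use, or 0 (a
// made-up sentinel) when the class is exhausted and a spill is needed.
unsigned pickFirstFree(const unsigned *begin, const unsigned *end,
                       const std::set<unsigned> &inUse) {
  for (const unsigned *it = begin; it != end; ++it)
    if (!inUse.count(*it))
      return *it;
  return 0;
}

Because the scan is strictly front to back, simply moving D16-D31 ahead of D0-D15 is enough to bias every allocation decision without touching the allocator itself.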
@@ -437,6 +439,29 @@
                         [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7,
                          Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15]> {
   let SubRegClasses = [(DPR dsub_0, dsub_1)];
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    // Q4-Q7 are callee saved and should be allocated last.
+    // Save other low registers for use as QPR_VFP2 and QPR_8 classes.
+    static const unsigned ARM_QPR[] = {
+      ARM::Q8, ARM::Q9, ARM::Q10, ARM::Q11,
+      ARM::Q12, ARM::Q13, ARM::Q14, ARM::Q15,
+      ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3,
+      ARM::Q4, ARM::Q5, ARM::Q6, ARM::Q7 };
+
+    QPRClass::iterator
+    QPRClass::allocation_order_begin(const MachineFunction &MF) const {
+      return ARM_QPR;
+    }
+
+    QPRClass::iterator
+    QPRClass::allocation_order_end(const MachineFunction &MF) const {
+      return ARM_QPR + (sizeof(ARM_QPR)/sizeof(unsigned));
+    }
+  }];
 }
 
 // Subset of QPR that have 32-bit SPR subregs.
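The allocation_order_end bodies compute the array's past-the-end pointer with the classic sizeof quotient; a tiny self-contained illustration of that idiom (the array name is made up):

#include <cassert>

static const unsigned Regs[] = { 8, 9, 10, 11 };

int main() {
  // sizeof(Regs)/sizeof(unsigned) is the element count, so adding it to
  // the array base yields the conventional half-open [begin, end) bound.
  const unsigned *Begin = Regs;
  const unsigned *End = Regs + (sizeof(Regs) / sizeof(unsigned));
  assert(End - Begin == 4);
  return 0;
}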
@@ -462,6 +487,27 @@
     [QQ0, QQ1, QQ2, QQ3, QQ4, QQ5, QQ6, QQ7]> {
   let SubRegClasses = [(DPR dsub_0, dsub_1, dsub_2, dsub_3),
                        (QPR qsub_0, qsub_1)];
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    // QQ2-QQ3 are callee saved and should be allocated last.
+    // Save other low registers for use as QPR_VFP2 and QPR_8 classes.
+    static const unsigned ARM_QQPR[] = {
+      ARM::QQ4, ARM::QQ5, ARM::QQ6, ARM::QQ7,
+      ARM::QQ0, ARM::QQ1, ARM::QQ2, ARM::QQ3 };
+
+    QQPRClass::iterator
+    QQPRClass::allocation_order_begin(const MachineFunction &MF) const {
+      return ARM_QQPR;
+    }
+
+    QQPRClass::iterator
+    QQPRClass::allocation_order_end(const MachineFunction &MF) const {
+      return ARM_QQPR + (sizeof(ARM_QQPR)/sizeof(unsigned));
+    }
+  }];
 }
 
 // Subset of QQPR that have 32-bit SPR subregs.
@@ -482,6 +528,26 @@
   let SubRegClasses = [(DPR dsub_0, dsub_1, dsub_2, dsub_3,
                              dsub_4, dsub_5, dsub_6, dsub_7),
                        (QPR qsub_0, qsub_1, qsub_2, qsub_3)];
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    // QQQQ1 is callee saved and should be allocated last.
+    // Save QQQQ0 for use as QPR_VFP2 and QPR_8 classes.
+    static const unsigned ARM_QQQQPR[] = {
+      ARM::QQQQ2, ARM::QQQQ3, ARM::QQQQ0, ARM::QQQQ1 };
+
+    QQQQPRClass::iterator
+    QQQQPRClass::allocation_order_begin(const MachineFunction &MF) const {
+      return ARM_QQQQPR;
+    }
+
+    QQQQPRClass::iterator
+    QQQQPRClass::allocation_order_end(const MachineFunction &MF) const {
+      return ARM_QQQQPR + (sizeof(ARM_QQQQPR)/sizeof(unsigned));
+    }
+  }];
 }
 
 // Condition code registers.
@@ -9,9 +9,9 @@
 ; %reg1028 gets allocated %Q0, and if %reg1030 is reloaded for the partial
 ; redef, it cannot also get %Q0.
 
-; CHECK: vld1.64 {d0, d1}, [r{{.}}]
-; CHECK-NOT: vld1.64 {d0, d1}
-; CHECK: vmov.f64 d3, d0
+; CHECK: vld1.64 {d16, d17}, [r{{.}}]
+; CHECK-NOT: vld1.64 {d16, d17}
+; CHECK: vmov.f64 d19, d16
 
 define i32 @test(i8* %arg) nounwind {
 entry:
@@ -10,7 +10,7 @@
 define double @t2(double %x) nounwind readnone optsize {
 entry:
 ; CHECK: t2:
-; CHECK: vmov.f64 d1, #3.000000e+00
+; CHECK: vmov.f64 d{{.*}}, #3.000000e+00
   %0 = fadd double %x, 3.000000e+00
   ret double %0
 }
@@ -18,7 +18,7 @@
 define double @t3(double %x) nounwind readnone optsize {
 entry:
 ; CHECK: t3:
-; CHECK: vmov.f64 d1, #-1.300000e+01
+; CHECK: vmov.f64 d{{.*}}, #-1.300000e+01
   %0 = fmul double %x, -1.300000e+01
   ret double %0
 }
@@ -6,7 +6,7 @@
 entry:
 ; CHECK: vmov.I64 q15, #0
 ; CHECK: vmov.32 d30[0], r0
-; CHECK: vmov q0, q15
+; CHECK: vmov q8, q15
   %tmp = alloca %struct.int32x4_t, align 16
   call void asm sideeffect "vmov.I64 q15, #0\0Avmov.32 d30[0], $1\0Avmov ${0:q}, q15\0A", "=*w,r,~{d31},~{d30}"(%struct.int32x4_t* %tmp, i32 8192) nounwind
   ret void
@@ -17,7 +17,7 @@
 
 define void @t2() nounwind {
 entry:
-; CHECK: vmov d30, d0
+; CHECK: vmov d30, d16
 ; CHECK: vmov.32 r0, d30[0]
   %asmtmp2 = tail call i32 asm sideeffect "vmov d30, $1\0Avmov.32 $0, d30[0]\0A", "=r,w,~{d30}"(<2 x i32> undef) nounwind
   ret void
@@ -121,9 +121,9 @@
 return2:
 ; CHECK: %return2
 ; CHECK: vadd.i32
-; CHECK: vmov q1, q3
-; CHECK-NOT: vmov
-; CHECK: vst2.32 {d0, d1, d2, d3}
+; CHECK: vmov q9, q11
+; CHECK-NOT: vmov
+; CHECK: vst2.32 {d16, d17, d18, d19}
   %tmp100 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0 ; <<4 x i32>> [#uses=1]
   %tmp101 = extractvalue %struct.__neon_int32x4x2_t %tmp5, 1 ; <<4 x i32>> [#uses=1]
   %tmp102 = add <4 x i32> %tmp100, %tmp101 ; <<4 x i32>> [#uses=1]
@@ -135,9 +135,9 @@
 define <8 x i16> @t5(i16* %A, <8 x i16>* %B) nounwind {
 ; CHECK: t5:
 ; CHECK: vldmia
-; CHECK: vmov q1, q0
-; CHECK-NOT: vmov
-; CHECK: vld2.16 {d0[1], d2[1]}, [r0]
+; CHECK: vmov q9, q8
+; CHECK-NOT: vmov
+; CHECK: vld2.16 {d16[1], d18[1]}, [r0]
 ; CHECK-NOT: vmov
 ; CHECK: vadd.i16
   %tmp0 = bitcast i16* %A to i8* ; <i8*> [#uses=1]
@@ -152,8 +152,8 @@
 define <8 x i8> @t6(i8* %A, <8 x i8>* %B) nounwind {
 ; CHECK: t6:
 ; CHECK: vldr.64
-; CHECK: vmov d1, d0
-; CHECK-NEXT: vld2.8 {d0[1], d1[1]}
+; CHECK: vmov d17, d16
+; CHECK-NEXT: vld2.8 {d16[1], d17[1]}
   %tmp1 = load <8 x i8>* %B ; <<8 x i8>> [#uses=2]
   %tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1) ; <%struct.__neon_int8x8x2_t> [#uses=2]
   %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 0 ; <<8 x i8>> [#uses=1]
@@ -167,10 +167,10 @@
 ; CHECK: t7:
 ; CHECK: vld2.32
 ; CHECK: vst2.32
-; CHECK: vld1.32 {d0, d1},
-; CHECK: vmov q1, q0
-; CHECK-NOT: vmov
-; CHECK: vuzp.32 q0, q1
+; CHECK: vld1.32 {d16, d17},
+; CHECK: vmov q9, q8
+; CHECK-NOT: vmov
+; CHECK: vuzp.32 q8, q9
 ; CHECK: vst1.32
   %0 = bitcast i32* %iptr to i8* ; <i8*> [#uses=2]
   %1 = tail call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %0, i32 1) ; <%struct.__neon_int32x4x2_t> [#uses=2]
@@ -187,7 +187,7 @@
 ; PR7156
 define arm_aapcs_vfpcc i32 @t8() nounwind {
 ; CHECK: t8:
-; CHECK: vrsqrte.f32 q0, q0
+; CHECK: vrsqrte.f32 q8, q8
 bb.nph55.bb.nph55.split_crit_edge:
   br label %bb3
 
@@ -237,10 +237,10 @@
 define arm_aapcs_vfpcc float @t9(%0* nocapture, %3* nocapture) nounwind {
 ; CHECK: t9:
 ; CHECK: vldr.64
-; CHECK-NOT: vmov d{{.*}}, d0
-; CHECK: vmov.i32 d1
-; CHECK-NEXT: vstmia r0, {d0, d1}
-; CHECK-NEXT: vstmia r0, {d0, d1}
+; CHECK-NOT: vmov d{{.*}}, d16
+; CHECK: vmov.i32 d17
+; CHECK-NEXT: vstmia r0, {d16, d17}
+; CHECK-NEXT: vstmia r0, {d16, d17}
   %3 = bitcast double 0.000000e+00 to <2 x float> ; <<2 x float>> [#uses=2]
   %4 = shufflevector <2 x float> %3, <2 x float> undef, <4 x i32> ; <<4 x float>> [#uses=1]
   store <4 x float> %4, <4 x float>* undef, align 16
@@ -268,9 +268,9 @@
 define arm_aapcs_vfpcc i32 @t10() nounwind {
 entry:
 ; CHECK: t10:
-; CHECK: vmov.i32 q1, #0x3F000000
-; CHECK: vmov d0, d1
-; CHECK: vmla.f32 q0, q0, d0[0]
+; CHECK: vmov.i32 q9, #0x3F000000
+; CHECK: vmov d0, d17
+; CHECK: vmla.f32 q8, q8, d0[0]
   %0 = shufflevector <4 x float> zeroinitializer, <4 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1]
   %1 = insertelement <4 x float> %0, float undef, i32 1 ; <<4 x float>> [#uses=1]
   %2 = insertelement <4 x float> %1, float undef, i32 2 ; <<4 x float>> [#uses=1]
@@ -19,6 +19,26 @@
   %1 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind ; <<4 x float>> [#uses=1]
   store float 0.000000e+00, float* undef, align 4
   %2 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind ; <<4 x float>> [#uses=1]
+  %ld3 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
+  store float 0.000000e+00, float* undef, align 4
+  %ld4 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
+  store float 0.000000e+00, float* undef, align 4
+  %ld5 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
+  store float 0.000000e+00, float* undef, align 4
+  %ld6 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
+  store float 0.000000e+00, float* undef, align 4
+  %ld7 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
+  store float 0.000000e+00, float* undef, align 4
+  %ld8 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
+  store float 0.000000e+00, float* undef, align 4
+  %ld9 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
+  store float 0.000000e+00, float* undef, align 4
+  %ld10 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
+  store float 0.000000e+00, float* undef, align 4
+  %ld11 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
+  store float 0.000000e+00, float* undef, align 4
+  %ld12 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
+  store float 0.000000e+00, float* undef, align 4
   %val173 = load <4 x float>* undef ; <<4 x float>> [#uses=1]
   br label %bb4
 
@@ -43,7 +63,16 @@
   %18 = fmul <4 x float> %17, %val173 ; <<4 x float>> [#uses=1]
   %19 = shufflevector <4 x float> %18, <4 x float> undef, <2 x i32> ; <<2 x float>> [#uses=1]
   %20 = shufflevector <2 x float> %19, <2 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1]
-  %21 = fadd <4 x float> zeroinitializer, %20 ; <<4 x float>> [#uses=2]
+  %tmp1 = fadd <4 x float> %20, %ld3
+  %tmp2 = fadd <4 x float> %tmp1, %ld4
+  %tmp3 = fadd <4 x float> %tmp2, %ld5
+  %tmp4 = fadd <4 x float> %tmp3, %ld6
+  %tmp5 = fadd <4 x float> %tmp4, %ld7
+  %tmp6 = fadd <4 x float> %tmp5, %ld8
+  %tmp7 = fadd <4 x float> %tmp6, %ld9
+  %tmp8 = fadd <4 x float> %tmp7, %ld10
+  %tmp9 = fadd <4 x float> %tmp8, %ld11
+  %21 = fadd <4 x float> %tmp9, %ld12
   %22 = fcmp ogt <4 x float> %besterror.0.2264, %21 ; <<4 x i1>> [#uses=0]
   %tmp = extractelement <4 x i1> %22, i32 0
   br i1 %tmp, label %bb193, label %bb186
@@ -160,9 +160,9 @@
 ; rdar://7923010
 define <4 x i32> @vcgt_zext(<4 x float>* %A, <4 x float>* %B) nounwind {
 ;CHECK: vcgt_zext:
-;CHECK: vcgt.f32 q0
-;CHECK: vmov.i32 q1, #0x1
-;CHECK: vand q0, q0, q1
+;CHECK: vcgt.f32 q8
+;CHECK: vmov.i32 q9, #0x1
+;CHECK: vand q8, q8, q9
   %tmp1 = load <4 x float>* %A
   %tmp2 = load <4 x float>* %B
   %tmp3 = fcmp ogt <4 x float> %tmp1, %tmp2
@@ -95,7 +95,7 @@
 
 define arm_aapcs_vfpcc void @test_vget_laneu16() nounwind {
 entry:
-; CHECK: vmov.u16 r0, d0[1]
+; CHECK: vmov.u16 r0, d{{.*}}[1]
   %arg0_uint16x4_t = alloca <4 x i16> ; <<4 x i16>*> [#uses=1]
   %out_uint16_t = alloca i16 ; <i16*> [#uses=1]
   %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
@@ -110,7 +110,7 @@
 
 define arm_aapcs_vfpcc void @test_vget_laneu8() nounwind {
 entry:
-; CHECK: vmov.u8 r0, d0[1]
+; CHECK: vmov.u8 r0, d{{.*}}[1]
   %arg0_uint8x8_t = alloca <8 x i8> ; <<8 x i8>*> [#uses=1]
   %out_uint8_t = alloca i8 ; <i8*> [#uses=1]
   %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
@@ -125,7 +125,7 @@
 
 define arm_aapcs_vfpcc void @test_vgetQ_laneu16() nounwind {
 entry:
-; CHECK: vmov.u16 r0, d0[1]
+; CHECK: vmov.u16 r0, d{{.*}}[1]
   %arg0_uint16x8_t = alloca <8 x i16> ; <<8 x i16>*> [#uses=1]
   %out_uint16_t = alloca i16 ; <i16*> [#uses=1]
   %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
@@ -140,7 +140,7 @@
 
 define arm_aapcs_vfpcc void @test_vgetQ_laneu8() nounwind {
 entry:
-; CHECK: vmov.u8 r0, d0[1]
+; CHECK: vmov.u8 r0, d{{.*}}[1]
   %arg0_uint8x16_t = alloca <16 x i8> ; <<16 x i8>*> [#uses=1]
   %out_uint8_t = alloca i8 ; <i8*> [#uses=1]
   %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
@@ -2,7 +2,7 @@
 define <8 x i8> @vld1i8(i8* %A) nounwind {
 ;CHECK: vld1i8:
 ;Check the alignment value. Max for this instruction is 64 bits:
-;CHECK: vld1.8 {d0}, [r0, :64]
+;CHECK: vld1.8 {d16}, [r0, :64]
   %tmp1 = call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %A, i32 16)
   ret <8 x i8> %tmp1
 }
@@ -42,7 +42,7 @@
 define <16 x i8> @vld1Qi8(i8* %A) nounwind {
 ;CHECK: vld1Qi8:
 ;Check the alignment value. Max for this instruction is 128 bits:
-;CHECK: vld1.8 {d0, d1}, [r0, :64]
+;CHECK: vld1.8 {d16, d17}, [r0, :64]
   %tmp1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %A, i32 8)
   ret <16 x i8> %tmp1
 }
@@ -50,7 +50,7 @@
 define <8 x i16> @vld1Qi16(i16* %A) nounwind {
 ;CHECK: vld1Qi16:
 ;Check the alignment value. Max for this instruction is 128 bits:
-;CHECK: vld1.16 {d0, d1}, [r0, :128]
+;CHECK: vld1.16 {d16, d17}, [r0, :128]
   %tmp0 = bitcast i16* %A to i8*
   %tmp1 = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %tmp0, i32 32)
   ret <8 x i16> %tmp1
@@ -13,7 +13,7 @@
 define <8 x i8> @vld2i8(i8* %A) nounwind {
 ;CHECK: vld2i8:
 ;Check the alignment value. Max for this instruction is 128 bits:
-;CHECK: vld2.8 {d0, d1}, [r0, :64]
+;CHECK: vld2.8 {d16, d17}, [r0, :64]
   %tmp1 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2.v8i8(i8* %A, i32 8)
   %tmp2 = extractvalue %struct.__neon_int8x8x2_t %tmp1, 0
   %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp1, 1
@@ -24,7 +24,7 @@
 define <4 x i16> @vld2i16(i16* %A) nounwind {
 ;CHECK: vld2i16:
 ;Check the alignment value. Max for this instruction is 128 bits:
-;CHECK: vld2.16 {d0, d1}, [r0, :128]
+;CHECK: vld2.16 {d16, d17}, [r0, :128]
   %tmp0 = bitcast i16* %A to i8*
   %tmp1 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2.v4i16(i8* %tmp0, i32 32)
   %tmp2 = extractvalue %struct.__neon_int16x4x2_t %tmp1, 0
@@ -58,7 +58,7 @@
 define <1 x i64> @vld2i64(i64* %A) nounwind {
 ;CHECK: vld2i64:
 ;Check the alignment value. Max for this instruction is 128 bits:
-;CHECK: vld1.64 {d0, d1}, [r0, :128]
+;CHECK: vld1.64 {d16, d17}, [r0, :128]
   %tmp0 = bitcast i64* %A to i8*
   %tmp1 = call %struct.__neon_int64x1x2_t @llvm.arm.neon.vld2.v1i64(i8* %tmp0, i32 32)
   %tmp2 = extractvalue %struct.__neon_int64x1x2_t %tmp1, 0
@@ -70,7 +70,7 @@
 define <16 x i8> @vld2Qi8(i8* %A) nounwind {
 ;CHECK: vld2Qi8:
 ;Check the alignment value. Max for this instruction is 256 bits:
-;CHECK: vld2.8 {d0, d1, d2, d3}, [r0, :64]
+;CHECK: vld2.8 {d16, d17, d18, d19}, [r0, :64]
   %tmp1 = call %struct.__neon_int8x16x2_t @llvm.arm.neon.vld2.v16i8(i8* %A, i32 8)
   %tmp2 = extractvalue %struct.__neon_int8x16x2_t %tmp1, 0
   %tmp3 = extractvalue %struct.__neon_int8x16x2_t %tmp1, 1
@@ -81,7 +81,7 @@
 define <8 x i16> @vld2Qi16(i16* %A) nounwind {
 ;CHECK: vld2Qi16:
 ;Check the alignment value. Max for this instruction is 256 bits:
-;CHECK: vld2.16 {d0, d1, d2, d3}, [r0, :128]
+;CHECK: vld2.16 {d16, d17, d18, d19}, [r0, :128]
   %tmp0 = bitcast i16* %A to i8*
   %tmp1 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2.v8i16(i8* %tmp0, i32 16)
   %tmp2 = extractvalue %struct.__neon_int16x8x2_t %tmp1, 0
@@ -93,7 +93,7 @@
 define <4 x i32> @vld2Qi32(i32* %A) nounwind {
 ;CHECK: vld2Qi32:
 ;Check the alignment value. Max for this instruction is 256 bits:
-;CHECK: vld2.32 {d0, d1, d2, d3}, [r0, :256]
+;CHECK: vld2.32 {d16, d17, d18, d19}, [r0, :256]
   %tmp0 = bitcast i32* %A to i8*
   %tmp1 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %tmp0, i32 64)
   %tmp2 = extractvalue %struct.__neon_int32x4x2_t %tmp1, 0
@@ -13,7 +13,7 @@
 define <8 x i8> @vld3i8(i8* %A) nounwind {
 ;CHECK: vld3i8:
 ;Check the alignment value. Max for this instruction is 64 bits:
-;CHECK: vld3.8 {d0, d1, d2}, [r0, :64]
+;CHECK: vld3.8 {d16, d17, d18}, [r0, :64]
   %tmp1 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A, i32 32)
   %tmp2 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 0
   %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp1, 2
@@ -57,7 +57,7 @@
 define <1 x i64> @vld3i64(i64* %A) nounwind {
 ;CHECK: vld3i64:
 ;Check the alignment value. Max for this instruction is 64 bits:
-;CHECK: vld1.64 {d0, d1, d2}, [r0, :64]
+;CHECK: vld1.64 {d16, d17, d18}, [r0, :64]
   %tmp0 = bitcast i64* %A to i8*
   %tmp1 = call %struct.__neon_int64x1x3_t @llvm.arm.neon.vld3.v1i64(i8* %tmp0, i32 16)
   %tmp2 = extractvalue %struct.__neon_int64x1x3_t %tmp1, 0
@@ -69,8 +69,8 @@
 define <16 x i8> @vld3Qi8(i8* %A) nounwind {
 ;CHECK: vld3Qi8:
 ;Check the alignment value. Max for this instruction is 64 bits:
-;CHECK: vld3.8 {d0, d2, d4}, [r0, :64]!
-;CHECK: vld3.8 {d1, d3, d5}, [r0, :64]
+;CHECK: vld3.8 {d16, d18, d20}, [r0, :64]!
+;CHECK: vld3.8 {d17, d19, d21}, [r0, :64]
   %tmp1 = call %struct.__neon_int8x16x3_t @llvm.arm.neon.vld3.v16i8(i8* %A, i32 32)
   %tmp2 = extractvalue %struct.__neon_int8x16x3_t %tmp1, 0
   %tmp3 = extractvalue %struct.__neon_int8x16x3_t %tmp1, 2
@@ -13,7 +13,7 @@
 define <8 x i8> @vld4i8(i8* %A) nounwind {
 ;CHECK: vld4i8:
 ;Check the alignment value. Max for this instruction is 256 bits:
-;CHECK: vld4.8 {d0, d1, d2, d3}, [r0, :64]
+;CHECK: vld4.8 {d16, d17, d18, d19}, [r0, :64]
   %tmp1 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4.v8i8(i8* %A, i32 8)
   %tmp2 = extractvalue %struct.__neon_int8x8x4_t %tmp1, 0
   %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp1, 2
@@ -24,7 +24,7 @@
 define <4 x i16> @vld4i16(i16* %A) nounwind {
 ;CHECK: vld4i16:
 ;Check the alignment value. Max for this instruction is 256 bits:
-;CHECK: vld4.16 {d0, d1, d2, d3}, [r0, :128]
+;CHECK: vld4.16 {d16, d17, d18, d19}, [r0, :128]
   %tmp0 = bitcast i16* %A to i8*
   %tmp1 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4.v4i16(i8* %tmp0, i32 16)
   %tmp2 = extractvalue %struct.__neon_int16x4x4_t %tmp1, 0
@@ -36,7 +36,7 @@
 define <2 x i32> @vld4i32(i32* %A) nounwind {
 ;CHECK: vld4i32:
 ;Check the alignment value. Max for this instruction is 256 bits:
-;CHECK: vld4.32 {d0, d1, d2, d3}, [r0, :256]
+;CHECK: vld4.32 {d16, d17, d18, d19}, [r0, :256]
   %tmp0 = bitcast i32* %A to i8*
   %tmp1 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4.v2i32(i8* %tmp0, i32 32)
   %tmp2 = extractvalue %struct.__neon_int32x2x4_t %tmp1, 0
@@ -59,7 +59,7 @@
 define <1 x i64> @vld4i64(i64* %A) nounwind {
 ;CHECK: vld4i64:
 ;Check the alignment value. Max for this instruction is 256 bits:
-;CHECK: vld1.64 {d0, d1, d2, d3}, [r0, :256]
+;CHECK: vld1.64 {d16, d17, d18, d19}, [r0, :256]
   %tmp0 = bitcast i64* %A to i8*
   %tmp1 = call %struct.__neon_int64x1x4_t @llvm.arm.neon.vld4.v1i64(i8* %tmp0, i32 64)
   %tmp2 = extractvalue %struct.__neon_int64x1x4_t %tmp1, 0
@@ -71,8 +71,8 @@
 define <16 x i8> @vld4Qi8(i8* %A) nounwind {
 ;CHECK: vld4Qi8:
 ;Check the alignment value. Max for this instruction is 256 bits:
-;CHECK: vld4.8 {d0, d2, d4, d6}, [r0, :256]!
-;CHECK: vld4.8 {d1, d3, d5, d7}, [r0, :256]
+;CHECK: vld4.8 {d16, d18, d20, d22}, [r0, :256]!
+;CHECK: vld4.8 {d17, d19, d21, d23}, [r0, :256]
   %tmp1 = call %struct.__neon_int8x16x4_t @llvm.arm.neon.vld4.v16i8(i8* %A, i32 64)
   %tmp2 = extractvalue %struct.__neon_int8x16x4_t %tmp1, 0
   %tmp3 = extractvalue %struct.__neon_int8x16x4_t %tmp1, 2
@@ -83,8 +83,8 @@
 define <8 x i16> @vld4Qi16(i16* %A) nounwind {
 ;CHECK: vld4Qi16:
 ;Check for no alignment specifier.
-;CHECK: vld4.16 {d0, d2, d4, d6}, [r0]!
-;CHECK: vld4.16 {d1, d3, d5, d7}, [r0]
+;CHECK: vld4.16 {d16, d18, d20, d22}, [r0]!
+;CHECK: vld4.16 {d17, d19, d21, d23}, [r0]
   %tmp0 = bitcast i16* %A to i8*
   %tmp1 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4.v8i16(i8* %tmp0, i32 1)
   %tmp2 = extractvalue %struct.__neon_int16x8x4_t %tmp1, 0
@@ -1,169 +1,169 @@
 
 define <8 x i8> @v_movi8() nounwind {
 ;CHECK: v_movi8:
-;CHECK: vmov.i8 d0, #0x8
+;CHECK: vmov.i8 d{{.*}}, #0x8
   ret <8 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
 }
 
 define <4 x i16> @v_movi16a() nounwind {
 ;CHECK: v_movi16a:
-;CHECK: vmov.i16 d0, #0x10
+;CHECK: vmov.i16 d{{.*}}, #0x10
   ret <4 x i16> < i16 16, i16 16, i16 16, i16 16 >
 }
 
 define <4 x i16> @v_movi16b() nounwind {
 ;CHECK: v_movi16b:
-;CHECK: vmov.i16 d0, #0x1000
+;CHECK: vmov.i16 d{{.*}}, #0x1000
   ret <4 x i16> < i16 4096, i16 4096, i16 4096, i16 4096 >
 }
 
 define <4 x i16> @v_mvni16a() nounwind {
 ;CHECK: v_mvni16a:
-;CHECK: vmvn.i16 d0, #0x10
+;CHECK: vmvn.i16 d{{.*}}, #0x10
   ret <4 x i16> < i16 65519, i16 65519, i16 65519, i16 65519 >
 }
 
 define <4 x i16> @v_mvni16b() nounwind {
 ;CHECK: v_mvni16b:
-;CHECK: vmvn.i16 d0, #0x1000
+;CHECK: vmvn.i16 d{{.*}}, #0x1000
   ret <4 x i16> < i16 61439, i16 61439, i16 61439, i16 61439 >
 }
 
 define <2 x i32> @v_movi32a() nounwind {
 ;CHECK: v_movi32a:
-;CHECK: vmov.i32 d0, #0x20
+;CHECK: vmov.i32 d{{.*}}, #0x20
   ret <2 x i32> < i32 32, i32 32 >
 }
 
 define <2 x i32> @v_movi32b() nounwind {
 ;CHECK: v_movi32b:
-;CHECK: vmov.i32 d0, #0x2000
+;CHECK: vmov.i32 d{{.*}}, #0x2000
   ret <2 x i32> < i32 8192, i32 8192 >
 }
 
 define <2 x i32> @v_movi32c() nounwind {
 ;CHECK: v_movi32c:
-;CHECK: vmov.i32 d0, #0x200000
+;CHECK: vmov.i32 d{{.*}}, #0x200000
   ret <2 x i32> < i32 2097152, i32 2097152 >
 }
 
 define <2 x i32> @v_movi32d() nounwind {
 ;CHECK: v_movi32d:
-;CHECK: vmov.i32 d0, #0x20000000
+;CHECK: vmov.i32 d{{.*}}, #0x20000000
   ret <2 x i32> < i32 536870912, i32 536870912 >
 }
 
 define <2 x i32> @v_movi32e() nounwind {
 ;CHECK: v_movi32e:
-;CHECK: vmov.i32 d0, #0x20FF
+;CHECK: vmov.i32 d{{.*}}, #0x20FF
   ret <2 x i32> < i32 8447, i32 8447 >
 }
 
 define <2 x i32> @v_movi32f() nounwind {
 ;CHECK: v_movi32f:
-;CHECK: vmov.i32 d0, #0x20FFFF
+;CHECK: vmov.i32 d{{.*}}, #0x20FFFF
   ret <2 x i32> < i32 2162687, i32 2162687 >
 }
 
 define <2 x i32> @v_mvni32a() nounwind {
 ;CHECK: v_mvni32a:
-;CHECK: vmvn.i32 d0, #0x20
+;CHECK: vmvn.i32 d{{.*}}, #0x20
   ret <2 x i32> < i32 4294967263, i32 4294967263 >
 }
 
 define <2 x i32> @v_mvni32b() nounwind {
 ;CHECK: v_mvni32b:
-;CHECK: vmvn.i32 d0, #0x2000
+;CHECK: vmvn.i32 d{{.*}}, #0x2000
   ret <2 x i32> < i32 4294959103, i32 4294959103 >
 }
 
 define <2 x i32> @v_mvni32c() nounwind {
 ;CHECK: v_mvni32c:
-;CHECK: vmvn.i32 d0, #0x200000
+;CHECK: vmvn.i32 d{{.*}}, #0x200000
   ret <2 x i32> < i32 4292870143, i32 4292870143 >
 }
 
 define <2 x i32> @v_mvni32d() nounwind {
 ;CHECK: v_mvni32d:
-;CHECK: vmvn.i32 d0, #0x20000000
+;CHECK: vmvn.i32 d{{.*}}, #0x20000000
   ret <2 x i32> < i32 3758096383, i32 3758096383 >
 }
 
 define <2 x i32> @v_mvni32e() nounwind {
 ;CHECK: v_mvni32e:
-;CHECK: vmvn.i32 d0, #0x20FF
+;CHECK: vmvn.i32 d{{.*}}, #0x20FF
   ret <2 x i32> < i32 4294958848, i32 4294958848 >
 }
 
 define <2 x i32> @v_mvni32f() nounwind {
 ;CHECK: v_mvni32f:
-;CHECK: vmvn.i32 d0, #0x20FFFF
+;CHECK: vmvn.i32 d{{.*}}, #0x20FFFF
   ret <2 x i32> < i32 4292804608, i32 4292804608 >
 }
 
 define <1 x i64> @v_movi64() nounwind {
 ;CHECK: v_movi64:
-;CHECK: vmov.i64 d0, #0xFF0000FF0000FFFF
+;CHECK: vmov.i64 d{{.*}}, #0xFF0000FF0000FFFF
   ret <1 x i64> < i64 18374687574888349695 >
 }
 
 define <16 x i8> @v_movQi8() nounwind {
 ;CHECK: v_movQi8:
-;CHECK: vmov.i8 q0, #0x8
+;CHECK: vmov.i8 q{{.*}}, #0x8
   ret <16 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
 }
 
 define <8 x i16> @v_movQi16a() nounwind {
 ;CHECK: v_movQi16a:
-;CHECK: vmov.i16 q0, #0x10
+;CHECK: vmov.i16 q{{.*}}, #0x10
   ret <8 x i16> < i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16 >
 }
 
 define <8 x i16> @v_movQi16b() nounwind {
 ;CHECK: v_movQi16b:
-;CHECK: vmov.i16 q0, #0x1000
+;CHECK: vmov.i16 q{{.*}}, #0x1000
   ret <8 x i16> < i16 4096, i16 4096, i16 4096, i16 4096, i16 4096, i16 4096, i16 4096, i16 4096 >
 }
 
 define <4 x i32> @v_movQi32a() nounwind {
 ;CHECK: v_movQi32a:
-;CHECK: vmov.i32 q0, #0x20
+;CHECK: vmov.i32 q{{.*}}, #0x20
   ret <4 x i32> < i32 32, i32 32, i32 32, i32 32 >
 }
 
 define <4 x i32> @v_movQi32b() nounwind {
 ;CHECK: v_movQi32b:
-;CHECK: vmov.i32 q0, #0x2000
+;CHECK: vmov.i32 q{{.*}}, #0x2000
   ret <4 x i32> < i32 8192, i32 8192, i32 8192, i32 8192 >
 }
 
 define <4 x i32> @v_movQi32c() nounwind {
 ;CHECK: v_movQi32c:
-;CHECK: vmov.i32 q0, #0x200000
+;CHECK: vmov.i32 q{{.*}}, #0x200000
   ret <4 x i32> < i32 2097152, i32 2097152, i32 2097152, i32 2097152 >
 }
 
 define <4 x i32> @v_movQi32d() nounwind {
 ;CHECK: v_movQi32d:
-;CHECK: vmov.i32 q0, #0x20000000
+;CHECK: vmov.i32 q{{.*}}, #0x20000000
   ret <4 x i32> < i32 536870912, i32 536870912, i32 536870912, i32 536870912 >
 }
 
 define <4 x i32> @v_movQi32e() nounwind {
 ;CHECK: v_movQi32e:
-;CHECK: vmov.i32 q0, #0x20FF
+;CHECK: vmov.i32 q{{.*}}, #0x20FF
   ret <4 x i32> < i32 8447, i32 8447, i32 8447, i32 8447 >
 }
 
 define <4 x i32> @v_movQi32f() nounwind {
 ;CHECK: v_movQi32f:
-;CHECK: vmov.i32 q0, #0x20FFFF
+;CHECK: vmov.i32 q{{.*}}, #0x20FFFF
   ret <4 x i32> < i32 2162687, i32 2162687, i32 2162687, i32 2162687 >
 }
 
 define <2 x i64> @v_movQi64() nounwind {
 ;CHECK: v_movQi64:
-;CHECK: vmov.i64 q0, #0xFF0000FF0000FFFF
+;CHECK: vmov.i64 q{{.*}}, #0xFF0000FF0000FFFF
   ret <2 x i64> < i64 18374687574888349695, i64 18374687574888349695 >
 }
 
@@ -172,7 +172,7 @@
 define void @vdupn128(%struct.int8x8_t* noalias nocapture sret %agg.result) nounwind {
 entry:
 ;CHECK: vdupn128:
-;CHECK: vmov.i8 d0, #0x80
+;CHECK: vmov.i8 d{{.*}}, #0x80
   %0 = getelementptr inbounds %struct.int8x8_t* %agg.result, i32 0, i32 0 ; <<8 x i8>*> [#uses=1]
   store <8 x i8> <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>, <8 x i8>* %0, align 8
   ret void
@@ -181,7 +181,7 @@
 define void @vdupnneg75(%struct.int8x8_t* noalias nocapture sret %agg.result) nounwind {
 entry:
 ;CHECK: vdupnneg75:
-;CHECK: vmov.i8 d0, #0xB5
+;CHECK: vmov.i8 d{{.*}}, #0xB5
   %0 = getelementptr inbounds %struct.int8x8_t* %agg.result, i32 0, i32 0 ; <<8 x i8>*> [#uses=1]
   store <8 x i8> <i8 -75, i8 -75, i8 -75, i8 -75, i8 -75, i8 -75, i8 -75, i8 -75>, <8 x i8>* %0, align 8
   ret void
@@ -2,7 +2,7 @@
 define void @vst1i8(i8* %A, <8 x i8>* %B) nounwind {
 ;CHECK: vst1i8:
 ;Check the alignment value. Max for this instruction is 64 bits:
-;CHECK: vst1.8 {d0}, [r0, :64]
+;CHECK: vst1.8 {d16}, [r0, :64]
   %tmp1 = load <8 x i8>* %B
   call void @llvm.arm.neon.vst1.v8i8(i8* %A, <8 x i8> %tmp1, i32 16)
   ret void
@@ -47,7 +47,7 @@
 define void @vst1Qi8(i8* %A, <16 x i8>* %B) nounwind {
 ;CHECK: vst1Qi8:
 ;Check the alignment value. Max for this instruction is 128 bits:
-;CHECK: vst1.8 {d0, d1}, [r0, :64]
+;CHECK: vst1.8 {d16, d17}, [r0, :64]
   %tmp1 = load <16 x i8>* %B
   call void @llvm.arm.neon.vst1.v16i8(i8* %A, <16 x i8> %tmp1, i32 8)
   ret void
@@ -56,7 +56,7 @@
 define void @vst1Qi16(i16* %A, <8 x i16>* %B) nounwind {
 ;CHECK: vst1Qi16:
 ;Check the alignment value. Max for this instruction is 128 bits:
-;CHECK: vst1.16 {d0, d1}, [r0, :128]
+;CHECK: vst1.16 {d16, d17}, [r0, :128]
   %tmp0 = bitcast i16* %A to i8*
   %tmp1 = load <8 x i16>* %B
   call void @llvm.arm.neon.vst1.v8i16(i8* %tmp0, <8 x i16> %tmp1, i32 32)
@@ -2,7 +2,7 @@
 define void @vst2i8(i8* %A, <8 x i8>* %B) nounwind {
 ;CHECK: vst2i8:
 ;Check the alignment value. Max for this instruction is 128 bits:
-;CHECK: vst2.8 {d0, d1}, [r0, :64]
+;CHECK: vst2.8 {d16, d17}, [r0, :64]
   %tmp1 = load <8 x i8>* %B
   call void @llvm.arm.neon.vst2.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 8)
   ret void
@@ -11,7 +11,7 @@
 define void @vst2i16(i16* %A, <4 x i16>* %B) nounwind {
 ;CHECK: vst2i16:
 ;Check the alignment value. Max for this instruction is 128 bits:
-;CHECK: vst2.16 {d0, d1}, [r0, :128]
+;CHECK: vst2.16 {d16, d17}, [r0, :128]
   %tmp0 = bitcast i16* %A to i8*
   %tmp1 = load <4 x i16>* %B
   call void @llvm.arm.neon.vst2.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 32)
@@ -39,7 +39,7 @@
 define void @vst2i64(i64* %A, <1 x i64>* %B) nounwind {
 ;CHECK: vst2i64:
 ;Check the alignment value. Max for this instruction is 128 bits:
-;CHECK: vst1.64 {d0, d1}, [r0, :128]
+;CHECK: vst1.64 {d16, d17}, [r0, :128]
   %tmp0 = bitcast i64* %A to i8*
   %tmp1 = load <1 x i64>* %B
   call void @llvm.arm.neon.vst2.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 32)
@@ -49,7 +49,7 @@
 define void @vst2Qi8(i8* %A, <16 x i8>* %B) nounwind {
 ;CHECK: vst2Qi8:
 ;Check the alignment value. Max for this instruction is 256 bits:
-;CHECK: vst2.8 {d0, d1, d2, d3}, [r0, :64]
+;CHECK: vst2.8 {d16, d17, d18, d19}, [r0, :64]
   %tmp1 = load <16 x i8>* %B
   call void @llvm.arm.neon.vst2.v16i8(i8* %A, <16 x i8> %tmp1, <16 x i8> %tmp1, i32 8)
   ret void
@@ -58,7 +58,7 @@
 define void @vst2Qi16(i16* %A, <8 x i16>* %B) nounwind {
 ;CHECK: vst2Qi16:
 ;Check the alignment value. Max for this instruction is 256 bits:
-;CHECK: vst2.16 {d0, d1, d2, d3}, [r0, :128]
+;CHECK: vst2.16 {d16, d17, d18, d19}, [r0, :128]
   %tmp0 = bitcast i16* %A to i8*
   %tmp1 = load <8 x i16>* %B
   call void @llvm.arm.neon.vst2.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 16)
@@ -68,7 +68,7 @@
 define void @vst2Qi32(i32* %A, <4 x i32>* %B) nounwind {
 ;CHECK: vst2Qi32:
 ;Check the alignment value. Max for this instruction is 256 bits:
-;CHECK: vst2.32 {d0, d1, d2, d3}, [r0, :256]
+;CHECK: vst2.32 {d16, d17, d18, d19}, [r0, :256]
   %tmp0 = bitcast i32* %A to i8*
   %tmp1 = load <4 x i32>* %B
   call void @llvm.arm.neon.vst2.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 64)
@@ -2,7 +2,7 @@
 define void @vst4i8(i8* %A, <8 x i8>* %B) nounwind {
 ;CHECK: vst4i8:
 ;Check the alignment value. Max for this instruction is 256 bits:
-;CHECK: vst4.8 {d0, d1, d2, d3}, [r0, :64]
+;CHECK: vst4.8 {d16, d17, d18, d19}, [r0, :64]
   %tmp1 = load <8 x i8>* %B
   call void @llvm.arm.neon.vst4.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 8)
   ret void
@@ -11,7 +11,7 @@
 define void @vst4i16(i16* %A, <4 x i16>* %B) nounwind {
 ;CHECK: vst4i16:
 ;Check the alignment value. Max for this instruction is 256 bits:
-;CHECK: vst4.16 {d0, d1, d2, d3}, [r0, :128]
+;CHECK: vst4.16 {d16, d17, d18, d19}, [r0, :128]
   %tmp0 = bitcast i16* %A to i8*
   %tmp1 = load <4 x i16>* %B
   call void @llvm.arm.neon.vst4.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 16)
@@ -21,7 +21,7 @@
 define void @vst4i32(i32* %A, <2 x i32>* %B) nounwind {
 ;CHECK: vst4i32:
 ;Check the alignment value. Max for this instruction is 256 bits:
-;CHECK: vst4.32 {d0, d1, d2, d3}, [r0, :256]
+;CHECK: vst4.32 {d16, d17, d18, d19}, [r0, :256]
   %tmp0 = bitcast i32* %A to i8*
   %tmp1 = load <2 x i32>* %B
   call void @llvm.arm.neon.vst4.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 32)
@@ -40,7 +40,7 @@
 define void @vst4i64(i64* %A, <1 x i64>* %B) nounwind {
 ;CHECK: vst4i64:
 ;Check the alignment value. Max for this instruction is 256 bits:
-;CHECK: vst1.64 {d0, d1, d2, d3}, [r0, :256]
+;CHECK: vst1.64 {d16, d17, d18, d19}, [r0, :256]
   %tmp0 = bitcast i64* %A to i8*
   %tmp1 = load <1 x i64>* %B
   call void @llvm.arm.neon.vst4.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 64)
@@ -50,8 +50,8 @@
 define void @vst4Qi8(i8* %A, <16 x i8>* %B) nounwind {
 ;CHECK: vst4Qi8:
 ;Check the alignment value. Max for this instruction is 256 bits:
-;CHECK: vst4.8 {d0, d2, d4, d6}, [r0, :256]!
-;CHECK: vst4.8 {d1, d3, d5, d7}, [r0, :256]
+;CHECK: vst4.8 {d16, d18, d20, d22}, [r0, :256]!
+;CHECK: vst4.8 {d17, d19, d21, d23}, [r0, :256]
   %tmp1 = load <16 x i8>* %B
   call void @llvm.arm.neon.vst4.v16i8(i8* %A, <16 x i8> %tmp1, <16 x i8> %tmp1, <16 x i8> %tmp1, <16 x i8> %tmp1, i32 64)
   ret void
@@ -60,8 +60,8 @@
 define void @vst4Qi16(i16* %A, <8 x i16>* %B) nounwind {
 ;CHECK: vst4Qi16:
 ;Check for no alignment specifier.
-;CHECK: vst4.16 {d0, d2, d4, d6}, [r0]!
-;CHECK: vst4.16 {d1, d3, d5, d7}, [r0]
+;CHECK: vst4.16 {d16, d18, d20, d22}, [r0]!
+;CHECK: vst4.16 {d17, d19, d21, d23}, [r0]
   %tmp0 = bitcast i16* %A to i8*
   %tmp1 = load <8 x i16>* %B
   call void @llvm.arm.neon.vst4.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1)
@@ -22,8 +22,8 @@
   %4 = insertelement <2 x double> %2, double %V.0.ph, i32 1 ; <<2 x double>> [#uses=2]
 ; Constant pool load followed by add.
 ; Then clobber the loaded register, not the sum.
-; CHECK: vldr.64 [[LDR:d.]]
-; CHECK: vadd.f64 [[ADD:d.]], [[LDR]], [[LDR]]
+; CHECK: vldr.64 [[LDR:d.*]],
+; CHECK: vadd.f64 [[ADD:d.*]], [[LDR]], [[LDR]]
 ; CHECK: vmov.f64 [[LDR]]
   %5 = fadd <2 x double> %3, %3 ; <<2 x double>> [#uses=2]
   %6 = fadd <2 x double> %4, %4 ; <<2 x double>> [#uses=2]
@@ -18,6 +18,6 @@
   %0 = fmul double %a, %b
 ; CORTEXM3: blx ___muldf3
 ; CORTEXM4: blx ___muldf3
-; CORTEXA8: vmul.f64 d0, d1, d0
+; CORTEXA8: vmul.f64 d16, d17, d16
   ret double %0
 }
@@ -55,7 +55,7 @@
 entry:
 ; CHECK: t2:
 ; CHECK: adr r{{.}}, #LCPI1_0
-; CHECK: vldmia r3, {d0, d1}
+; CHECK: vldmia r3, {d16, d17}
   br i1 undef, label %bb1, label %bb2
 
 bb1:
@@ -19,6 +19,26 @@
   %1 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind ; <<4 x float>> [#uses=1]
   store float 0.000000e+00, float* undef, align 4
   %2 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind ; <<4 x float>> [#uses=1]
+  %ld3 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
+  store float 0.000000e+00, float* undef, align 4
+  %ld4 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
+  store float 0.000000e+00, float* undef, align 4
+  %ld5 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
+  store float 0.000000e+00, float* undef, align 4
+  %ld6 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
+  store float 0.000000e+00, float* undef, align 4
+  %ld7 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
+  store float 0.000000e+00, float* undef, align 4
+  %ld8 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
+  store float 0.000000e+00, float* undef, align 4
+  %ld9 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
+  store float 0.000000e+00, float* undef, align 4
+  %ld10 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
+  store float 0.000000e+00, float* undef, align 4
+  %ld11 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
+  store float 0.000000e+00, float* undef, align 4
+  %ld12 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* undef, i32 1) nounwind
+  store float 0.000000e+00, float* undef, align 4
   %val173 = load <4 x float>* undef ; <<4 x float>> [#uses=1]
   br label %bb4
 
@@ -43,7 +63,16 @@
   %18 = fmul <4 x float> %17, %val173 ; <<4 x float>> [#uses=1]
   %19 = shufflevector <4 x float> %18, <4 x float> undef, <2 x i32> ; <<2 x float>> [#uses=1]
   %20 = shufflevector <2 x float> %19, <2 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1]
-  %21 = fadd <4 x float> zeroinitializer, %20 ; <<4 x float>> [#uses=2]
+  %tmp1 = fadd <4 x float> %20, %ld3
+  %tmp2 = fadd <4 x float> %tmp1, %ld4
+  %tmp3 = fadd <4 x float> %tmp2, %ld5
+  %tmp4 = fadd <4 x float> %tmp3, %ld6
+  %tmp5 = fadd <4 x float> %tmp4, %ld7
+  %tmp6 = fadd <4 x float> %tmp5, %ld8
+  %tmp7 = fadd <4 x float> %tmp6, %ld9
+  %tmp8 = fadd <4 x float> %tmp7, %ld10
+  %tmp9 = fadd <4 x float> %tmp8, %ld11
+  %21 = fadd <4 x float> %tmp9, %ld12
  %22 = fcmp ogt <4 x float> %besterror.0.2264, %21 ; <<4 x i1>> [#uses=0]
   %tmp = extractelement <4 x i1> %22, i32 0
   br i1 %tmp, label %bb193, label %bb186