llvm.org GIT mirror llvm / 82509e5
Fix a number of problems with ARM fused multiply add/subtract instructions. 1. The new instruction itinerary entries are not properly described. 2. The asm parser can't handle vfms and vfnms. 3. There were no assembler, disassembler test cases. 4. HasNEON2 has the wrong assembler predicate. rdar://10139676 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@154456 91177308-0d34-0410-b5e6-96231b3b80d8 Evan Cheng 8 years ago
10 changed file(s) with 160 addition(s) and 10 deletion(s). Raw diff Collapse all Expand all
7575 def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP",
7676 "true",
7777 "Use NEON for single precision FP">;
78 // Allow more precision in FP computation
79 def FPContractions : Predicate<"!TM.Options.NoExcessFPPrecision">;
8078
8179 // Disable 32-bit to 16-bit narrowing for experimentation.
8280 def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true",
180180 AssemblerPredicate<"FeatureVFP3">;
181181 def HasVFP4 : Predicate<"Subtarget->hasVFP4()">,
182182 AssemblerPredicate<"FeatureVFP4">;
183 def NoVFP4 : Predicate<"!Subtarget->hasVFP4()">;
183 def NoVFP4 : Predicate<"!Subtarget->hasVFP4()">;
184184 def HasNEON : Predicate<"Subtarget->hasNEON()">,
185185 AssemblerPredicate<"FeatureNEON">;
186186 def HasNEON2 : Predicate<"Subtarget->hasNEON2()">,
187 AssemblerPredicate<"FeatureNEON2">;
187 AssemblerPredicate<"FeatureNEON,FeatureVFP4">;
188188 def NoNEON2 : Predicate<"!Subtarget->hasNEON2()">;
189189 def HasFP16 : Predicate<"Subtarget->hasFP16()">,
190190 AssemblerPredicate<"FeatureFP16">;
219219 def UseMovt : Predicate<"Subtarget->useMovt()">;
220220 def DontUseMovt : Predicate<"!Subtarget->useMovt()">;
221221 def UseFPVMLx : Predicate<"Subtarget->useFPVMLx()">;
222
223 // Allow more precision in FP computation
224 def FPContractions : Predicate<"!TM.Options.NoExcessFPPrecision">;
222225
223226 //===----------------------------------------------------------------------===//
224227 // ARM Flag Definitions.
41144114 "vqdmlsl", "s", int_arm_neon_vqdmlsl>;
41154115 defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b111, "vqdmlsl", "s", int_arm_neon_vqdmlsl>;
41164116
4117
41184117 // Fused Vector Multiply-Accumulate and Fused Multiply-Subtract Operations.
41194118 def VFMAfd : N3VDMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACD, "vfma", "f32",
41204119 v2f32, fmul_su, fadd_mlx>,
41354134 // Match @llvm.fma.* intrinsics
41364135 def : Pat<(fma (v2f32 DPR:$src1), (v2f32 DPR:$Vn), (v2f32 DPR:$Vm)),
41374136 (VFMAfd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
4138 Requires<[HasNEON, HasVFP4]>;
4137 Requires<[HasNEON2]>;
41394138 def : Pat<(fma (v4f32 QPR:$src1), (v4f32 QPR:$Vn), (v4f32 QPR:$Vm)),
41404139 (VFMAfq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
4141 Requires<[HasNEON, HasVFP4]>;
4140 Requires<[HasNEON2]>;
41424141
41434142 // Vector Subtract Operations.
41444143
54965495 def : N3VSMulOpPat,
54975496 Requires<[HasNEON, UseNEONForFP, UseFPVMLx, NoNEON2]>;
54985497 def : N3VSMulOpPat,
5499 Requires<[HasNEON2, UseNEONForFP,FPContractions]>;
5498 Requires<[HasNEON2, UseNEONForFP, FPContractions]>;
55005499 def : N3VSMulOpPat,
5501 Requires<[HasNEON2, UseNEONForFP,FPContractions]>;
5500 Requires<[HasNEON2, UseNEONForFP, FPContractions]>;
55025501 def : N2VSPat;
55035502 def : N2VSPat;
55045503 def : N3VSPat;
323323 InstrStage<19, [A8_NPipe], 0>,
324324 InstrStage<19, [A8_NLSPipe]>], [19, 2, 1, 1]>,
325325 //
326 // Single-precision Fused FP MAC
327 InstrItinData,
328 InstrStage<1, [A8_NPipe]>], [7, 2, 1, 1]>,
329 //
330 // Double-precision Fused FP MAC
331 InstrItinData,
332 InstrStage<19, [A8_NPipe], 0>,
333 InstrStage<19, [A8_NLSPipe]>], [19, 2, 1, 1]>,
334 //
326335 // Single-precision FP DIV
327336 InstrItinData,
328337 InstrStage<20, [A8_NPipe], 0>,
857866 // Result written in N9, but that is relative to the last cycle of multicycle,
858867 // so we use 10 for those cases
859868 InstrItinData,
869 InstrStage<2, [A8_NPipe]>], [10, 3, 2, 2]>,
870 //
871 // Double-register Fused FP Multiply-Accumulate
872 InstrItinData,
873 InstrStage<1, [A8_NPipe]>], [9, 3, 2, 2]>,
874 //
875 // Quad-register Fused FP Multiply-Accumulate
876 // Result written in N9, but that is relative to the last cycle of multicycle,
877 // so we use 10 for those cases
878 InstrItinData,
860879 InstrStage<2, [A8_NPipe]>], [10, 3, 2, 2]>,
861880 //
862881 // Double-register Reciprocal Step
603603 InstrStage<2, [A9_NPipe]>],
604604 [9, 1, 1, 1]>,
605605 //
606 // Single-precision Fused FP MAC
607 InstrItinData,
608 InstrStage<1, [A9_MUX0], 0>,
609 InstrStage<1, [A9_DRegsVFP], 0, Required>,
610 InstrStage<9, [A9_DRegsN], 0, Reserved>,
611 InstrStage<1, [A9_NPipe]>],
612 [8, 1, 1, 1]>,
613 //
614 // Double-precision Fused FP MAC
615 InstrItinData,
616 InstrStage<1, [A9_MUX0], 0>,
617 InstrStage<1, [A9_DRegsVFP], 0, Required>,
618 InstrStage<10, [A9_DRegsN], 0, Reserved>,
619 InstrStage<2, [A9_NPipe]>],
620 [9, 1, 1, 1]>,
621 //
606622 // Single-precision FP DIV
607623 InstrItinData,
608624 InstrStage<1, [A9_MUX0], 0>,
16961712 InstrStage<4, [A9_NPipe]>],
16971713 [8, 4, 2, 1]>,
16981714 //
1715 // Double-register Fused FP Multiply-Accumulate
1716 InstrItinData,
1717 InstrStage<1, [A9_MUX0], 0>,
1718 InstrStage<1, [A9_DRegsN], 0, Required>,
1719 // Extra latency cycles since wbck is 7 cycles
1720 InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
1721 InstrStage<2, [A9_NPipe]>],
1722 [6, 3, 2, 1]>,
1723 //
1724 // Quad-register Fused FP Multiply-Accumulate
1725 // Result written in N9, but that is relative to the last cycle of multicycle,
1726 // so we use 10 for those cases
1727 InstrItinData,
1728 InstrStage<1, [A9_MUX0], 0>,
1729 InstrStage<1, [A9_DRegsN], 0, Required>,
1730 // Extra latency cycles since wbck is 9 cycles
1731 InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
1732 InstrStage<4, [A9_NPipe]>],
1733 [8, 4, 2, 1]>,
1734 //
16991735 // Double-register Reciprocal Step
17001736 InstrItinData,
17011737 InstrStage<1, [A9_MUX0], 0>,
242242 // Double-precision FP MAC
243243 InstrItinData], [9, 2, 2, 2]>,
244244 //
245 // Single-precision Fused FP MAC
246 InstrItinData], [9, 2, 2, 2]>,
247 //
248 // Double-precision Fused FP MAC
249 InstrItinData], [9, 2, 2, 2]>,
250 //
245251 // Single-precision FP DIV
246252 InstrItinData], [20, 2, 2]>,
247253 //
4444 bool HasV6T2Ops;
4545 bool HasV7Ops;
4646
47 /// HasVFPv2, HasVFPv3, HasVFPv4, HasNEON, HasNEONVFPv4 - Specify what
47 /// HasVFPv2, HasVFPv3, HasVFPv4, HasNEON, HasNEON2 - Specify what
4848 /// floating point ISAs are supported.
4949 bool HasVFPv2;
5050 bool HasVFPv3;
46584658 Mnemonic == "fmrs" || Mnemonic == "fsqrts" || Mnemonic == "fsubs" ||
46594659 Mnemonic == "fsts" || Mnemonic == "fcpys" || Mnemonic == "fdivs" ||
46604660 Mnemonic == "fmuls" || Mnemonic == "fcmps" || Mnemonic == "fcmpzs" ||
4661 Mnemonic == "vfms" || Mnemonic == "vfnms" ||
46614662 (Mnemonic == "movs" && isThumb()))) {
46624663 Mnemonic = Mnemonic.slice(0, Mnemonic.size() - 1);
46634664 CarrySetting = true;
47014702 Mnemonic == "orr" || Mnemonic == "mvn" ||
47024703 Mnemonic == "rsb" || Mnemonic == "rsc" || Mnemonic == "orn" ||
47034704 Mnemonic == "sbc" || Mnemonic == "eor" || Mnemonic == "neg" ||
4705 Mnemonic == "vfm" || Mnemonic == "vfnm" ||
47044706 (!isThumb() && (Mnemonic == "smull" || Mnemonic == "mov" ||
47054707 Mnemonic == "mla" || Mnemonic == "smlal" ||
47064708 Mnemonic == "umlal" || Mnemonic == "umull"))) {
0 @ RUN: llvm-mc < %s -triple armv7-unknown-unknown -show-encoding -mattr=+neon,+vfp4 | FileCheck %s --check-prefix=ARM
1 @ RUN: llvm-mc < %s -triple thumbv7-unknown-unknown -show-encoding -mattr=+neon,+vfp4 | FileCheck %s --check-prefix=THUMB
2
3 @ ARM: vfma.f64 d16, d18, d17 @ encoding: [0xa1,0x0b,0xe2,0xee]
4 @ THUMB: vfma.f64 d16, d18, d17 @ encoding: [0xe2,0xee,0xa1,0x0b]
5 vfma.f64 d16, d18, d17
6
7 @ ARM: vfma.f32 s2, s4, s0 @ encoding: [0x00,0x1a,0xa2,0xee]
8 @ THUMB: vfma.f32 s2, s4, s0 @ encoding: [0xa2,0xee,0x00,0x1a]
9 vfma.f32 s2, s4, s0
10
11 @ ARM: vfma.f32 d16, d18, d17 @ encoding: [0xb1,0x0c,0x42,0xf2]
12 @ THUMB: vfma.f32 d16, d18, d17 @ encoding: [0x42,0xef,0xb1,0x0c]
13 vfma.f32 d16, d18, d17
14
15 @ ARM: vfma.f32 q2, q4, q0 @ encoding: [0x50,0x4c,0x08,0xf2]
16 @ THUMB: vfma.f32 q2, q4, q0 @ encoding: [0x08,0xef,0x50,0x4c]
17 vfma.f32 q2, q4, q0
18
19 @ ARM: vfnma.f64 d16, d18, d17 @ encoding: [0xe1,0x0b,0xd2,0xee]
20 @ THUMB: vfnma.f64 d16, d18, d17 @ encoding: [0xd2,0xee,0xe1,0x0b]
21 vfnma.f64 d16, d18, d17
22
23 @ ARM: vfnma.f32 s2, s4, s0 @ encoding: [0x40,0x1a,0x92,0xee]
24 @ THUMB: vfnma.f32 s2, s4, s0 @ encoding: [0x92,0xee,0x40,0x1a]
25 vfnma.f32 s2, s4, s0
26
27 @ ARM: vfms.f64 d16, d18, d17 @ encoding: [0xe1,0x0b,0xe2,0xee]
28 @ THUMB: vfms.f64 d16, d18, d17 @ encoding: [0xe2,0xee,0xe1,0x0b]
29 vfms.f64 d16, d18, d17
30
31 @ ARM: vfms.f32 s2, s4, s0 @ encoding: [0x40,0x1a,0xa2,0xee]
32 @ THUMB: vfms.f32 s2, s4, s0 @ encoding: [0xa2,0xee,0x40,0x1a]
33 vfms.f32 s2, s4, s0
34
35 @ ARM: vfms.f32 d16, d18, d17 @ encoding: [0xb1,0x0c,0x62,0xf2]
36 @ THUMB: vfms.f32 d16, d18, d17 @ encoding: [0x62,0xef,0xb1,0x0c]
37 vfms.f32 d16, d18, d17
38
39 @ ARM: vfms.f32 q2, q4, q0 @ encoding: [0x50,0x4c,0x28,0xf2]
40 @ THUMB: vfms.f32 q2, q4, q0 @ encoding: [0x28,0xef,0x50,0x4c]
41 vfms.f32 q2, q4, q0
42
43 @ ARM: vfnms.f64 d16, d18, d17 @ encoding: [0xa1,0x0b,0xd2,0xee]
44 @ THUMB: vfnms.f64 d16, d18, d17 @ encoding: [0xd2,0xee,0xa1,0x0b]
45 vfnms.f64 d16, d18, d17
46
47 @ ARM: vfnms.f32 s2, s4, s0 @ encoding: [0x00,0x1a,0x92,0xee]
48 @ THUMB: vfnms.f32 s2, s4, s0 @ encoding: [0x92,0xee,0x00,0x1a]
49 vfnms.f32 s2, s4, s0
0 # RUN: llvm-mc < %s -triple thumbv7-unknown-unknown --disassemble -mattr=+neon,+vfp4 | FileCheck %s
1
2 # CHECK: vfma.f64 d16, d18, d17
3 0xe2 0xee 0xa1 0x0b
4
5 # CHECK: vfma.f32 s2, s4, s0
6 0xa2 0xee 0x00 0x1a
7
8 # CHECK: vfma.f32 d16, d18, d17
9 0x42 0xef 0xb1 0x0c
10
11 # CHECK: vfma.f32 q2, q4, q0
12 0x08 0xef 0x50 0x4c
13
14 # CHECK: vfnms.f64 d16, d18, d17
15 0xd2 0xee 0xa1 0x0b
16
17 # CHECK: vfnms.f32 s2, s4, s0
18 0x92 0xee 0x00 0x1a
19
20 # CHECK: vfms.f64 d16, d18, d17
21 0xe2 0xee 0xe1 0x0b
22
23 # CHECK: vfms.f32 s2, s4, s0
24 0xa2 0xee 0x40 0x1a
25
26 # CHECK: vfms.f32 d16, d18, d17
27 0x62 0xef 0xb1 0x0c
28
29 # CHECK: vfms.f32 q2, q4, q0
30 0x28 0xef 0x50 0x4c
31
32 # CHECK: vfnma.f64 d16, d18, d17
33 0xd2 0xee 0xe1 0x0b
34
35 # CHECK: vfnma.f32 s2, s4, s0
36 0x92 0xee 0x40 0x1a