[ARM] Armv8.2-A FP16 code generation (part 1/3)

This is the groundwork for Armv8.2-A FP16 code generation. Clang passes and returns _Float16 values as floats, together with the required bitconverts and truncs etc. to implement correct AAPCS behaviour; see D42318. We will implement half-precision argument passing/returning lowering in the ARM backend soon, but for now this means that this:

  _Float16 sub(_Float16 a, _Float16 b) {
    return a + b;
  }

gets lowered to this:

  define float @sub(float %a.coerce, float %b.coerce) {
  entry:
    %0 = bitcast float %a.coerce to i32
    %tmp.0.extract.trunc = trunc i32 %0 to i16
    %1 = bitcast i16 %tmp.0.extract.trunc to half
    <SNIP>
    %add = fadd half %1, %3
    <SNIP>
  }

When FullFP16 is *not* supported, we don't make f16 a legal type, and we get legalization for "free": nothing changes, everything works as before, and f16 argument passing/returning is handled too. When FullFP16 is supported, we do make f16 a legal type, and there are two places we need to patch up: f16 argument passing and returning, which involves minor tweaks to avoid unnecessary code generation for some bitcasts.

As a "demonstrator" that this works for the different FP16, FullFP16, and softfp modes, I've added match rules to the VSUB instruction description, showing that we can codegen this instruction from IR, and, more importantly, also to some conversion instructions, which were causing issues before in the FP16 and FullFP16 cases. I've also added match rules to the VLDRH and VSTRH descriptions, so that we can actually compile the entire half-precision sub code example above. This showed that these loads and stores had the wrong addressing mode specified: AddrMode5 instead of AddrMode5FP16, which turned out not to be implemented at all, so that has been added as well.

This is the minimal patch that shows all the different moving parts. In patch 2/3 I will add some efficient lowering of bitcasts, and in patch 3/3 I will add the remaining Armv8.2-A FP16 instruction descriptions.

Thanks to Sam Parker and Oliver Stannard for their help and reviews!

Differential Revision: https://reviews.llvm.org/D38315

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@323512 91177308-0d34-0410-b5e6-96231b3b80d8

Sjoerd Meijer
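To make the AAPCS coercion concrete, here is a minimal C++ sketch of the bit-level contract that the bitcast/trunc/zext chain implements: the coerced f32 simply carries the raw IEEE 754 binary16 pattern of the half value in its low 16 bits. The helper names are illustrative, not part of the patch.

  #include <cstdint>
  #include <cstdio>
  #include <cstring>

  // Illustrative sketch: what the IR coercion chain does to the bits.
  static uint16_t extractHalfBits(float Coerced) {
    uint32_t Bits;
    std::memcpy(&Bits, &Coerced, sizeof(Bits)); // bitcast float to i32
    return static_cast<uint16_t>(Bits);         // trunc i32 to i16
  }

  static float insertHalfBits(uint16_t HalfBits) {
    uint32_t Bits = HalfBits;                      // zext i16 to i32
    float Coerced;
    std::memcpy(&Coerced, &Bits, sizeof(Coerced)); // bitcast i32 to float
    return Coerced;
  }

  int main() {
    // 0x3C00 is 1.0 in binary16; round-trip it through the coerced f32.
    float Coerced = insertHalfBits(0x3C00);
    std::printf("0x%04X\n",
                static_cast<unsigned>(extractHalfBits(Coerced))); // 0x3C00
  }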
11 changed files with 242 additions and 32 deletions.
24082408 NumBits = 8;
24092409 Scale = 4;
24102410 break;
2411 case ARMII::AddrMode5FP16:
2412 ImmIdx = FrameRegIdx+1;
2413 InstrOffs = ARM_AM::getAM5Offset(MI.getOperand(ImmIdx).getImm());
2414 if (ARM_AM::getAM5Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
2415 InstrOffs *= -1;
2416 NumBits = 8;
2417 Scale = 2;
2418 break;
24112419 default:
24122420 llvm_unreachable("Unsupported addressing mode!");
24132421 }
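For reference, a small model of the arithmetic this new case performs, assuming the ARM_AM packing used by getAM5Opc/getAM5Offset/getAM5Op (an add/sub bit above an 8-bit offset count); the helper names below are illustrative:

  #include <cassert>
  #include <cstdio>

  enum AddrOpc { Add, Sub };

  // Pack an add/sub flag and an 8-bit scaled offset count, as in
  // ARM_AM::getAM5Opc (an assumption about the encoding, see
  // ARMAddressingModes.h).
  static unsigned packAM5(AddrOpc Op, unsigned Imm8) {
    assert(Imm8 < 256 && "offset count must fit in 8 bits");
    return (static_cast<unsigned>(Op == Sub) << 8) | Imm8;
  }

  // Recover the byte offset: the 8-bit count times the scale, which is 4
  // for AddrMode5 (.32/.64 VLDR/VSTR) but 2 for AddrMode5FP16 (.16).
  static int byteOffset(unsigned AM5Opc, int Scale) {
    int Offs = static_cast<int>(AM5Opc & 0xFF) * Scale;
    return ((AM5Opc >> 8) & 1) ? -Offs : Offs;
  }

  int main() {
    unsigned Opc = packAM5(Sub, 3);
    std::printf("%d %d\n", byteOffset(Opc, 4), byteOffset(Opc, 2)); // -12 -6
  }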
186186
187187 CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>,
188188 CCIfType<[f32], CCBitConvertToType<i32>>,
189
189190 CCDelegateTo<RetCC_ARM_AAPCS_Common>
190191 ]>;
191192
232233 CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
233234 CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
234235 CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
235 S9, S10, S11, S12, S13, S14, S15]>>,
236 S9, S10, S11, S12, S13, S14, S15]>>,
236237 CCDelegateTo<RetCC_ARM_AAPCS_Common>
237238 ]>;
238239
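A toy model of what these return-value rows mean, under the assumption that CCAssignToReg hands out the next free register from its list in order; the sketch deliberately ignores the S/D/Q register aliasing the real machinery tracks. Note there is no f16 row yet, which is why the backend patches up f16 returns itself:

  #include <cstdio>
  #include <string>

  // Toy stand-in for the CC_*/RetCC_* TableGen machinery above.
  struct State { int NextS = 0, NextD = 0, NextQ = 0; };

  static std::string assignReturnReg(State &St, const std::string &Ty) {
    if (Ty == "v2f64" && St.NextQ < 4)  return "Q" + std::to_string(St.NextQ++);
    if (Ty == "f64"   && St.NextD < 8)  return "D" + std::to_string(St.NextD++);
    if (Ty == "f32"   && St.NextS < 16) return "S" + std::to_string(St.NextS++);
    return "<delegate to the common convention>"; // the CCDelegateTo row
  }

  int main() {
    State St;
    std::printf("%s\n", assignReturnReg(St, "f32").c_str()); // S0
    std::printf("%s\n", assignReturnReg(St, "f32").c_str()); // S1
    std::printf("%s\n", assignReturnReg(St, "f64").c_str()); // D0
  }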
117117 SDValue &Offset, SDValue &Opc);
118118 bool SelectAddrMode3Offset(SDNode *Op, SDValue N,
119119 SDValue &Offset, SDValue &Opc);
120 bool SelectAddrMode5(SDValue N, SDValue &Base,
121 SDValue &Offset);
120 bool IsAddressingMode5(SDValue N, SDValue &Base, SDValue &Offset,
121 int Lwb, int Upb, bool FP16);
122 bool SelectAddrMode5(SDValue N, SDValue &Base, SDValue &Offset);
123 bool SelectAddrMode5FP16(SDValue N, SDValue &Base, SDValue &Offset);
122124 bool SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr,SDValue &Align);
123125 bool SelectAddrMode6Offset(SDNode *Op, SDValue N, SDValue &Offset);
124126
885887 return true;
886888 }
887889
888 bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N,
889 SDValue &Base, SDValue &Offset) {
890 bool ARMDAGToDAGISel::IsAddressingMode5(SDValue N, SDValue &Base, SDValue &Offset,
891 int Lwb, int Upb, bool FP16) {
890892 if (!CurDAG->isBaseWithConstantOffset(N)) {
891893 Base = N;
892894 if (N.getOpcode() == ISD::FrameIndex) {
906908
907909 // If the RHS is +/- imm8, fold into addr mode.
908910 int RHSC;
909 if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/4,
910 -256 + 1, 256, RHSC)) {
911 const int Scale = FP16 ? 2 : 4;
912
913 if (isScaledConstantInRange(N.getOperand(1), Scale, Lwb, Upb, RHSC)) {
911914 Base = N.getOperand(0);
912915 if (Base.getOpcode() == ISD::FrameIndex) {
913916 int FI = cast<FrameIndexSDNode>(Base)->getIndex();
920923 AddSub = ARM_AM::sub;
921924 RHSC = -RHSC;
922925 }
923 Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(AddSub, RHSC),
926
927 if (FP16)
928 Offset = CurDAG->getTargetConstant(ARM_AM::getAM5FP16Opc(AddSub, RHSC),
929 SDLoc(N), MVT::i32);
930 else
931 Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(AddSub, RHSC),
932 SDLoc(N), MVT::i32);
933
934 return true;
935 }
936
937 Base = N;
938
939 if (FP16)
940 Offset = CurDAG->getTargetConstant(ARM_AM::getAM5FP16Opc(ARM_AM::add, 0),
924941 SDLoc(N), MVT::i32);
925 return true;
926 }
927
928 Base = N;
929 Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0),
930 SDLoc(N), MVT::i32);
942 else
943 Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0),
944 SDLoc(N), MVT::i32);
945
931946 return true;
947 }
948
949 bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N,
950 SDValue &Base, SDValue &Offset) {
951 int Lwb = -256 + 1;
952 int Upb = 256;
953 return IsAddressingMode5(N, Base, Offset, Lwb, Upb, /*FP16=*/ false);
954 }
955
956 bool ARMDAGToDAGISel::SelectAddrMode5FP16(SDValue N,
957 SDValue &Base, SDValue &Offset) {
958 int Lwb = -512 + 1;
959 int Upb = 512;
960 return IsAddressingMode5(N, Base, Offset, Lwb, Upb, /*FP16=*/ true);
932961 }
933962
934963 bool ARMDAGToDAGISel::SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr,
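The refactored IsAddressingMode5 leans on isScaledConstantInRange, which is not shown in this hunk. A sketch of the check it performs, under the assumption that it divides the byte offset by Scale and range-checks the quotient against a half-open interval:

  #include <cstdio>

  // Sketch of the helper's logic; the real version also verifies that the
  // SDValue is actually a constant node.
  static bool isScaledConstantInRange(int Offset, int Scale, int Lwb, int Upb,
                                      int &Scaled) {
    if (Offset % Scale != 0)
      return false; // not representable: must be a multiple of the scale
    Scaled = Offset / Scale;
    return Scaled >= Lwb && Scaled < Upb;
  }

  int main() {
    int RHSC;
    // AddrMode5: scale 4, counts in [-255, 256), so bytes up to +/-1020.
    std::printf("%d\n", isScaledConstantInRange(1020, 4, -255, 256, RHSC)); // 1
    // AddrMode5FP16: scale 2, so an odd byte offset is rejected.
    std::printf("%d\n", isScaledConstantInRange(511, 2, -511, 512, RHSC));  // 0
  }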
521521 addRegisterClass(MVT::f64, &ARM::DPRRegClass);
522522 }
523523
524 if (Subtarget->hasFullFP16()) {
525 addRegisterClass(MVT::f16, &ARM::HPRRegClass);
526 // Clean up bitcast of incoming arguments if hard float abi is enabled.
527 if (Subtarget->isTargetHardFloat())
528 setOperationAction(ISD::BITCAST, MVT::i16, Custom);
529 }
530
524531 for (MVT VT : MVT::vector_valuetypes()) {
525532 for (MVT InnerVT : MVT::vector_valuetypes()) {
526533 setTruncStoreAction(VT, InnerVT, Expand);
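This hunk is where the commit message's central switch lives: f16 only becomes a legal type with +fullfp16. A toy model of the consequence (not the real TargetLowering API):

  #include <cstdio>

  // Toy model: with +fullfp16 an f16 fadd is selected directly (VADDH);
  // otherwise the generic legalizer promotes it to f32, which is the
  // "legalization for free" case from the commit message.
  enum class Action { Legal, Promote };

  static Action actionForF16(bool HasFullFP16) {
    return HasFullFP16 ? Action::Legal : Action::Promote;
  }

  int main() {
    std::printf("+fullfp16: %s\n", actionForF16(true) == Action::Legal
                                       ? "select vadd.f16" : "promote to f32");
    std::printf("-fullfp16: %s\n", actionForF16(false) == Action::Legal
                                       ? "select vadd.f16" : "promote to f32");
  }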
24732480 assert(VA.isRegLoc() && "Can only return in registers!");
24742481
24752482 SDValue Arg = OutVals[realRVLocIdx];
2483 bool ReturnF16 = false;
2484
2485 if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) {
2486 // Half-precision return values can be returned like this:
2487 //
2488 // t11 f16 = fadd ...
2489 // t12: i16 = bitcast t11
2490 // t13: i32 = zero_extend t12
2491 // t14: f32 = bitcast t13
2492 //
2493 // to avoid code generation for bitcasts, we simply set Arg to the node
2494 // that produces the f16 value, t11 in this case.
2495 //
2496 if (Arg.getValueType() == MVT::f32) {
2497 SDValue ZE = Arg.getOperand(0);
2498 if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
2499 SDValue BC = ZE.getOperand(0);
2500 if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
2501 Arg = BC.getOperand(0);
2502 ReturnF16 = true;
2503 }
2504 }
2505 }
2506 }
24762507
24772508 switch (VA.getLocInfo()) {
24782509 default: llvm_unreachable("Unknown loc info!");
24792510 case CCValAssign::Full: break;
24802511 case CCValAssign::BCvt:
2481 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2512 if (!ReturnF16)
2513 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
24822514 break;
24832515 }
24842516
25262558 // Guarantee that all emitted copies are
25272559 // stuck together, avoiding something bad.
25282560 Flag = Chain.getValue(1);
2529 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2561 RetOps.push_back(DAG.getRegister(VA.getLocReg(),
2562 ReturnF16 ? MVT::f16 : VA.getLocVT()));
25302563 }
25312564 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
25322565 const MCPhysReg *I =
36833716 } else {
36843717 const TargetRegisterClass *RC;
36853718
3686 if (RegVT == MVT::f32)
3719
3720 if (RegVT == MVT::f16)
3721 RC = &ARM::HPRRegClass;
3722 else if (RegVT == MVT::f32)
36873723 RC = &ARM::SPRRegClass;
36883724 else if (RegVT == MVT::f64)
36893725 RC = &ARM::DPRRegClass;
50235059 // source or destination of the bit convert.
50245060 EVT SrcVT = Op.getValueType();
50255061 EVT DstVT = N->getValueType(0);
5062
5063 // Half-precision arguments can be passed in like this:
5064 //
5065 // t4: f32,ch = CopyFromReg t0, Register:f32 %1
5066 // t8: i32 = bitcast t4
5067 // t9: i16 = truncate t8
5068 // t10: f16 = bitcast t9 <~~~~ SDNode N
5069 //
5070 // but we want to avoid code generation for the bitcast, so transform this
5071 // into:
5072 //
5073 // t18: f16 = CopyFromReg t0, Register:f32 %0
5074 //
5075 if (SrcVT == MVT::i16 && DstVT == MVT::f16) {
5076 if (Op.getOpcode() != ISD::TRUNCATE)
5077 return SDValue();
5078
5079 SDValue Bitcast = Op.getOperand(0);
5080 if (Bitcast.getOpcode() != ISD::BITCAST ||
5081 Bitcast.getValueType() != MVT::i32)
5082 return SDValue();
5083
5084 SDValue Copy = Bitcast.getOperand(0);
5085 if (Copy.getOpcode() != ISD::CopyFromReg ||
5086 Copy.getValueType() != MVT::f32)
5087 return SDValue();
5088
5089 SDValue Ops[] = { Copy->getOperand(0), Copy->getOperand(1) };
5090 return DAG.getNode(ISD::CopyFromReg, SDLoc(Copy), MVT::f16, Ops);
5091 }
5092
50265093 assert((SrcVT == MVT::i64 || DstVT == MVT::i64) &&
50275094 "ExpandBITCAST called for non-i64 type");
50285095
107107 def AddrModeT2_pc : AddrMode<14>;
108108 def AddrModeT2_i8s4 : AddrMode<15>;
109109 def AddrMode_i12 : AddrMode<16>;
110 def AddrMode5FP16 : AddrMode<17>;
110111
111112 // Load / store index mode.
112113 class IndexMode<bits<2> val> {
15261527 class AHI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
15271528 InstrItinClass itin,
15281529 string opc, string asm, list<dag> pattern>
1529 : VFPI<oops, iops, AddrMode5, 4, IndexModeNone,
1530 : VFPI<oops, iops, AddrMode5FP16, 4, IndexModeNone,
15301531 VFPLdStFrm, itin, opc, asm, "", pattern> {
15311532 list<Predicate> Predicates = [HasFullFP16];
15321533
6868 let ParserMatchClass = FPImmOperand;
6969 }
7070
71 def alignedload16 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
72 return cast<LoadSDNode>(N)->getAlignment() >= 2;
73 }]>;
74
7175 def alignedload32 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
7276 return cast<LoadSDNode>(N)->getAlignment() >= 4;
77 }]>;
78
79 def alignedstore16 : PatFrag<(ops node:$val, node:$ptr),
80 (store node:$val, node:$ptr), [{
81 return cast<StoreSDNode>(N)->getAlignment() >= 2;
7382 }]>;
7483
7584 def alignedstore32 : PatFrag<(ops node:$val, node:$ptr),
112121 let D = VFPNeonDomain;
113122 }
114123
115 def VLDRH : AHI5<0b1101, 0b01, (outs SPR:$Sd), (ins addrmode5fp16:$addr),
124 def VLDRH : AHI5<0b1101, 0b01, (outs HPR:$Sd), (ins addrmode5fp16:$addr),
116125 IIC_fpLoad16, "vldr", ".16\t$Sd, $addr",
117 []>,
126 [(set HPR:$Sd, (alignedload16 addrmode5fp16:$addr))]>,
118127 Requires<[HasFullFP16]>;
119128
120129 } // End of 'let canFoldAsLoad = 1, isReMaterializable = 1 in'
131140 let D = VFPNeonDomain;
132141 }
133142
134 def VSTRH : AHI5<0b1101, 0b00, (outs), (ins SPR:$Sd, addrmode5fp16:$addr),
143 def VSTRH : AHI5<0b1101, 0b00, (outs), (ins HPR:$Sd, addrmode5fp16:$addr),
135144 IIC_fpStore16, "vstr", ".16\t$Sd, $addr",
136 []>,
145 [(alignedstore16 HPR:$Sd, addrmode5fp16:$addr)]>,
137146 Requires<[HasFullFP16]>;
138147
139148 //===----------------------------------------------------------------------===//
334343
335344 let TwoOperandAliasConstraint = "$Sn = $Sd" in
336345 def VADDH : AHbI<0b11100, 0b11, 0, 0,
337 (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
346 (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
338347 IIC_fpALU16, "vadd", ".f16\t$Sd, $Sn, $Sm",
339 []>,
348 [(set HPR:$Sd, (fadd HPR:$Sn, HPR:$Sm))]>,
340349 Sched<[WriteFPALU32]>;
341350
342351 let TwoOperandAliasConstraint = "$Dn = $Dd" in
359368
360369 let TwoOperandAliasConstraint = "$Sn = $Sd" in
361370 def VSUBH : AHbI<0b11100, 0b11, 1, 0,
362 (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
371 (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
363372 IIC_fpALU16, "vsub", ".f16\t$Sd, $Sn, $Sm",
364 []>,
373 [(set HPR:$Sd, (fsub HPR:$Sn, HPR:$Sm))]>,
365374 Sched<[WriteFPALU32]>;
366375
367376 let TwoOperandAliasConstraint = "$Dn = $Dd" in
657666 let Predicates = [HasVFP2, HasDPVFP];
658667 }
659668
660 // Between half, single and double-precision. For disassembly only.
661
669 // Between half, single and double-precision.
662670 def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
663671 /* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$Sd, $Sm",
664 [/* For disassembly only; pattern left blank */]>,
672 [ /* intentionally left blank, see rule below */ ]>,
665673 Requires<[HasFP16]>,
666674 Sched<[WriteFPCVT]>;
667675
676 def : Pat<(f32 (fpextend HPR:$Sm)),
677 (VCVTBHS (COPY_TO_REGCLASS HPR:$Sm, SPR))>;
678
668679 def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
669680 /* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$Sd, $Sm",
670 [/* For disassembly only; pattern left blank */]>,
681 []>,
671682 Requires<[HasFP16]>,
672683 Sched<[WriteFPCVT]>;
673684
306306 let DiagnosticString = "operand must be a register in range [s0, s31]";
307307 }
308308
309 def HPR : RegisterClass<"ARM", [f16], 32, (sequence "S%u", 0, 31)> {
310 let AltOrders = [(add (decimate HPR, 2), SPR),
311 (add (decimate HPR, 4),
312 (decimate HPR, 2),
313 (decimate (rotl HPR, 1), 4),
314 (decimate (rotl HPR, 1), 2))];
315 let AltOrderSelect = [{
316 return 1 + MF.getSubtarget<ARMSubtarget>().useStride4VFPs(MF);
317 }];
318 let DiagnosticString = "operand must be a register in range [s0, s31]";
319 }
320
309321 // Subset of SPR which can be used as a source of NEON scalars for 16-bit
310322 // operations
311323 def SPR_8 : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 15)> {
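The new HPR class reuses the S registers for f16. A sketch of how its AltOrders expand, assuming the usual TableGen set operators (decimate keeps every Nth element, rotl rotates left by N, add concatenates) and that the allocator collapses repeated registers:

  #include <algorithm>
  #include <cstdio>
  #include <vector>

  using Order = std::vector<int>;

  static Order hpr() {
    Order O(32);
    for (int I = 0; I < 32; ++I) O[I] = I; // S0..S31
    return O;
  }

  static Order decimate(const Order &In, int N) { // keep every Nth element
    Order Out;
    for (size_t I = 0; I < In.size(); I += N) Out.push_back(In[I]);
    return Out;
  }

  static Order rotl(Order In, int N) { // rotate left by N
    std::rotate(In.begin(), In.begin() + N, In.end());
    return In;
  }

  static Order concat(Order A, const Order &B) {
    A.insert(A.end(), B.begin(), B.end());
    return A;
  }

  int main() {
    // Second alternative order (the stride-4 mode): S0,S4,S8,... first,
    // then the stride-2 evens, then the rotated odd registers.
    Order Alt = concat(concat(decimate(hpr(), 4), decimate(hpr(), 2)),
                       concat(decimate(rotl(hpr(), 1), 4),
                              decimate(rotl(hpr(), 1), 2)));
    for (int R : Alt) std::printf("S%d ", R);
    std::printf("\n");
  }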
156156 static DecodeStatus DecoderGPRRegisterClass(MCInst &Inst, unsigned RegNo,
157157 uint64_t Address, const void *Decoder);
158158 static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo,
159 uint64_t Address, const void *Decoder);
160 static DecodeStatus DecodeHPRRegisterClass(MCInst &Inst, unsigned RegNo,
159161 uint64_t Address, const void *Decoder);
160162 static DecodeStatus DecodeSPRRegisterClass(MCInst &Inst, unsigned RegNo,
161163 uint64_t Address, const void *Decoder);
993995 unsigned Register = SPRDecoderTable[RegNo];
994996 Inst.addOperand(MCOperand::createReg(Register));
995997 return MCDisassembler::Success;
998 }
999
1000 static DecodeStatus DecodeHPRRegisterClass(MCInst &Inst, unsigned RegNo,
1001 uint64_t Address, const void *Decoder) {
1002 return DecodeSPRRegisterClass(Inst, RegNo, Address, Decoder);
9961003 }
9971004
9981005 static const uint16_t DPRDecoderTable[] = {
185185 AddrModeT2_so = 13,
186186 AddrModeT2_pc = 14, // +/- i12 for pc relative data
187187 AddrModeT2_i8s4 = 15, // i8 * 4
188 AddrMode_i12 = 16
188 AddrMode_i12 = 16,
189 AddrMode5FP16 = 17 // i8 * 2
189190 };
190191
191192 inline static const char *AddrModeToString(AddrMode addrmode) {
196197 case AddrMode3: return "AddrMode3";
197198 case AddrMode4: return "AddrMode4";
198199 case AddrMode5: return "AddrMode5";
200 case AddrMode5FP16: return "AddrMode5FP16";
199201 case AddrMode6: return "AddrMode6";
200202 case AddrModeT1_1: return "AddrModeT1_1";
201203 case AddrModeT1_2: return "AddrModeT1_2";
4242 }
4343
4444 define half @test_half(half %a, half %b) {
45 ; CHECK: remark: {{.*}} unable to lower arguments: half (half, half)*
45 ; CHECK: remark: {{.*}} unable to lower arguments: half (half, half)* (in function: test_half)
4646 ; CHECK-LABEL: warning: Instruction selection used fallback path for test_half
4747 %res = fadd half %a, %b
4848 ret half %res
0 ; SOFT:
1 ; RUN: llc < %s -mtriple=arm-none-eabi -float-abi=soft | FileCheck %s --check-prefix=CHECK-SOFT
2
3 ; SOFTFP:
4 ; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+vfp3 | FileCheck %s --check-prefix=CHECK-SOFTFP-VFP3
5 ; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+vfp4 | FileCheck %s --check-prefix=CHECK-SOFTFP-FP16
6 ; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+fullfp16 | FileCheck %s --check-prefix=CHECK-SOFTFP-FULLFP16
7
8 ; HARD:
9 ; RUN: llc < %s -mtriple=arm-none-eabihf -mattr=+vfp3 | FileCheck %s --check-prefix=CHECK-HARDFP-VFP3
10 ; RUN: llc < %s -mtriple=arm-none-eabihf -mattr=+vfp4 | FileCheck %s --check-prefix=CHECK-HARDFP-FP16
11 ; RUN: llc < %s -mtriple=arm-none-eabihf -mattr=+fullfp16 | FileCheck %s --check-prefix=CHECK-HARDFP-FULLFP16
12
13 define float @Add(float %a.coerce, float %b.coerce) local_unnamed_addr {
14 entry:
15 %0 = bitcast float %a.coerce to i32
16 %tmp.0.extract.trunc = trunc i32 %0 to i16
17 %1 = bitcast i16 %tmp.0.extract.trunc to half
18 %2 = bitcast float %b.coerce to i32
19 %tmp1.0.extract.trunc = trunc i32 %2 to i16
20 %3 = bitcast i16 %tmp1.0.extract.trunc to half
21 %add = fadd half %1, %3
22 %4 = bitcast half %add to i16
23 %tmp4.0.insert.ext = zext i16 %4 to i32
24 %5 = bitcast i32 %tmp4.0.insert.ext to float
25 ret float %5
26
27 ; CHECK-SOFT: bl __aeabi_h2f
28 ; CHECK-SOFT: bl __aeabi_h2f
29 ; CHECK-SOFT: bl __aeabi_fadd
30 ; CHECK-SOFT: bl __aeabi_f2h
31
32 ; CHECK-SOFTFP-VFP3: bl __aeabi_h2f
33 ; CHECK-SOFTFP-VFP3: bl __aeabi_h2f
34 ; CHECK-SOFTFP-VFP3: vadd.f32
35 ; CHECK-SOFTFP-VFP3: bl __aeabi_f2h
36
37 ; CHECK-SOFTFP-FP16: vmov [[S2:s[0-9]]], r1
38 ; CHECK-SOFTFP-FP16: vmov [[S0:s[0-9]]], r0
39 ; CHECK-SOFTFP-FP16: vcvtb.f32.f16 [[S2]], [[S2]]
40 ; CHECK-SOFTFP-FP16: vcvtb.f32.f16 [[S0]], [[S0]]
41 ; CHECK-SOFTFP-FP16: vadd.f32 [[S0]], [[S0]], [[S2]]
42 ; CHECK-SOFTFP-FP16: vcvtb.f16.f32 [[S0]], [[S0]]
43 ; CHECK-SOFTFP-FP16: vmov r0, s0
44
45 ; CHECK-SOFTFP-FULLFP16: strh r1, {{.*}}
46 ; CHECK-SOFTFP-FULLFP16: strh r0, {{.*}}
47 ; CHECK-SOFTFP-FULLFP16: vldr.16 [[S0:s[0-9]]], {{.*}}
48 ; CHECK-SOFTFP-FULLFP16: vldr.16 [[S2:s[0-9]]], {{.*}}
49 ; CHECK-SOFTFP-FULLFP16: vadd.f16 [[S0]], [[S2]], [[S0]]
50 ; CHECK-SOFTFP-FULLFP16: vstr.16 [[S2:s[0-9]]], {{.*}}
51 ; CHECK-SOFTFP-FULLFP16: ldrh r0, {{.*}}
52 ; CHECK-SOFTFP-FULLFP16: mov pc, lr
53
54 ; CHECK-HARDFP-VFP3: vmov r{{.}}, s0
55 ; CHECK-HARDFP-VFP3: vmov{{.*}}, s1
56 ; CHECK-HARDFP-VFP3: bl __aeabi_h2f
57 ; CHECK-HARDFP-VFP3: bl __aeabi_h2f
58 ; CHECK-HARDFP-VFP3: vadd.f32
59 ; CHECK-HARDFP-VFP3: bl __aeabi_f2h
60 ; CHECK-HARDFP-VFP3: vmov s0, r0
61
62 ; CHECK-HARDFP-FP16: vcvtb.f32.f16 [[S2:s[0-9]]], s1
63 ; CHECK-HARDFP-FP16: vcvtb.f32.f16 [[S0:s[0-9]]], s0
64 ; CHECK-HARDFP-FP16: vadd.f32 [[S0]], [[S0]], [[S2]]
65 ; CHECK-HARDFP-FP16: vcvtb.f16.f32 [[S0]], [[S0]]
66
67 ; CHECK-HARDFP-FULLFP16: vadd.f16 s0, s0, s1
68 ; CHECK-HARDFP-FULLFP16-NEXT: mov pc, lr
69
70 }
71