commit 9027123 (trunk r286753)
Author: Konstantin Zhuravlyov

[AMDGPU] Add f16 support (VI+)

Differential Revision: https://reviews.llvm.org/D25975

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@286753 91177308-0d34-0410-b5e6-96231b3b80d8
60 changed files with 4744 additions and 279 deletions.
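For orientation, a minimal LLVM IR sketch of what the change enables (function and value names here are illustrative, not from the patch): on VI and later (e.g. -mcpu=fiji), half arithmetic such as the fadd below should now select the native 16-bit VALU form (v_add_f16), while pre-VI targets keep promoting through f32 with v_cvt_f32_f16 / v_cvt_f16_f32 round trips.

define half @f16_add(half %a, half %b) {
  %sum = fadd half %a, %b   ; VI: expected to select v_add_f16
  ret half %sum
}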
185185 //===------------------------------------------------------------===//
186186 // Subtarget Features (options and debugging)
187187 //===------------------------------------------------------------===//
188
189 def FeatureFP16Denormals : SubtargetFeature<"fp16-denormals",
190 "FP16Denormals",
191 "true",
192 "Enable half precision denormal handling"
193 >;
188194
189195 // Some instructions do not support denormals despite this flag. Using
190196 // fp32 denormals also causes instructions to run at the double
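A hedged sketch of how the new feature might be requested: like the existing fp32/fp64 denormal features, fp16-denormals should be controllable with -mattr=+fp16-denormals or a per-function "target-features" attribute (the function below is illustrative). Note that, per the lowering changes further down, enabling it keeps ISD::FMAD illegal for f16, so mul+add pairs are not combined into v_mac_f16.

; Assumed usage: request half-precision denormal handling for this function (VI+).
define half @keep_f16_denorms(half %a, half %b) #0 {
  %p = fmul half %a, %b
  ret half %p
}
attributes #0 = { "target-features"="+fp16-denormals" }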
562562
563563 bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
564564 assert(VT.isFloatingPoint());
565 return VT == MVT::f32 || VT == MVT::f64;
565 return VT == MVT::f32 || VT == MVT::f64 || (Subtarget->has16BitInsts() &&
566 VT == MVT::f16);
566567 }
567568
568569 bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT,
19261927 assert(Op.getOperand(0).getValueType() == MVT::i64 &&
19271928 "operation should be legal");
19281929
1930 // TODO: Factor out code common with LowerSINT_TO_FP.
1931
19291932 EVT DestVT = Op.getValueType();
1933 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
1934 SDLoc DL(Op);
1935 SDValue Src = Op.getOperand(0);
1936
1937 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
1938 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
1939 SDValue FPRound =
1940 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
1941
1942 return FPRound;
1943 }
19301944
19311945 if (DestVT == MVT::f32)
19321946 return LowerINT_TO_FP32(Op, DAG, false);
19401954 assert(Op.getOperand(0).getValueType() == MVT::i64 &&
19411955 "operation should be legal");
19421956
1957 // TODO: Factor out code common with LowerUINT_TO_FP.
1958
19431959 EVT DestVT = Op.getValueType();
1960 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
1961 SDLoc DL(Op);
1962 SDValue Src = Op.getOperand(0);
1963
1964 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
1965 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
1966 SDValue FPRound =
1967 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
1968
1969 return FPRound;
1970 }
1971
19441972 if (DestVT == MVT::f32)
19451973 return LowerINT_TO_FP32(Op, DAG, true);
19461974
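Both 64-bit integer-to-f16 paths above lower the same way: reuse the existing i64-to-f32 lowering, then FP_ROUND the f32 result to f16. In IR terms, a conversion like the following (names illustrative) becomes a 32-bit convert followed by a round:

define half @u64_to_f16(i64 %x) {
  ; lowered roughly as: uitofp i64 -> float, then fptrunc float -> half
  %r = uitofp i64 %x to half
  ret half %r
}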
20762104 SelectionDAG &DAG) const {
20772105 SDValue Src = Op.getOperand(0);
20782106
2107 // TODO: Factor out code common with LowerFP_TO_UINT.
2108
2109 EVT SrcVT = Src.getValueType();
2110 if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
2111 SDLoc DL(Op);
2112
2113 SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
2114 SDValue FpToInt32 =
2115 DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
2116
2117 return FpToInt32;
2118 }
2119
20792120 if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
20802121 return LowerFP64_TO_INT(Op, DAG, true);
20812122
20852126 SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op,
20862127 SelectionDAG &DAG) const {
20872128 SDValue Src = Op.getOperand(0);
2129
2130 // TODO: Factor out code common with LowerFP_TO_SINT.
2131
2132 EVT SrcVT = Src.getValueType();
2133 if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
2134 SDLoc DL(Op);
2135
2136 SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
2137 SDValue FpToInt32 =
2138 DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
2139
2140 return FpToInt32;
2141 }
20882142
20892143 if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
20902144 return LowerFP64_TO_INT(Op, DAG, false);
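The f16-to-i64 direction is the mirror image: FP_EXTEND the f16 source to f32, then reuse the f32-to-i64 lowering. A sketch (names illustrative):

define i64 @f16_to_s64(half %x) {
  ; lowered roughly as: fpext half -> float, then fptosi float -> i64
  %r = fptosi half %x to i64
  ret i64 %r
}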
4141 field bits<32> Inst = 0xffffffff;
4242 }
4343
44 def FP16Denormals : Predicate<"Subtarget.hasFP16Denormals()">;
4445 def FP32Denormals : Predicate<"Subtarget.hasFP32Denormals()">;
4546 def FP64Denormals : Predicate<"Subtarget.hasFP64Denormals()">;
4647 def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">;
5555 // denormals, but should be checked. Should we issue a warning somewhere
5656 // if someone tries to enable these?
5757 if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
58 FP16Denormals = false;
5859 FP32Denormals = false;
5960 FP64Denormals = false;
6061 }
8081 FastFMAF32(false),
8182 HalfRate64Ops(false),
8283
84 FP16Denormals(false),
8385 FP32Denormals(false),
8486 FP64Denormals(false),
8587 FPExceptions(false),
7474 bool HalfRate64Ops;
7575
7676 // Dynamically set bits that enable features.
77 bool FP16Denormals;
7778 bool FP32Denormals;
7879 bool FP64Denormals;
7980 bool FPExceptions;
269270 /// the given LDS memory size is the only constraint.
270271 unsigned getOccupancyWithLocalMemSize(uint32_t Bytes) const;
271272
273 bool hasFP16Denormals() const {
274 return FP16Denormals;
275 }
272276
273277 bool hasFP32Denormals() const {
274278 return FP32Denormals;
13671367 getForcedEncodingSize() != 64)
13681368 return Match_PreferE32;
13691369
1370 if (Inst.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
1371 Inst.getOpcode() == AMDGPU::V_MAC_F32_sdwa) {
1370 if (Inst.getOpcode() == AMDGPU::V_MAC_F32_sdwa ||
1371 Inst.getOpcode() == AMDGPU::V_MAC_F16_sdwa) {
13721372 // v_mac_f32/16 allow only dst_sel == DWORD;
1373 auto OpNum = AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::dst_sel);
1373 auto OpNum =
1374 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::dst_sel);
13741375 const auto &Op = Inst.getOperand(OpNum);
13751376 if (!Op.isImm() || Op.getImm() != AMDGPU::SDWA::SdwaSel::DWORD) {
13761377 return Match_InvalidOperand;
27132714 addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI);
27142715 addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI);
27152716
2716 // special case v_mac_f32:
2717 // special case v_mac_{f16, f32}:
27172718 // it has src2 register operand that is tied to dst operand
27182719 // we don't allow modifiers for this operand in assembler so src2_modifiers
27192720 // should be 0
27202721 if (Inst.getOpcode() == AMDGPU::V_MAC_F32_e64_si ||
2721 Inst.getOpcode() == AMDGPU::V_MAC_F32_e64_vi) {
2722 Inst.getOpcode() == AMDGPU::V_MAC_F32_e64_vi ||
2723 Inst.getOpcode() == AMDGPU::V_MAC_F16_e64_vi) {
27222724 auto it = Inst.begin();
2723 std::advance(it, AMDGPU::getNamedOperandIdx(AMDGPU::V_MAC_F32_e64, AMDGPU::OpName::src2_modifiers));
2725 std::advance(
2726 it,
2727 AMDGPU::getNamedOperandIdx(Inst.getOpcode() == AMDGPU::V_MAC_F16_e64_vi ?
2728 AMDGPU::V_MAC_F16_e64 :
2729 AMDGPU::V_MAC_F32_e64,
2730 AMDGPU::OpName::src2_modifiers));
27242731 it = Inst.insert(it, MCOperand::createImm(0)); // no modifiers for src2
27252732 ++it;
27262733 Inst.insert(it, Inst.getOperand(0)); // src2 = dst
28952902 addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBankMask, 0xf);
28962903 addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBoundCtrl);
28972904
2898 // special case v_mac_f32:
2905 // special case v_mac_{f16, f32}:
28992906 // it has src2 register operand that is tied to dst operand
2900 if (Inst.getOpcode() == AMDGPU::V_MAC_F32_dpp) {
2907 if (Inst.getOpcode() == AMDGPU::V_MAC_F32_dpp ||
2908 Inst.getOpcode() == AMDGPU::V_MAC_F16_dpp) {
29012909 auto it = Inst.begin();
2902 std::advance(it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2));
2910 std::advance(
2911 it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2));
29032912 Inst.insert(it, Inst.getOperand(0)); // src2 = dst
29042913 }
29052914 }
30393048 }
30403049 }
30413050
3042 // special case v_mac_f32:
3051 // special case v_mac_{f16, f32}:
30433052 // it has src2 register operand that is tied to dst operand
3044 if (Inst.getOpcode() == AMDGPU::V_MAC_F32_sdwa) {
3053 if (Inst.getOpcode() == AMDGPU::V_MAC_F32_sdwa ||
3054 Inst.getOpcode() == AMDGPU::V_MAC_F16_sdwa) {
30453055 auto it = Inst.begin();
3046 std::advance(it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2));
3056 std::advance(
3057 it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2));
30473058 Inst.insert(it, Inst.getOperand(0)); // src2 = dst
30483059 }
30493060
155155 const SIInstrInfo *TII) {
156156 if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
157157
158 // Special case for v_mac_f32_e64 if we are trying to fold into src2
158 // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
159159 unsigned Opc = MI->getOpcode();
160 if (Opc == AMDGPU::V_MAC_F32_e64 &&
160 if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64) &&
161161 (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
162 // Check if changing this to a v_mad_f32 instruction will allow us to
163 // fold the operand.
164 MI->setDesc(TII->get(AMDGPU::V_MAD_F32));
162 bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
163
164 // Check if changing this to a v_mad_{f16, f32} instruction will allow us
165 // to fold the operand.
166 MI->setDesc(TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16));
165167 bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII);
166168 if (FoldAsMAD) {
167169 MI->untieRegOperand(OpNo);
238240 // make sense. e.g. don't fold:
239241 //
240242 // %vreg1 = COPY %vreg0:sub1
241 // %vreg2 = V_MAC_F32 %vreg3, %vreg4, %vreg1
243 // %vreg2 = V_MAC_{F16, F32} %vreg3, %vreg4, %vreg1
242244 //
243245 // into
244 // %vreg2 = V_MAC_F32 %vreg3, %vreg4, %vreg0:sub1
246 // %vreg2 = V_MAC_{F16, F32} %vreg3, %vreg4, %vreg0:sub1
245247 if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister)
246248 return;
247249 }
7777 addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
7878 addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
7979
80 if (Subtarget->has16BitInsts())
80 if (Subtarget->has16BitInsts()) {
8181 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
82 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
83 }
8284
8385 computeRegisterProperties(STI.getRegisterInfo());
8486
262264
263265 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
264266
265 setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
266 AddPromotedToType(ISD::UINT_TO_FP, MVT::i16, MVT::i32);
267 setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
268 AddPromotedToType(ISD::SINT_TO_FP, MVT::i16, MVT::i32);
269267 setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
270268 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
271269 setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
272270 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
273271
274 setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
275 AddPromotedToType(ISD::FP_TO_SINT, MVT::i16, MVT::i32);
276
277 setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
278 AddPromotedToType(ISD::FP_TO_UINT, MVT::i16, MVT::i32);
272 setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
273 setOperationAction(ISD::FP_TO_UINT, MVT::i16, Custom);
274 setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
275 setOperationAction(ISD::UINT_TO_FP, MVT::i16, Custom);
276
277 // F16 - Constant Actions.
278 setOperationAction(ISD::ConstantFP, MVT::f16, Custom);
279
280 // F16 - Load/Store Actions.
281 setOperationAction(ISD::LOAD, MVT::f16, Promote);
282 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
283 setOperationAction(ISD::STORE, MVT::f16, Promote);
284 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
285
286 // F16 - VOP1 Actions.
287 setOperationAction(ISD::FCOS, MVT::f16, Promote);
288 setOperationAction(ISD::FSIN, MVT::f16, Promote);
289
290 // F16 - VOP2 Actions.
291 setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
292 setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
293 setOperationAction(ISD::FDIV, MVT::f16, Promote);
294
295 // F16 - VOP3 Actions.
296 setOperationAction(ISD::FMA, MVT::f16, Legal);
297 if (!Subtarget->hasFP16Denormals())
298 setOperationAction(ISD::FMAD, MVT::f16, Legal);
279299 }
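A rough reading of the new action table: f16 loads and stores ride the i16 paths, fminnum/fmaxnum/fma map straight onto v_min_f16/v_max_f16/v_fma_f16, and fcos/fsin/fdiv are promoted and performed in f32. For example, an f16 division like the one below (names illustrative) is expected to extend to f32, divide there, and round back:

define half @f16_div(half %a, half %b) {
  %q = fdiv half %a, %b   ; FDIV f16 is Promote: carried out in f32
  ret half %q
}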
280300
281301 setTargetDAGCombine(ISD::FADD);
640660 return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
641661 DAG.getConstant(Offset, SL, PtrVT));
642662 }
663
643664 SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
644665 const SDLoc &SL, SDValue Chain,
645666 unsigned Offset, bool Signed) const {
658679
659680 SDValue Val;
660681 if (MemVT.isFloatingPoint())
661 Val = DAG.getNode(ISD::FP_EXTEND, SL, VT, Load);
682 Val = getFPExtOrFPTrunc(DAG, Load, SL, VT);
662683 else if (Signed)
663684 Val = DAG.getSExtOrTrunc(Load, SL, VT);
664685 else
18011822 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
18021823 case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
18031824 case ISD::TRAP: return lowerTRAP(Op, DAG);
1825
1826 case ISD::ConstantFP:
1827 return lowerConstantFP(Op, DAG);
1828 case ISD::FP_TO_SINT:
1829 case ISD::FP_TO_UINT:
1830 return lowerFpToInt(Op, DAG);
1831 case ISD::SINT_TO_FP:
1832 case ISD::UINT_TO_FP:
1833 return lowerIntToFp(Op, DAG);
18041834 }
18051835 return SDValue();
18061836 }
19922022 Intr->getOperand(0));
19932023
19942024 return Chain;
2025 }
2026
2027 SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG,
2028 SDValue Op,
2029 const SDLoc &DL,
2030 EVT VT) const {
2031 return Op.getValueType().bitsLE(VT) ?
2032 DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
2033 DAG.getNode(ISD::FTRUNC, DL, VT, Op);
2034 }
2035
2036 SDValue SITargetLowering::lowerConstantFP(SDValue Op, SelectionDAG &DAG) const {
2037   if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(Op)) {
2038 return DAG.getConstant(FP->getValueAPF().bitcastToAPInt().getZExtValue(),
2039 SDLoc(Op), MVT::i32);
2040 }
2041
2042 return SDValue();
2043 }
2044
2045 SDValue SITargetLowering::lowerFpToInt(SDValue Op, SelectionDAG &DAG) const {
2046 EVT DstVT = Op.getValueType();
2047 EVT SrcVT = Op.getOperand(0).getValueType();
2048 if (DstVT == MVT::i64) {
2049 return Op.getOpcode() == ISD::FP_TO_SINT ?
2050 AMDGPUTargetLowering::LowerFP_TO_SINT(Op, DAG) :
2051 AMDGPUTargetLowering::LowerFP_TO_UINT(Op, DAG);
2052 }
2053
2054 if (SrcVT == MVT::f16)
2055 return Op;
2056
2057 SDLoc DL(Op);
2058 SDValue OrigSrc = Op.getOperand(0);
2059 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, DL);
2060 SDValue FPRoundSrc =
2061 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, OrigSrc, FPRoundFlag);
2062
2063 return DAG.getNode(Op.getOpcode(), DL, DstVT, FPRoundSrc);
2064 }
2065
2066 SDValue SITargetLowering::lowerIntToFp(SDValue Op, SelectionDAG &DAG) const {
2067 EVT DstVT = Op.getValueType();
2068 EVT SrcVT = Op.getOperand(0).getValueType();
2069 if (SrcVT == MVT::i64) {
2070 return Op.getOpcode() == ISD::SINT_TO_FP ?
2071 AMDGPUTargetLowering::LowerSINT_TO_FP(Op, DAG) :
2072 AMDGPUTargetLowering::LowerUINT_TO_FP(Op, DAG);
2073 }
2074
2075 if (DstVT == MVT::f16)
2076 return Op;
2077
2078 SDLoc DL(Op);
2079 SDValue OrigSrc = Op.getOperand(0);
2080 SDValue SExtOrZExtOrTruncSrc = Op.getOpcode() == ISD::SINT_TO_FP ?
2081 DAG.getSExtOrTrunc(OrigSrc, DL, MVT::i32) :
2082 DAG.getZExtOrTrunc(OrigSrc, DL, MVT::i32);
2083
2084 return DAG.getNode(Op.getOpcode(), DL, DstVT, SExtOrZExtOrTruncSrc);
19952085 }
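The Custom i16 conversion hooks above pick the shortest route to the hardware converts: an f16 operand already matches a single v_cvt, while an f32 operand is first rounded to f16 (and an i16 source is sign- or zero-extended to i32). A sketch of the f32 case (names illustrative):

define i16 @f32_to_u16(float %x) {
  ; expected lowering: round %x to f16, then convert with v_cvt_u16_f16
  %r = fptoui float %x to i16
  ret i16 %r
}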
19962086
19972087 SDValue SITargetLowering::getSegmentAperture(unsigned AS,
35613651 SDValue RHS = N->getOperand(1);
35623652 EVT VT = LHS.getValueType();
35633653
3564 if (VT != MVT::f32 && VT != MVT::f64)
3654 if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() &&
3655 VT != MVT::f16))
35653656 return SDValue();
35663657
35673658 // Match isinf pattern
37053796 //
37063797 // Only do this if we are not trying to support denormals. v_mad_f32 does
37073798 // not support denormals ever.
3708 if (VT == MVT::f32 &&
3709 !Subtarget->hasFP32Denormals()) {
3799 if (VT == MVT::f32 && !Subtarget->hasFP32Denormals()) {
37103800 SDValue LHS = N->getOperand(0);
37113801 SDValue RHS = N->getOperand(1);
37123802 if (LHS.getOpcode() == ISD::FADD) {
4444 SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
4545 SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
4646 SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
47
48 /// \brief Converts \p Op, which must be of floating point type, to the
49 /// floating point type \p VT, by either extending or truncating it.
50 SDValue getFPExtOrFPTrunc(SelectionDAG &DAG,
51 SDValue Op,
52 const SDLoc &DL,
53 EVT VT) const;
54
55 /// \brief Custom lowering for ISD::ConstantFP.
56 SDValue lowerConstantFP(SDValue Op, SelectionDAG &DAG) const;
57
58 /// \brief Custom lowering for ISD::FP_TO_SINT, ISD::FP_TO_UINT.
59 SDValue lowerFpToInt(SDValue Op, SelectionDAG &DAG) const;
60
61 /// \brief Custom lowering for ISD::SINT_TO_FP, ISD::UINT_TO_FP.
62 SDValue lowerIntToFp(SDValue Op, SelectionDAG &DAG) const;
4763
4864 SDValue getSegmentAperture(unsigned AS, SelectionDAG &DAG) const;
4965 SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const;
13851385 return true;
13861386 }
13871387
1388 if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64) {
1388 if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
1389 Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) {
1390 bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
1391
13891392 // Don't fold if we are using source modifiers. The new VOP2 instructions
13901393 // don't have them.
13911394 if (hasModifiersSet(UseMI, AMDGPU::OpName::src0_modifiers) ||
14061409 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
14071410 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
14081411
1409 // Multiplied part is the constant: Use v_madmk_f32
1412 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
14101413 // We should only expect these to be on src0 due to canonicalizations.
14111414 if (Src0->isReg() && Src0->getReg() == Reg) {
14121415 if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
14341437 Src0->setSubReg(Src1SubReg);
14351438 Src0->setIsKill(Src1->isKill());
14361439
1437 if (Opc == AMDGPU::V_MAC_F32_e64) {
1440 if (Opc == AMDGPU::V_MAC_F32_e64 ||
1441 Opc == AMDGPU::V_MAC_F16_e64)
14381442 UseMI.untieRegOperand(
14391443 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
1440 }
14411444
14421445 Src1->ChangeToImmediate(Imm);
14431446
14441447 removeModOperands(UseMI);
1445 UseMI.setDesc(get(AMDGPU::V_MADMK_F32));
1448 UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16));
14461449
14471450 bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
14481451 if (DeleteDef)
14511454 return true;
14521455 }
14531456
1454 // Added part is the constant: Use v_madak_f32
1457 // Added part is the constant: Use v_madak_{f16, f32}.
14551458 if (Src2->isReg() && Src2->getReg() == Reg) {
14561459 // Not allowed to use constant bus for another operand.
14571460 // We can however allow an inline immediate as src0.
14731476 UseMI.RemoveOperand(
14741477 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
14751478
1476 if (Opc == AMDGPU::V_MAC_F32_e64) {
1479 if (Opc == AMDGPU::V_MAC_F32_e64 ||
1480 Opc == AMDGPU::V_MAC_F16_e64)
14771481 UseMI.untieRegOperand(
14781482 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
1479 }
14801483
14811484 // ChangingToImmediate adds Src2 back to the instruction.
14821485 Src2->ChangeToImmediate(Imm);
14831486
14841487 // These come before src2.
14851488 removeModOperands(UseMI);
1486 UseMI.setDesc(get(AMDGPU::V_MADAK_F32));
1489 UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16));
14871490
14881491 bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
14891492 if (DeleteDef)
15921595 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
15931596 MachineInstr &MI,
15941597 LiveVariables *LV) const {
1598 bool IsF16 = false;
15951599
15961600 switch (MI.getOpcode()) {
15971601 default:
15981602 return nullptr;
1603 case AMDGPU::V_MAC_F16_e64:
1604 IsF16 = true;
15991605 case AMDGPU::V_MAC_F32_e64:
16001606 break;
1607 case AMDGPU::V_MAC_F16_e32:
1608 IsF16 = true;
16011609 case AMDGPU::V_MAC_F32_e32: {
16021610 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
16031611 if (Src0->isImm() && !isInlineConstant(*Src0, 4))
16111619 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
16121620 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
16131621
1614 return BuildMI(*MBB, MI, MI.getDebugLoc(), get(AMDGPU::V_MAD_F32))
1622 return BuildMI(*MBB, MI, MI.getDebugLoc(),
1623 get(IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32))
16151624 .addOperand(*Dst)
16161625 .addImm(0) // Src0 mods
16171626 .addOperand(*Src0)
938938 let HasExt = 0;
939939 }
940940
941 // FIXME: I think these F16/I16 profiles will need to use f16/i16 types in order
942 // for the instruction patterns to work.
943941 def VOP_F16_F16 : VOPProfile <[f16, f16, untyped, untyped]>;
944 def VOP_F16_I16 : VOPProfile <[f16, i32, untyped, untyped]>;
945 def VOP_I16_F16 : VOPProfile <[i32, f16, untyped, untyped]>;
942 def VOP_F16_I16 : VOPProfile <[f16, i16, untyped, untyped]>;
943 def VOP_I16_F16 : VOPProfile <[i16, f16, untyped, untyped]>;
946944
947945 def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>;
948 def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i32, untyped]>;
946 def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>;
947 def VOP_F16_F16_I32 : VOPProfile <[f16, f16, i32, untyped]>;
949948 def VOP_I16_I16_I16 : VOPProfile <[i32, i32, i32, untyped]>;
950949
951950 def VOP_I16_I16_I16_I16 : VOPProfile <[i32, i32, i32, i32, untyped]>;
959958 def VOP_F64_F32 : VOPProfile <[f64, f32, untyped, untyped]>;
960959 def VOP_F64_F64 : VOPProfile <[f64, f64, untyped, untyped]>;
961960 def VOP_F64_I32 : VOPProfile <[f64, i32, untyped, untyped]>;
961 def VOP_I32_F16 : VOPProfile <[i32, f16, untyped, untyped]>;
962962 def VOP_I32_F32 : VOPProfile <[i32, f32, untyped, untyped]>;
963963 def VOP_I32_F64 : VOPProfile <[i32, f64, untyped, untyped]>;
964964 def VOP_I32_I32 : VOPProfile <[i32, i32, untyped, untyped]>;
965965
966 def VOP_F32_F32_F16 : VOPProfile <[f32, f32, f16, untyped]>;
966967 def VOP_F32_F32_F32 : VOPProfile <[f32, f32, f32, untyped]>;
967968 def VOP_F32_F32_I32 : VOPProfile <[f32, f32, i32, untyped]>;
968969 def VOP_F64_F64_F64 : VOPProfile <[f64, f64, f64, untyped]>;
975976 def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>;
976977 def VOP_I64_I64_I64 : VOPProfile <[i64, i64, i64, untyped]>;
977978
979 def VOP_F16_F32_F16_F32 : VOPProfile <[f16, f32, f16, f32]>;
980 def VOP_F32_F32_F16_F16 : VOPProfile <[f32, f32, f16, f16]>;
978981 def VOP_F32_F32_F32_F32 : VOPProfile <[f32, f32, f32, f32]>;
979982 def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>;
980983 def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>;
412412
413413 } // End Predicates = [UnsafeFPMath]
414414
415 def : Pat <
416 (f32 (fpextend f16:$src)),
417 (V_CVT_F32_F16_e32 $src)
418 >;
419
420 def : Pat <
421 (f64 (fpextend f16:$src)),
422 (V_CVT_F64_F32_e32 (V_CVT_F32_F16_e32 $src))
423 >;
424
425 def : Pat <
426 (f16 (fpround f32:$src)),
427 (V_CVT_F16_F32_e32 $src)
428 >;
429
430 def : Pat <
431 (f16 (fpround f64:$src)),
432 (V_CVT_F16_F32_e32 (V_CVT_F32_F64_e32 $src))
433 >;
434
435 def : Pat <
436 (i32 (fp_to_sint f16:$src)),
437 (V_CVT_I32_F32_e32 (V_CVT_F32_F16_e32 $src))
438 >;
439
440 def : Pat <
441 (i32 (fp_to_uint f16:$src)),
442 (V_CVT_U32_F32_e32 (V_CVT_F32_F16_e32 $src))
443 >;
444
445 def : Pat <
446 (f16 (sint_to_fp i32:$src)),
447 (V_CVT_F16_F32_e32 (V_CVT_F32_I32_e32 $src))
448 >;
449
450 def : Pat <
451 (f16 (uint_to_fp i32:$src)),
452 (V_CVT_F16_F32_e32 (V_CVT_F32_U32_e32 $src))
453 >;
454
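Conversions without a single-instruction form are chained through f32, exactly as these patterns spell out; for instance, an f16-to-f64 extend is v_cvt_f32_f16 followed by v_cvt_f64_f32 (names illustrative):

define double @f16_to_f64(half %x) {
  %r = fpext half %x to double   ; v_cvt_f32_f16 + v_cvt_f64_f32
  ret double %r
}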
415455 //===----------------------------------------------------------------------===//
416456 // VOP2 Patterns
417457 //===----------------------------------------------------------------------===//
426466 (V_CNDMASK_B32_e64 $src2, $src1, $src0)
427467 >;
428468
469 // Pattern for V_MAC_F16
470 def : Pat <
471 (f16 (fmad (VOP3NoMods0 f16:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod),
472 (VOP3NoMods f16:$src1, i32:$src1_modifiers),
473 (VOP3NoMods f16:$src2, i32:$src2_modifiers))),
474 (V_MAC_F16_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
475 $src2_modifiers, $src2, $clamp, $omod)
476 >;
477
429478 // Pattern for V_MAC_F32
430479 def : Pat <
431 (fmad (VOP3NoMods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod),
432 (VOP3NoMods f32:$src1, i32:$src1_modifiers),
433 (VOP3NoMods f32:$src2, i32:$src2_modifiers)),
480 (f32 (fmad (VOP3NoMods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod),
481 (VOP3NoMods f32:$src1, i32:$src1_modifiers),
482 (VOP3NoMods f32:$src2, i32:$src2_modifiers))),
434483 (V_MAC_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
435484 $src2_modifiers, $src2, $clamp, $omod)
436485 >;
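The V_MAC_F16 pattern matches the fmad node, which the DAG may form from a contractable multiply-add when f16 denormals are disabled (see the FMAD legality change above). A hedged sketch, assuming fast-math flags permit contraction:

define half @f16_mac(half %a, half %b, half %c) {
  %m = fmul fast half %a, %b
  %r = fadd fast half %m, %c   ; may contract and select v_mac_f16 / v_mad_f16
  ret half %r
}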
505554
506555 // FIXME: Why do only some of these type combinations for SReg and
507556 // VReg?
557 // 16-bit bitcast
558 def : BitConvert <i16, f16, VGPR_32>;
559 def : BitConvert <f16, i16, VGPR_32>;
560 def : BitConvert <i16, f16, SReg_32>;
561 def : BitConvert <f16, i16, SReg_32>;
562
508563 // 32-bit bitcast
509564 def : BitConvert <i32, f32, VGPR_32>;
510565 def : BitConvert <f32, i32, VGPR_32>;
122122 // TODO: Do we need to set DwarfRegAlias on register tuples?
123123
124124 // SGPR 32-bit registers
125 def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32,
125 def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
126126 (add (sequence "SGPR%u", 0, 103))> {
127127 let AllocationPriority = 1;
128128 }
189189 (add (decimate (shl TTMP_32, 3), 4))]>;
190190
191191 // VGPR 32-bit registers
192 // i16 only on VI+
193 def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32,
192 def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
194193 (add (sequence "VGPR%u", 0, 255))> {
195194 let AllocationPriority = 1;
196195 let Size = 32;
251250
252251 // Subset of SReg_32 without M0 for SMRD instructions and alike.
253252 // See comments in SIInstructions.td for more info.
254 def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32], 32,
253 def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
255254 (add SGPR_32, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI,
256255 TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI)> {
257256 let AllocationPriority = 1;
258257 }
259258
260259 // Register class for all scalar registers (SGPRs + Special Registers)
261 def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32,
260 def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
262261 (add SReg_32_XM0, M0, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI)> {
263262 let AllocationPriority = 1;
264263 }
346345 let Size = 32;
347346 }
348347
349 def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32, (add VGPR_32, SReg_32)> {
348 def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
349 (add VGPR_32, SReg_32)> {
350350 let isAllocatable = 0;
351351 }
352352
9090 default: return false;
9191
9292 case AMDGPU::V_MAC_F32_e64:
93 case AMDGPU::V_MAC_F16_e64:
9394 if (!isVGPR(Src2, TRI, MRI) ||
9495 TII->hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
9596 return false;
279279
280280 let SubtargetPredicate = isVI in {
281281
282 defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP_F16_I16>;
283 defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP_F16_I16>;
284 defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16>;
285 defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16>;
286 defm V_RCP_F16 : VOP1Inst <"v_rcp_f16", VOP_F16_F16>;
287 defm V_SQRT_F16 : VOP1Inst <"v_sqrt_f16", VOP_F16_F16>;
288 defm V_RSQ_F16 : VOP1Inst <"v_rsq_f16", VOP_F16_F16>;
289 defm V_LOG_F16 : VOP1Inst <"v_log_f16", VOP_F16_F16>;
290 defm V_EXP_F16 : VOP1Inst <"v_exp_f16", VOP_F16_F16>;
291 defm V_FREXP_MANT_F16 : VOP1Inst <"v_frexp_mant_f16", VOP_F16_F16>;
292 defm V_FREXP_EXP_I16_F16 : VOP1Inst <"v_frexp_exp_i16_f16", VOP_I16_F16>;
293 defm V_FLOOR_F16 : VOP1Inst <"v_floor_f16", VOP_F16_F16>;
294 defm V_CEIL_F16 : VOP1Inst <"v_ceil_f16", VOP_F16_F16>;
295 defm V_TRUNC_F16 : VOP1Inst <"v_trunc_f16", VOP_F16_F16>;
296 defm V_RNDNE_F16 : VOP1Inst <"v_rndne_f16", VOP_F16_F16>;
297 defm V_FRACT_F16 : VOP1Inst <"v_fract_f16", VOP_F16_F16>;
298 defm V_SIN_F16 : VOP1Inst <"v_sin_f16", VOP_F16_F16>;
299 defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16>;
282 defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP_F16_I16, uint_to_fp>;
283 defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP_F16_I16, sint_to_fp>;
284 defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16, fp_to_uint>;
285 defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16, fp_to_sint>;
286 defm V_RCP_F16 : VOP1Inst <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>;
287 defm V_SQRT_F16 : VOP1Inst <"v_sqrt_f16", VOP_F16_F16, fsqrt>;
288 defm V_RSQ_F16 : VOP1Inst <"v_rsq_f16", VOP_F16_F16, AMDGPUrsq>;
289 defm V_LOG_F16 : VOP1Inst <"v_log_f16", VOP_F16_F16, flog2>;
290 defm V_EXP_F16 : VOP1Inst <"v_exp_f16", VOP_F16_F16, fexp2>;
291 defm V_FREXP_MANT_F16 : VOP1Inst <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>;
292 defm V_FREXP_EXP_I16_F16 : VOP1Inst <"v_frexp_exp_i16_f16", VOP_I32_F16, int_amdgcn_frexp_exp>;
293 defm V_FLOOR_F16 : VOP1Inst <"v_floor_f16", VOP_F16_F16, ffloor>;
294 defm V_CEIL_F16 : VOP1Inst <"v_ceil_f16", VOP_F16_F16, fceil>;
295 defm V_TRUNC_F16 : VOP1Inst <"v_trunc_f16", VOP_F16_F16, ftrunc>;
296 defm V_RNDNE_F16 : VOP1Inst <"v_rndne_f16", VOP_F16_F16, frint>;
297 defm V_FRACT_F16 : VOP1Inst <"v_fract_f16", VOP_F16_F16, AMDGPUfract>;
298 defm V_SIN_F16 : VOP1Inst <"v_sin_f16", VOP_F16_F16, AMDGPUsin>;
299 defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16, AMDGPUcos>;
300300
301301 }
302302
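With selection patterns now attached, the corresponding half intrinsics and nodes should map directly onto these VOP1 forms, e.g. @llvm.sqrt.f16 to v_sqrt_f16 and @llvm.floor.f16 to v_floor_f16. A sketch (names illustrative):

declare half @llvm.sqrt.f16(half)

define half @f16_sqrt(half %x) {
  %r = call half @llvm.sqrt.f16(half %x)   ; expected: v_sqrt_f16
  ret half %r
}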
132132 }
133133 }
134134
135 def VOP_MADAK : VOPProfile <[f32, f32, f32, f32]> {
135 class VOP_MADAK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
136136 field dag Ins32 = (ins VCSrc_f32:$src0, VGPR_32:$src1, f32kimm:$imm);
137137 field string Asm32 = "$vdst, $src0, $src1, $imm";
138138 field bit HasExt = 0;
139139 }
140140
141 def VOP_MADMK : VOPProfile <[f32, f32, f32, f32]> {
141 def VOP_MADAK_F16 : VOP_MADAK <f16>;
142 def VOP_MADAK_F32 : VOP_MADAK <f32>;
143
144 class VOP_MADMK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
142145 field dag Ins32 = (ins VCSrc_f32:$src0, f32kimm:$imm, VGPR_32:$src1);
143146 field string Asm32 = "$vdst, $src0, $imm, $src1";
144147 field bit HasExt = 0;
145148 }
146149
147 def VOP_MAC : VOPProfile <[f32, f32, f32, f32]> {
150 def VOP_MADMK_F16 : VOP_MADMK <f16>;
151 def VOP_MADMK_F32 : VOP_MADMK <f32>;
152
153 class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
148154 let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2);
149155   let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3,
150156 HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret;
158164 VGPR_32:$src2, // stub argument
159165 clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
160166 src0_sel:$src0_sel, src1_sel:$src1_sel);
161 let Asm32 = getAsm32<1, 2, f32>.ret;
162 let Asm64 = getAsm64<1, 2, HasModifiers, f32>.ret;
163 let AsmDPP = getAsmDPP<1, 2, HasModifiers, f32>.ret;
164 let AsmSDWA = getAsmSDWA<1, 2, HasModifiers, f32>.ret;
167 let Asm32 = getAsm32<1, 2, vt>.ret;
168 let AsmDPP = getAsmDPP<1, 2, HasModifiers, vt>.ret;
169 let AsmSDWA = getAsmSDWA<1, 2, HasModifiers, vt>.ret;
165170 let HasSrc2 = 0;
166171 let HasSrc2Mods = 0;
167172 let HasExt = 1;
173 }
174
175 def VOP_MAC_F16 : VOP_MAC <f16> {
176 // FIXME: Move 'Asm64' definition to VOP_MAC, and use 'vt'. Currently it gives
177 // 'not a string initializer' error.
178 let Asm64 = getAsm64<1, 2, HasModifiers, f16>.ret;
179 }
180
181 def VOP_MAC_F32 : VOP_MAC <f32> {
182 // FIXME: Move 'Asm64' definition to VOP_MAC, and use 'vt'. Currently it gives
183 // 'not a string initializer' error.
184 let Asm64 = getAsm64<1, 2, HasModifiers, f32>.ret;
168185 }
169186
170187 // Write out to vcc or arbitrary SGPR.
232249 let SubtargetPredicate = isGCN in {
233250
234251 defm V_CNDMASK_B32 : VOP2eInst <"v_cndmask_b32", VOP2e_I32_I32_I32_I1>;
235 def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK>;
252 def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32>;
236253
237254 let isCommutable = 1 in {
238255 defm V_ADD_F32 : VOP2Inst <"v_add_f32", VOP_F32_F32_F32, fadd>;
259276
260277 let Constraints = "$vdst = $src2", DisableEncoding="$src2",
261278 isConvertibleToThreeAddress = 1 in {
262 defm V_MAC_F32 : VOP2Inst <"v_mac_f32", VOP_MAC>;
263 }
264
265 def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK>;
279 defm V_MAC_F32 : VOP2Inst <"v_mac_f32", VOP_MAC_F32>;
280 }
281
282 def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32>;
266283
267284 // No patterns so that the scalar instructions are always selected.
268285 // The scalar versions will be replaced with vector when needed later.
317334
318335 let SubtargetPredicate = isVI in {
319336
320 def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK>;
337 def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16>;
321338 defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16>;
322339 defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16>;
323340 defm V_ASHRREV_B16 : VOP2Inst <"v_ashrrev_b16", VOP_I16_I16_I16>;
324 defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I16>;
341 defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>;
325342
326343 let isCommutable = 1 in {
327 defm V_ADD_F16 : VOP2Inst <"v_add_f16", VOP_F16_F16_F16>;
328 defm V_SUB_F16 : VOP2Inst <"v_sub_f16", VOP_F16_F16_F16>;
344 defm V_ADD_F16 : VOP2Inst <"v_add_f16", VOP_F16_F16_F16, fadd>;
345 defm V_SUB_F16 : VOP2Inst <"v_sub_f16", VOP_F16_F16_F16, fsub>;
329346 defm V_SUBREV_F16 : VOP2Inst <"v_subrev_f16", VOP_F16_F16_F16, null_frag, "v_sub_f16">;
330 defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16>;
331 defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_F16_F16_F16>;
332 def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK>;
347 defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16, fmul>;
348 def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16>;
333349 defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16>;
334350 defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16>;
335351 defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16>;
336352 defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16>;
337 defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16>;
338 defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16>;
353 defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum>;
354 defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum>;
339355 defm V_MAX_U16 : VOP2Inst <"v_max_u16", VOP_I16_I16_I16>;
340356 defm V_MAX_I16 : VOP2Inst <"v_max_i16", VOP_I16_I16_I16>;
341357 defm V_MIN_U16 : VOP2Inst <"v_min_u16", VOP_I16_I16_I16>;
342358 defm V_MIN_I16 : VOP2Inst <"v_min_i16", VOP_I16_I16_I16>;
359
360 let Constraints = "$vdst = $src2", DisableEncoding="$src2",
361 isConvertibleToThreeAddress = 1 in {
362 defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>;
363 }
343364 } // End isCommutable = 1
344365
345366 } // End SubtargetPredicate = isVI
214214 let SubtargetPredicate = isVI in {
215215
216216 let isCommutable = 1 in {
217 def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>>;
218 def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile<VOP_I16_I16_I16_I16>>;
219 def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16>>;
220 }
217
218 def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup>;
219 def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fma>;
220 def V_INTERP_P1LL_F16 : VOP3Inst <"v_interp_p1ll_f16", VOP3_Profile<VOP_F32_F32_F16>>;
221 def V_INTERP_P1LV_F16 : VOP3Inst <"v_interp_p1lv_f16", VOP3_Profile<VOP_F32_F32_F16_F16>>;
222 def V_INTERP_P2_F16 : VOP3Inst <"v_interp_p2_f16", VOP3_Profile<VOP_F16_F32_F16_F32>>;
223 def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>;
224
225 def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile<VOP_I16_I16_I16_I16>>;
226 def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16>>;
227
228 } // End isCommutable = 1
221229
222230 } // End SubtargetPredicate = isVI
223231
414422 defm V_MAD_U16 : VOP3_Real_vi <0x1eb>;
415423 defm V_MAD_I16 : VOP3_Real_vi <0x1ec>;
416424
425 defm V_FMA_F16 : VOP3_Real_vi <0x1ee>;
426 defm V_DIV_FIXUP_F16 : VOP3_Real_vi <0x1ef>;
427
428 defm V_INTERP_P1LL_F16 : VOP3_Real_vi <0x274>;
429 defm V_INTERP_P1LV_F16 : VOP3_Real_vi <0x275>;
430 defm V_INTERP_P2_F16 : VOP3_Real_vi <0x276>;
417431 defm V_ADD_F64 : VOP3_Real_vi <0x280>;
418432 defm V_MUL_F64 : VOP3_Real_vi <0x281>;
419433 defm V_MIN_F64 : VOP3_Real_vi <0x282>;
143143 }
144144 }
145145
146 def VOPC_I1_F16_F16 : VOPC_Profile<[Write32Bit], f16>;
146147 def VOPC_I1_F32_F32 : VOPC_Profile<[Write32Bit], f32>;
147148 def VOPC_I1_F64_F64 : VOPC_Profile<[WriteDoubleAdd], f64>;
148149 def VOPC_I1_I32_I32 : VOPC_Profile<[Write32Bit], i32>;
149150 def VOPC_I1_I64_I64 : VOPC_Profile<[Write64Bit], i64>;
150151
152 multiclass VOPC_F16 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
153   VOPC_Pseudos <opName, VOPC_I1_F16_F16, cond, revOp, 0>;
154
151155 multiclass VOPC_F32 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
152156   VOPC_Pseudos <opName, VOPC_I1_F32_F32, cond, revOp, 0>;
153157
159163
160164 multiclass VOPC_I64 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
161165   VOPC_Pseudos <opName, VOPC_I1_I64_I64, cond, revOp, 0>;
166
167 multiclass VOPCX_F16 <string opName, string revOp = opName> :
168   VOPC_Pseudos <opName, VOPC_I1_F16_F16, COND_NULL, revOp, 1>;
162169
163170 multiclass VOPCX_F32 <string opName, string revOp = opName> :
164171   VOPC_Pseudos <opName, VOPC_I1_F32_F32, COND_NULL, revOp, 1>;
317324
318325 } // End SubtargetPredicate = isSICI
319326
327 let SubtargetPredicate = isVI in {
328
329 defm V_CMP_F_F16 : VOPC_F16 <"v_cmp_f_f16">;
330 defm V_CMP_LT_F16 : VOPC_F16 <"v_cmp_lt_f16", COND_OLT, "v_cmp_gt_f16">;
331 defm V_CMP_EQ_F16 : VOPC_F16 <"v_cmp_eq_f16", COND_OEQ>;
332 defm V_CMP_LE_F16 : VOPC_F16 <"v_cmp_le_f16", COND_OLE, "v_cmp_ge_f16">;
333 defm V_CMP_GT_F16 : VOPC_F16 <"v_cmp_gt_f16", COND_OGT>;
334 defm V_CMP_LG_F16 : VOPC_F16 <"v_cmp_lg_f16", COND_ONE>;
335 defm V_CMP_GE_F16 : VOPC_F16 <"v_cmp_ge_f16", COND_OGE>;
336 defm V_CMP_O_F16 : VOPC_F16 <"v_cmp_o_f16", COND_O>;
337 defm V_CMP_U_F16 : VOPC_F16 <"v_cmp_u_f16", COND_UO>;
338 defm V_CMP_NGE_F16 : VOPC_F16 <"v_cmp_nge_f16", COND_ULT, "v_cmp_nle_f16">;
339 defm V_CMP_NLG_F16 : VOPC_F16 <"v_cmp_nlg_f16", COND_UEQ>;
340 defm V_CMP_NGT_F16 : VOPC_F16 <"v_cmp_ngt_f16", COND_ULE, "v_cmp_nlt_f16">;
341 defm V_CMP_NLE_F16 : VOPC_F16 <"v_cmp_nle_f16", COND_UGT>;
342 defm V_CMP_NEQ_F16 : VOPC_F16 <"v_cmp_neq_f16", COND_UNE>;
343 defm V_CMP_NLT_F16 : VOPC_F16 <"v_cmp_nlt_f16", COND_UGE>;
344 defm V_CMP_TRU_F16 : VOPC_F16 <"v_cmp_tru_f16">;
345
346 defm V_CMPX_F_F16 : VOPCX_F16 <"v_cmpx_f_f16">;
347 defm V_CMPX_LT_F16 : VOPCX_F16 <"v_cmpx_lt_f16", "v_cmpx_gt_f16">;
348 defm V_CMPX_EQ_F16 : VOPCX_F16 <"v_cmpx_eq_f16">;
349 defm V_CMPX_LE_F16 : VOPCX_F16 <"v_cmpx_le_f16", "v_cmpx_ge_f16">;
350 defm V_CMPX_GT_F16 : VOPCX_F16 <"v_cmpx_gt_f16">;
351 defm V_CMPX_LG_F16 : VOPCX_F16 <"v_cmpx_lg_f16">;
352 defm V_CMPX_GE_F16 : VOPCX_F16 <"v_cmpx_ge_f16">;
353 defm V_CMPX_O_F16 : VOPCX_F16 <"v_cmpx_o_f16">;
354 defm V_CMPX_U_F16 : VOPCX_F16 <"v_cmpx_u_f16">;
355 defm V_CMPX_NGE_F16 : VOPCX_F16 <"v_cmpx_nge_f16">;
356 defm V_CMPX_NLG_F16 : VOPCX_F16 <"v_cmpx_nlg_f16">;
357 defm V_CMPX_NGT_F16 : VOPCX_F16 <"v_cmpx_ngt_f16">;
358 defm V_CMPX_NLE_F16 : VOPCX_F16 <"v_cmpx_nle_f16">;
359 defm V_CMPX_NEQ_F16 : VOPCX_F16 <"v_cmpx_neq_f16">;
360 defm V_CMPX_NLT_F16 : VOPCX_F16 <"v_cmpx_nlt_f16">;
361 defm V_CMPX_TRU_F16 : VOPCX_F16 <"v_cmpx_tru_f16">;
362
363 } // End SubtargetPredicate = isVI
364
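These definitions hang the IR fcmp predicates on half off the new 16-bit comparison opcodes; ordered less-than, for example, is COND_OLT and should select v_cmp_lt_f16 (with v_cmp_gt_f16 as the commuted form). A sketch (names illustrative):

define i1 @f16_olt(half %a, half %b) {
  %c = fcmp olt half %a, %b   ; expected: v_cmp_lt_f16
  ret i1 %c
}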
320365 defm V_CMP_F_I32 : VOPC_I32 <"v_cmp_f_i32">;
321366 defm V_CMP_LT_I32 : VOPC_I32 <"v_cmp_lt_i32", COND_SLT, "v_cmp_gt_i32">;
322367 defm V_CMP_EQ_I32 : VOPC_I32 <"v_cmp_eq_i32">;
428473 }
429474 }
430475
476 def VOPC_I1_F16_I32 : VOPC_Class_Profile<[Write32Bit], f16>;
431477 def VOPC_I1_F32_I32 : VOPC_Class_Profile<[Write32Bit], f32>;
432478 def VOPC_I1_F64_I32 : VOPC_Class_Profile<[WriteDoubleAdd], f64>;
479
480 multiclass VOPC_CLASS_F16 <string opName> :
481   VOPC_Class_Pseudos <opName, VOPC_I1_F16_I32, 0>;
482
483 multiclass VOPCX_CLASS_F16 <string opName> :
484   VOPC_Class_Pseudos <opName, VOPC_I1_F16_I32, 1>;
433485
434486 multiclass VOPC_CLASS_F32 <string opName> :
435487   VOPC_Class_Pseudos <opName, VOPC_I1_F32_I32, 0>;
447499 defm V_CMPX_CLASS_F32 : VOPCX_CLASS_F32 <"v_cmpx_class_f32">;
448500 defm V_CMP_CLASS_F64 : VOPC_CLASS_F64 <"v_cmp_class_f64">;
449501 defm V_CMPX_CLASS_F64 : VOPCX_CLASS_F64 <"v_cmpx_class_f64">;
502 defm V_CMP_CLASS_F16 : VOPC_CLASS_F16 <"v_cmp_class_f16">;
503 defm V_CMPX_CLASS_F16 : VOPCX_CLASS_F16 <"v_cmpx_class_f16">;
450504
451505 //===----------------------------------------------------------------------===//
452506 // V_ICMPIntrinsic Pattern.
809863 }
810864 }
811865
812 defm V_CMP_F_F32 : VOPC_Real_vi <0x40>;
813 defm V_CMP_LT_F32 : VOPC_Real_vi <0x41>;
814 defm V_CMP_EQ_F32 : VOPC_Real_vi <0x42>;
815 defm V_CMP_LE_F32 : VOPC_Real_vi <0x43>;
816 defm V_CMP_GT_F32 : VOPC_Real_vi <0x44>;
817 defm V_CMP_LG_F32 : VOPC_Real_vi <0x45>;
818 defm V_CMP_GE_F32 : VOPC_Real_vi <0x46>;
819 defm V_CMP_O_F32 : VOPC_Real_vi <0x47>;
820 defm V_CMP_U_F32 : VOPC_Real_vi <0x48>;
821 defm V_CMP_NGE_F32 : VOPC_Real_vi <0x49>;
822 defm V_CMP_NLG_F32 : VOPC_Real_vi <0x4a>;
823 defm V_CMP_NGT_F32 : VOPC_Real_vi <0x4b>;
824 defm V_CMP_NLE_F32 : VOPC_Real_vi <0x4c>;
825 defm V_CMP_NEQ_F32 : VOPC_Real_vi <0x4d>;
826 defm V_CMP_NLT_F32 : VOPC_Real_vi <0x4e>;
827 defm V_CMP_TRU_F32 : VOPC_Real_vi <0x4f>;
828
829 defm V_CMPX_F_F32 : VOPC_Real_vi <0x50>;
830 defm V_CMPX_LT_F32 : VOPC_Real_vi <0x51>;
831 defm V_CMPX_EQ_F32 : VOPC_Real_vi <0x52>;
832 defm V_CMPX_LE_F32 : VOPC_Real_vi <0x53>;
833 defm V_CMPX_GT_F32 : VOPC_Real_vi <0x54>;
834 defm V_CMPX_LG_F32 : VOPC_Real_vi <0x55>;
835 defm V_CMPX_GE_F32 : VOPC_Real_vi <0x56>;
836 defm V_CMPX_O_F32 : VOPC_Real_vi <0x57>;
837 defm V_CMPX_U_F32 : VOPC_Real_vi <0x58>;
838 defm V_CMPX_NGE_F32 : VOPC_Real_vi <0x59>;
839 defm V_CMPX_NLG_F32 : VOPC_Real_vi <0x5a>;
840 defm V_CMPX_NGT_F32 : VOPC_Real_vi <0x5b>;
841 defm V_CMPX_NLE_F32 : VOPC_Real_vi <0x5c>;
842 defm V_CMPX_NEQ_F32 : VOPC_Real_vi <0x5d>;
843 defm V_CMPX_NLT_F32 : VOPC_Real_vi <0x5e>;
844 defm V_CMPX_TRU_F32 : VOPC_Real_vi <0x5f>;
845
846 defm V_CMP_F_F64 : VOPC_Real_vi <0x60>;
847 defm V_CMP_LT_F64 : VOPC_Real_vi <0x61>;
848 defm V_CMP_EQ_F64 : VOPC_Real_vi <0x62>;
849 defm V_CMP_LE_F64 : VOPC_Real_vi <0x63>;
850 defm V_CMP_GT_F64 : VOPC_Real_vi <0x64>;
851 defm V_CMP_LG_F64 : VOPC_Real_vi <0x65>;
852 defm V_CMP_GE_F64 : VOPC_Real_vi <0x66>;
853 defm V_CMP_O_F64 : VOPC_Real_vi <0x67>;
854 defm V_CMP_U_F64 : VOPC_Real_vi <0x68>;
855 defm V_CMP_NGE_F64 : VOPC_Real_vi <0x69>;
856 defm V_CMP_NLG_F64 : VOPC_Real_vi <0x6a>;
857 defm V_CMP_NGT_F64 : VOPC_Real_vi <0x6b>;
858 defm V_CMP_NLE_F64 : VOPC_Real_vi <0x6c>;
859 defm V_CMP_NEQ_F64 : VOPC_Real_vi <0x6d>;
860 defm V_CMP_NLT_F64 : VOPC_Real_vi <0x6e>;
861 defm V_CMP_TRU_F64 : VOPC_Real_vi <0x6f>;
862
863 defm V_CMPX_F_F64 : VOPC_Real_vi <0x70>;
864 defm V_CMPX_LT_F64 : VOPC_Real_vi <0x71>;
865 defm V_CMPX_EQ_F64 : VOPC_Real_vi <0x72>;
866 defm V_CMPX_LE_F64 : VOPC_Real_vi <0x73>;
867 defm V_CMPX_GT_F64 : VOPC_Real_vi <0x74>;
868 defm V_CMPX_LG_F64 : VOPC_Real_vi <0x75>;
869 defm V_CMPX_GE_F64 : VOPC_Real_vi <0x76>;
870 defm V_CMPX_O_F64 : VOPC_Real_vi <0x77>;
871 defm V_CMPX_U_F64 : VOPC_Real_vi <0x78>;
872 defm V_CMPX_NGE_F64 : VOPC_Real_vi <0x79>;
873 defm V_CMPX_NLG_F64 : VOPC_Real_vi <0x7a>;
874 defm V_CMPX_NGT_F64 : VOPC_Real_vi <0x7b>;
875 defm V_CMPX_NLE_F64 : VOPC_Real_vi <0x7c>;
876 defm V_CMPX_NEQ_F64 : VOPC_Real_vi <0x7d>;
877 defm V_CMPX_NLT_F64 : VOPC_Real_vi <0x7e>;
878 defm V_CMPX_TRU_F64 : VOPC_Real_vi <0x7f>;
879
880 defm V_CMP_F_I32 : VOPC_Real_vi <0xc0>;
881 defm V_CMP_LT_I32 : VOPC_Real_vi <0xc1>;
882 defm V_CMP_EQ_I32 : VOPC_Real_vi <0xc2>;
883 defm V_CMP_LE_I32 : VOPC_Real_vi <0xc3>;
884 defm V_CMP_GT_I32 : VOPC_Real_vi <0xc4>;
885 defm V_CMP_NE_I32 : VOPC_Real_vi <0xc5>;
886 defm V_CMP_GE_I32 : VOPC_Real_vi <0xc6>;
887 defm V_CMP_T_I32 : VOPC_Real_vi <0xc7>;
888
889 defm V_CMPX_F_I32 : VOPC_Real_vi <0xd0>;
890 defm V_CMPX_LT_I32 : VOPC_Real_vi <0xd1>;
891 defm V_CMPX_EQ_I32 : VOPC_Real_vi <0xd2>;
892 defm V_CMPX_LE_I32 : VOPC_Real_vi <0xd3>;
893 defm V_CMPX_GT_I32 : VOPC_Real_vi <0xd4>;
894 defm V_CMPX_NE_I32 : VOPC_Real_vi <0xd5>;
895 defm V_CMPX_GE_I32 : VOPC_Real_vi <0xd6>;
896 defm V_CMPX_T_I32 : VOPC_Real_vi <0xd7>;
897
898 defm V_CMP_F_I64 : VOPC_Real_vi <0xe0>;
899 defm V_CMP_LT_I64 : VOPC_Real_vi <0xe1>;
900 defm V_CMP_EQ_I64 : VOPC_Real_vi <0xe2>;
901 defm V_CMP_LE_I64 : VOPC_Real_vi <0xe3>;
902 defm V_CMP_GT_I64 : VOPC_Real_vi <0xe4>;
903 defm V_CMP_NE_I64 : VOPC_Real_vi <0xe5>;
904 defm V_CMP_GE_I64 : VOPC_Real_vi <0xe6>;
905 defm V_CMP_T_I64 : VOPC_Real_vi <0xe7>;
906
907 defm V_CMPX_F_I64 : VOPC_Real_vi <0xf0>;
908 defm V_CMPX_LT_I64 : VOPC_Real_vi <0xf1>;
909 defm V_CMPX_EQ_I64 : VOPC_Real_vi <0xf2>;
910 defm V_CMPX_LE_I64 : VOPC_Real_vi <0xf3>;
911 defm V_CMPX_GT_I64 : VOPC_Real_vi <0xf4>;
912 defm V_CMPX_NE_I64 : VOPC_Real_vi <0xf5>;
913 defm V_CMPX_GE_I64 : VOPC_Real_vi <0xf6>;
914 defm V_CMPX_T_I64 : VOPC_Real_vi <0xf7>;
915
916 defm V_CMP_F_U32 : VOPC_Real_vi <0xc8>;
917 defm V_CMP_LT_U32 : VOPC_Real_vi <0xc9>;
918 defm V_CMP_EQ_U32 : VOPC_Real_vi <0xca>;
919 defm V_CMP_LE_U32 : VOPC_Real_vi <0xcb>;
920 defm V_CMP_GT_U32 : VOPC_Real_vi <0xcc>;
921 defm V_CMP_NE_U32 : VOPC_Real_vi <0xcd>;
922 defm V_CMP_GE_U32 : VOPC_Real_vi <0xce>;
923 defm V_CMP_T_U32 : VOPC_Real_vi <0xcf>;
924
925 defm V_CMPX_F_U32 : VOPC_Real_vi <0xd8>;
926 defm V_CMPX_LT_U32 : VOPC_Real_vi <0xd9>;
927 defm V_CMPX_EQ_U32 : VOPC_Real_vi <0xda>;
928 defm V_CMPX_LE_U32 : VOPC_Real_vi <0xdb>;
929 defm V_CMPX_GT_U32 : VOPC_Real_vi <0xdc>;
930 defm V_CMPX_NE_U32 : VOPC_Real_vi <0xdd>;
931 defm V_CMPX_GE_U32 : VOPC_Real_vi <0xde>;
932 defm V_CMPX_T_U32 : VOPC_Real_vi <0xdf>;
933
934 defm V_CMP_F_U64 : VOPC_Real_vi <0xe8>;
935 defm V_CMP_LT_U64 : VOPC_Real_vi <0xe9>;
936 defm V_CMP_EQ_U64 : VOPC_Real_vi <0xea>;
937 defm V_CMP_LE_U64 : VOPC_Real_vi <0xeb>;
938 defm V_CMP_GT_U64 : VOPC_Real_vi <0xec>;
939 defm V_CMP_NE_U64 : VOPC_Real_vi <0xed>;
940 defm V_CMP_GE_U64 : VOPC_Real_vi <0xee>;
941 defm V_CMP_T_U64 : VOPC_Real_vi <0xef>;
942
943 defm V_CMPX_F_U64 : VOPC_Real_vi <0xf8>;
944 defm V_CMPX_LT_U64 : VOPC_Real_vi <0xf9>;
945 defm V_CMPX_EQ_U64 : VOPC_Real_vi <0xfa>;
946 defm V_CMPX_LE_U64 : VOPC_Real_vi <0xfb>;
947 defm V_CMPX_GT_U64 : VOPC_Real_vi <0xfc>;
948 defm V_CMPX_NE_U64 : VOPC_Real_vi <0xfd>;
949 defm V_CMPX_GE_U64 : VOPC_Real_vi <0xfe>;
950 defm V_CMPX_T_U64 : VOPC_Real_vi <0xff>;
951
952866 defm V_CMP_CLASS_F32 : VOPC_Real_vi <0x10>;
953867 defm V_CMPX_CLASS_F32 : VOPC_Real_vi <0x11>;
954868 defm V_CMP_CLASS_F64 : VOPC_Real_vi <0x12>;
955869 defm V_CMPX_CLASS_F64 : VOPC_Real_vi <0x13>;
870 defm V_CMP_CLASS_F16 : VOPC_Real_vi <0x14>;
871 defm V_CMPX_CLASS_F16 : VOPC_Real_vi <0x15>;
872
873 defm V_CMP_F_F16 : VOPC_Real_vi <0x20>;
874 defm V_CMP_LT_F16 : VOPC_Real_vi <0x21>;
875 defm V_CMP_EQ_F16 : VOPC_Real_vi <0x22>;
876 defm V_CMP_LE_F16 : VOPC_Real_vi <0x23>;
877 defm V_CMP_GT_F16 : VOPC_Real_vi <0x24>;
878 defm V_CMP_LG_F16 : VOPC_Real_vi <0x25>;
879 defm V_CMP_GE_F16 : VOPC_Real_vi <0x26>;
880 defm V_CMP_O_F16 : VOPC_Real_vi <0x27>;
881 defm V_CMP_U_F16 : VOPC_Real_vi <0x28>;
882 defm V_CMP_NGE_F16 : VOPC_Real_vi <0x29>;
883 defm V_CMP_NLG_F16 : VOPC_Real_vi <0x2a>;
884 defm V_CMP_NGT_F16 : VOPC_Real_vi <0x2b>;
885 defm V_CMP_NLE_F16 : VOPC_Real_vi <0x2c>;
886 defm V_CMP_NEQ_F16 : VOPC_Real_vi <0x2d>;
887 defm V_CMP_NLT_F16 : VOPC_Real_vi <0x2e>;
888 defm V_CMP_TRU_F16 : VOPC_Real_vi <0x2f>;
889
890 defm V_CMPX_F_F16 : VOPC_Real_vi <0x30>;
891 defm V_CMPX_LT_F16 : VOPC_Real_vi <0x31>;
892 defm V_CMPX_EQ_F16 : VOPC_Real_vi <0x32>;
893 defm V_CMPX_LE_F16 : VOPC_Real_vi <0x33>;
894 defm V_CMPX_GT_F16 : VOPC_Real_vi <0x34>;
895 defm V_CMPX_LG_F16 : VOPC_Real_vi <0x35>;
896 defm V_CMPX_GE_F16 : VOPC_Real_vi <0x36>;
897 defm V_CMPX_O_F16 : VOPC_Real_vi <0x37>;
898 defm V_CMPX_U_F16 : VOPC_Real_vi <0x38>;
899 defm V_CMPX_NGE_F16 : VOPC_Real_vi <0x39>;
900 defm V_CMPX_NLG_F16 : VOPC_Real_vi <0x3a>;
901 defm V_CMPX_NGT_F16 : VOPC_Real_vi <0x3b>;
902 defm V_CMPX_NLE_F16 : VOPC_Real_vi <0x3c>;
903 defm V_CMPX_NEQ_F16 : VOPC_Real_vi <0x3d>;
904 defm V_CMPX_NLT_F16 : VOPC_Real_vi <0x3e>;
905 defm V_CMPX_TRU_F16 : VOPC_Real_vi <0x3f>;
906
907 defm V_CMP_F_F32 : VOPC_Real_vi <0x40>;
908 defm V_CMP_LT_F32 : VOPC_Real_vi <0x41>;
909 defm V_CMP_EQ_F32 : VOPC_Real_vi <0x42>;
910 defm V_CMP_LE_F32 : VOPC_Real_vi <0x43>;
911 defm V_CMP_GT_F32 : VOPC_Real_vi <0x44>;
912 defm V_CMP_LG_F32 : VOPC_Real_vi <0x45>;
913 defm V_CMP_GE_F32 : VOPC_Real_vi <0x46>;
914 defm V_CMP_O_F32 : VOPC_Real_vi <0x47>;
915 defm V_CMP_U_F32 : VOPC_Real_vi <0x48>;
916 defm V_CMP_NGE_F32 : VOPC_Real_vi <0x49>;
917 defm V_CMP_NLG_F32 : VOPC_Real_vi <0x4a>;
918 defm V_CMP_NGT_F32 : VOPC_Real_vi <0x4b>;
919 defm V_CMP_NLE_F32 : VOPC_Real_vi <0x4c>;
920 defm V_CMP_NEQ_F32 : VOPC_Real_vi <0x4d>;
921 defm V_CMP_NLT_F32 : VOPC_Real_vi <0x4e>;
922 defm V_CMP_TRU_F32 : VOPC_Real_vi <0x4f>;
923
924 defm V_CMPX_F_F32 : VOPC_Real_vi <0x50>;
925 defm V_CMPX_LT_F32 : VOPC_Real_vi <0x51>;
926 defm V_CMPX_EQ_F32 : VOPC_Real_vi <0x52>;
927 defm V_CMPX_LE_F32 : VOPC_Real_vi <0x53>;
928 defm V_CMPX_GT_F32 : VOPC_Real_vi <0x54>;
929 defm V_CMPX_LG_F32 : VOPC_Real_vi <0x55>;
930 defm V_CMPX_GE_F32 : VOPC_Real_vi <0x56>;
931 defm V_CMPX_O_F32 : VOPC_Real_vi <0x57>;
932 defm V_CMPX_U_F32 : VOPC_Real_vi <0x58>;
933 defm V_CMPX_NGE_F32 : VOPC_Real_vi <0x59>;
934 defm V_CMPX_NLG_F32 : VOPC_Real_vi <0x5a>;
935 defm V_CMPX_NGT_F32 : VOPC_Real_vi <0x5b>;
936 defm V_CMPX_NLE_F32 : VOPC_Real_vi <0x5c>;
937 defm V_CMPX_NEQ_F32 : VOPC_Real_vi <0x5d>;
938 defm V_CMPX_NLT_F32 : VOPC_Real_vi <0x5e>;
939 defm V_CMPX_TRU_F32 : VOPC_Real_vi <0x5f>;
940
941 defm V_CMP_F_F64 : VOPC_Real_vi <0x60>;
942 defm V_CMP_LT_F64 : VOPC_Real_vi <0x61>;
943 defm V_CMP_EQ_F64 : VOPC_Real_vi <0x62>;
944 defm V_CMP_LE_F64 : VOPC_Real_vi <0x63>;
945 defm V_CMP_GT_F64 : VOPC_Real_vi <0x64>;
946 defm V_CMP_LG_F64 : VOPC_Real_vi <0x65>;
947 defm V_CMP_GE_F64 : VOPC_Real_vi <0x66>;
948 defm V_CMP_O_F64 : VOPC_Real_vi <0x67>;
949 defm V_CMP_U_F64 : VOPC_Real_vi <0x68>;
950 defm V_CMP_NGE_F64 : VOPC_Real_vi <0x69>;
951 defm V_CMP_NLG_F64 : VOPC_Real_vi <0x6a>;
952 defm V_CMP_NGT_F64 : VOPC_Real_vi <0x6b>;
953 defm V_CMP_NLE_F64 : VOPC_Real_vi <0x6c>;
954 defm V_CMP_NEQ_F64 : VOPC_Real_vi <0x6d>;
955 defm V_CMP_NLT_F64 : VOPC_Real_vi <0x6e>;
956 defm V_CMP_TRU_F64 : VOPC_Real_vi <0x6f>;
957
958 defm V_CMPX_F_F64 : VOPC_Real_vi <0x70>;
959 defm V_CMPX_LT_F64 : VOPC_Real_vi <0x71>;
960 defm V_CMPX_EQ_F64 : VOPC_Real_vi <0x72>;
961 defm V_CMPX_LE_F64 : VOPC_Real_vi <0x73>;
962 defm V_CMPX_GT_F64 : VOPC_Real_vi <0x74>;
963 defm V_CMPX_LG_F64 : VOPC_Real_vi <0x75>;
964 defm V_CMPX_GE_F64 : VOPC_Real_vi <0x76>;
965 defm V_CMPX_O_F64 : VOPC_Real_vi <0x77>;
966 defm V_CMPX_U_F64 : VOPC_Real_vi <0x78>;
967 defm V_CMPX_NGE_F64 : VOPC_Real_vi <0x79>;
968 defm V_CMPX_NLG_F64 : VOPC_Real_vi <0x7a>;
969 defm V_CMPX_NGT_F64 : VOPC_Real_vi <0x7b>;
970 defm V_CMPX_NLE_F64 : VOPC_Real_vi <0x7c>;
971 defm V_CMPX_NEQ_F64 : VOPC_Real_vi <0x7d>;
972 defm V_CMPX_NLT_F64 : VOPC_Real_vi <0x7e>;
973 defm V_CMPX_TRU_F64 : VOPC_Real_vi <0x7f>;
974
975 defm V_CMP_F_I32 : VOPC_Real_vi <0xc0>;
976 defm V_CMP_LT_I32 : VOPC_Real_vi <0xc1>;
977 defm V_CMP_EQ_I32 : VOPC_Real_vi <0xc2>;
978 defm V_CMP_LE_I32 : VOPC_Real_vi <0xc3>;
979 defm V_CMP_GT_I32 : VOPC_Real_vi <0xc4>;
980 defm V_CMP_NE_I32 : VOPC_Real_vi <0xc5>;
981 defm V_CMP_GE_I32 : VOPC_Real_vi <0xc6>;
982 defm V_CMP_T_I32 : VOPC_Real_vi <0xc7>;
983
984 defm V_CMPX_F_I32 : VOPC_Real_vi <0xd0>;
985 defm V_CMPX_LT_I32 : VOPC_Real_vi <0xd1>;
986 defm V_CMPX_EQ_I32 : VOPC_Real_vi <0xd2>;
987 defm V_CMPX_LE_I32 : VOPC_Real_vi <0xd3>;
988 defm V_CMPX_GT_I32 : VOPC_Real_vi <0xd4>;
989 defm V_CMPX_NE_I32 : VOPC_Real_vi <0xd5>;
990 defm V_CMPX_GE_I32 : VOPC_Real_vi <0xd6>;
991 defm V_CMPX_T_I32 : VOPC_Real_vi <0xd7>;
992
993 defm V_CMP_F_I64 : VOPC_Real_vi <0xe0>;
994 defm V_CMP_LT_I64 : VOPC_Real_vi <0xe1>;
995 defm V_CMP_EQ_I64 : VOPC_Real_vi <0xe2>;
996 defm V_CMP_LE_I64 : VOPC_Real_vi <0xe3>;
997 defm V_CMP_GT_I64 : VOPC_Real_vi <0xe4>;
998 defm V_CMP_NE_I64 : VOPC_Real_vi <0xe5>;
999 defm V_CMP_GE_I64 : VOPC_Real_vi <0xe6>;
1000 defm V_CMP_T_I64 : VOPC_Real_vi <0xe7>;
1001
1002 defm V_CMPX_F_I64 : VOPC_Real_vi <0xf0>;
1003 defm V_CMPX_LT_I64 : VOPC_Real_vi <0xf1>;
1004 defm V_CMPX_EQ_I64 : VOPC_Real_vi <0xf2>;
1005 defm V_CMPX_LE_I64 : VOPC_Real_vi <0xf3>;
1006 defm V_CMPX_GT_I64 : VOPC_Real_vi <0xf4>;
1007 defm V_CMPX_NE_I64 : VOPC_Real_vi <0xf5>;
1008 defm V_CMPX_GE_I64 : VOPC_Real_vi <0xf6>;
1009 defm V_CMPX_T_I64 : VOPC_Real_vi <0xf7>;
1010
1011 defm V_CMP_F_U32 : VOPC_Real_vi <0xc8>;
1012 defm V_CMP_LT_U32 : VOPC_Real_vi <0xc9>;
1013 defm V_CMP_EQ_U32 : VOPC_Real_vi <0xca>;
1014 defm V_CMP_LE_U32 : VOPC_Real_vi <0xcb>;
1015 defm V_CMP_GT_U32 : VOPC_Real_vi <0xcc>;
1016 defm V_CMP_NE_U32 : VOPC_Real_vi <0xcd>;
1017 defm V_CMP_GE_U32 : VOPC_Real_vi <0xce>;
1018 defm V_CMP_T_U32 : VOPC_Real_vi <0xcf>;
1019
1020 defm V_CMPX_F_U32 : VOPC_Real_vi <0xd8>;
1021 defm V_CMPX_LT_U32 : VOPC_Real_vi <0xd9>;
1022 defm V_CMPX_EQ_U32 : VOPC_Real_vi <0xda>;
1023 defm V_CMPX_LE_U32 : VOPC_Real_vi <0xdb>;
1024 defm V_CMPX_GT_U32 : VOPC_Real_vi <0xdc>;
1025 defm V_CMPX_NE_U32 : VOPC_Real_vi <0xdd>;
1026 defm V_CMPX_GE_U32 : VOPC_Real_vi <0xde>;
1027 defm V_CMPX_T_U32 : VOPC_Real_vi <0xdf>;
1028
1029 defm V_CMP_F_U64 : VOPC_Real_vi <0xe8>;
1030 defm V_CMP_LT_U64 : VOPC_Real_vi <0xe9>;
1031 defm V_CMP_EQ_U64 : VOPC_Real_vi <0xea>;
1032 defm V_CMP_LE_U64 : VOPC_Real_vi <0xeb>;
1033 defm V_CMP_GT_U64 : VOPC_Real_vi <0xec>;
1034 defm V_CMP_NE_U64 : VOPC_Real_vi <0xed>;
1035 defm V_CMP_GE_U64 : VOPC_Real_vi <0xee>;
1036 defm V_CMP_T_U64 : VOPC_Real_vi <0xef>;
1037
1038 defm V_CMPX_F_U64 : VOPC_Real_vi <0xf8>;
1039 defm V_CMPX_LT_U64 : VOPC_Real_vi <0xf9>;
1040 defm V_CMPX_EQ_U64 : VOPC_Real_vi <0xfa>;
1041 defm V_CMPX_LE_U64 : VOPC_Real_vi <0xfb>;
1042 defm V_CMPX_GT_U64 : VOPC_Real_vi <0xfc>;
1043 defm V_CMPX_NE_U64 : VOPC_Real_vi <0xfd>;
1044 defm V_CMPX_GE_U64 : VOPC_Real_vi <0xfe>;
1045 defm V_CMPX_T_U64 : VOPC_Real_vi <0xff>;
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
2
3 ; GCN-LABEL: {{^}}fadd_f16
4 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
5 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
6 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
7 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
8 ; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
9 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
10 ; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
11 ; GCN: buffer_store_short v[[R_F16]]
12 ; GCN: s_endpgm
13 define void @fadd_f16(
14 half addrspace(1)* %r,
15 half addrspace(1)* %a,
16 half addrspace(1)* %b) {
17 entry:
18 %a.val = load half, half addrspace(1)* %a
19 %b.val = load half, half addrspace(1)* %b
20 %r.val = fadd half %a.val, %b.val
21 store half %r.val, half addrspace(1)* %r
22 ret void
23 }
24
25 ; GCN-LABEL: {{^}}fadd_f16_imm_a
26 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
27 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], 0x3c00{{$}}
28 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
29 ; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
30 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
31 ; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], 0x3c00, v[[B_F16]]
32 ; GCN: buffer_store_short v[[R_F16]]
33 ; GCN: s_endpgm
34 define void @fadd_f16_imm_a(
35 half addrspace(1)* %r,
36 half addrspace(1)* %b) {
37 entry:
38 %b.val = load half, half addrspace(1)* %b
39 %r.val = fadd half 1.0, %b.val
40 store half %r.val, half addrspace(1)* %r
41 ret void
42 }
43
44 ; GCN-LABEL: {{^}}fadd_f16_imm_b
45 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
46 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], 0x4000{{$}}
47 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
48 ; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
49 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
50 ; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], 0x4000, v[[A_F16]]
51 ; GCN: buffer_store_short v[[R_F16]]
52 ; GCN: s_endpgm
53 define void @fadd_f16_imm_b(
54 half addrspace(1)* %r,
55 half addrspace(1)* %a) {
56 entry:
57 %a.val = load half, half addrspace(1)* %a
58 %r.val = fadd half %a.val, 2.0
59 store half %r.val, half addrspace(1)* %r
60 ret void
61 }
62
63 ; GCN-LABEL: {{^}}fadd_v2f16
64 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
65 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
66 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
67 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
68 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
69 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
70 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
71 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
72 ; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
73 ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
74 ; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
75 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
76 ; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
77 ; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]]
78 ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
79 ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
80 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
81 ; GCN: buffer_store_dword v[[R_V2_F16]]
82 ; GCN: s_endpgm
83 define void @fadd_v2f16(
84 <2 x half> addrspace(1)* %r,
85 <2 x half> addrspace(1)* %a,
86 <2 x half> addrspace(1)* %b) {
87 entry:
88 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
89 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
90 %r.val = fadd <2 x half> %a.val, %b.val
91 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
92 ret void
93 }
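; A <2 x half> occupies a single dword, element 0 in bits 15:0 and element 1
; in bits 31:16. Element 1 is therefore extracted with a 16-bit right shift,
; and the two results are repacked roughly as (placeholder register names):
;   v_and_b32     r_lo, 0xffff, r0   ; keep the low element
;   v_lshlrev_b32 r_hi, 16, r1       ; move the high element into place
;   v_or_b32      r, r_hi, r_lo      ; recombine into one dword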
94
95 ; GCN-LABEL: {{^}}fadd_v2f16_imm_a
96 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
97 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], 0x3c00{{$}}
98 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], 0x4000{{$}}
99 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
100 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
101 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
102 ; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
103 ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
104 ; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
105 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
106 ; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 0x3c00, v[[B_V2_F16]]
107 ; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 0x4000, v[[B_F16_1]]
108 ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
109 ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
110 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
111 ; GCN: buffer_store_dword v[[R_V2_F16]]
112 ; GCN: s_endpgm
113 define void @fadd_v2f16_imm_a(
114 <2 x half> addrspace(1)* %r,
115 <2 x half> addrspace(1)* %b) {
116 entry:
117 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
118 %r.val = fadd <2 x half> <half 1.0, half 2.0>, %b.val
119 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
120 ret void
121 }
122
123 ; GCN-LABEL: {{^}}fadd_v2f16_imm_b
124 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
125 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], 0x4000{{$}}
126 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], 0x3c00{{$}}
127 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
128 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
129 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
130 ; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
131 ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
132 ; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
133 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
134 ; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 0x4000, v[[A_V2_F16]]
135 ; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 0x3c00, v[[A_F16_1]]
136 ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
137 ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
138 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
139 ; GCN: buffer_store_dword v[[R_V2_F16]]
140 ; GCN: s_endpgm
141 define void @fadd_v2f16_imm_b(
142 <2 x half> addrspace(1)* %r,
143 <2 x half> addrspace(1)* %a) {
144 entry:
145 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
146 %r.val = fadd <2 x half> %a.val, <half 2.0, half 1.0>
147 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
148 ret void
149 }
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
2
3 ; GCN-LABEL: {{^}}fcmp_f16_lt
4 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
5 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
6 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
7 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
8 ; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
9 ; VI: v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
10 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
11 ; GCN: buffer_store_dword v[[R_I32]]
12 ; GCN: s_endpgm
13 define void @fcmp_f16_lt(
14 i32 addrspace(1)* %r,
15 half addrspace(1)* %a,
16 half addrspace(1)* %b) {
17 entry:
18 %a.val = load half, half addrspace(1)* %a
19 %b.val = load half, half addrspace(1)* %b
20 %r.val = fcmp olt half %a.val, %b.val
21 %r.val.sext = sext i1 %r.val to i32
22 store i32 %r.val.sext, i32 addrspace(1)* %r
23 ret void
24 }
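; The compare leaves its i1 result in the vcc condition mask; the
; 'sext i1 to i32' is then materialized with v_cndmask selecting 0 or -1
; per lane, roughly (a sketch, register numbers arbitrary):
;   v_cmp_lt_f16_e32  vcc, v0, v1
;   v_cndmask_b32_e64 v2, 0, -1, vcc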
25
26 ; GCN-LABEL: {{^}}fcmp_f16_eq
27 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
28 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
29 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
30 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
31 ; SI: v_cmp_eq_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
32 ; VI: v_cmp_eq_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
33 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
34 ; GCN: buffer_store_dword v[[R_I32]]
35 ; GCN: s_endpgm
36 define void @fcmp_f16_eq(
37 i32 addrspace(1)* %r,
38 half addrspace(1)* %a,
39 half addrspace(1)* %b) {
40 entry:
41 %a.val = load half, half addrspace(1)* %a
42 %b.val = load half, half addrspace(1)* %b
43 %r.val = fcmp oeq half %a.val, %b.val
44 %r.val.sext = sext i1 %r.val to i32
45 store i32 %r.val.sext, i32 addrspace(1)* %r
46 ret void
47 }
48
49 ; GCN-LABEL: {{^}}fcmp_f16_le
50 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
51 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
52 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
53 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
54 ; SI: v_cmp_le_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
55 ; VI: v_cmp_le_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
56 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
57 ; GCN: buffer_store_dword v[[R_I32]]
58 ; GCN: s_endpgm
59 define void @fcmp_f16_le(
60 i32 addrspace(1)* %r,
61 half addrspace(1)* %a,
62 half addrspace(1)* %b) {
63 entry:
64 %a.val = load half, half addrspace(1)* %a
65 %b.val = load half, half addrspace(1)* %b
66 %r.val = fcmp ole half %a.val, %b.val
67 %r.val.sext = sext i1 %r.val to i32
68 store i32 %r.val.sext, i32 addrspace(1)* %r
69 ret void
70 }
71
72 ; GCN-LABEL: {{^}}fcmp_f16_gt
73 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
74 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
75 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
76 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
77 ; SI: v_cmp_gt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
78 ; VI: v_cmp_gt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
79 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
80 ; GCN: buffer_store_dword v[[R_I32]]
81 ; GCN: s_endpgm
82 define void @fcmp_f16_gt(
83 i32 addrspace(1)* %r,
84 half addrspace(1)* %a,
85 half addrspace(1)* %b) {
86 entry:
87 %a.val = load half, half addrspace(1)* %a
88 %b.val = load half, half addrspace(1)* %b
89 %r.val = fcmp ogt half %a.val, %b.val
90 %r.val.sext = sext i1 %r.val to i32
91 store i32 %r.val.sext, i32 addrspace(1)* %r
92 ret void
93 }
94
95 ; GCN-LABEL: {{^}}fcmp_f16_lg
96 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
97 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
98 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
99 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
100 ; SI: v_cmp_lg_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
101 ; VI: v_cmp_lg_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
102 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
103 ; GCN: buffer_store_dword v[[R_I32]]
104 ; GCN: s_endpgm
105 define void @fcmp_f16_lg(
106 i32 addrspace(1)* %r,
107 half addrspace(1)* %a,
108 half addrspace(1)* %b) {
109 entry:
110 %a.val = load half, half addrspace(1)* %a
111 %b.val = load half, half addrspace(1)* %b
112 %r.val = fcmp one half %a.val, %b.val
113 %r.val.sext = sext i1 %r.val to i32
114 store i32 %r.val.sext, i32 addrspace(1)* %r
115 ret void
116 }
117
118 ; GCN-LABEL: {{^}}fcmp_f16_ge
119 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
120 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
121 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
122 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
123 ; SI: v_cmp_ge_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
124 ; VI: v_cmp_ge_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
125 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
126 ; GCN: buffer_store_dword v[[R_I32]]
127 ; GCN: s_endpgm
128 define void @fcmp_f16_ge(
129 i32 addrspace(1)* %r,
130 half addrspace(1)* %a,
131 half addrspace(1)* %b) {
132 entry:
133 %a.val = load half, half addrspace(1)* %a
134 %b.val = load half, half addrspace(1)* %b
135 %r.val = fcmp oge half %a.val, %b.val
136 %r.val.sext = sext i1 %r.val to i32
137 store i32 %r.val.sext, i32 addrspace(1)* %r
138 ret void
139 }
140
141 ; GCN-LABEL: {{^}}fcmp_f16_o
142 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
143 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
144 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
145 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
146 ; SI: v_cmp_o_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
147 ; VI: v_cmp_o_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
148 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
149 ; GCN: buffer_store_dword v[[R_I32]]
150 ; GCN: s_endpgm
151 define void @fcmp_f16_o(
152 i32 addrspace(1)* %r,
153 half addrspace(1)* %a,
154 half addrspace(1)* %b) {
155 entry:
156 %a.val = load half, half addrspace(1)* %a
157 %b.val = load half, half addrspace(1)* %b
158 %r.val = fcmp ord half %a.val, %b.val
159 %r.val.sext = sext i1 %r.val to i32
160 store i32 %r.val.sext, i32 addrspace(1)* %r
161 ret void
162 }
163
164 ; GCN-LABEL: {{^}}fcmp_f16_u
165 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
166 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
167 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
168 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
169 ; SI: v_cmp_u_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
170 ; VI: v_cmp_u_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
171 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
172 ; GCN: buffer_store_dword v[[R_I32]]
173 ; GCN: s_endpgm
174 define void @fcmp_f16_u(
175 i32 addrspace(1)* %r,
176 half addrspace(1)* %a,
177 half addrspace(1)* %b) {
178 entry:
179 %a.val = load half, half addrspace(1)* %a
180 %b.val = load half, half addrspace(1)* %b
181 %r.val = fcmp uno half %a.val, %b.val
182 %r.val.sext = sext i1 %r.val to i32
183 store i32 %r.val.sext, i32 addrspace(1)* %r
184 ret void
185 }
186
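; The remaining tests cover the unordered predicates. Each maps onto the
; hardware's negated ordered compare, since e.g. 'ult' is exactly !(oge)
; once NaN operands are taken into account:
;   fcmp ult -> v_cmp_nge    fcmp ueq -> v_cmp_nlg    fcmp ule -> v_cmp_ngt
;   fcmp ugt -> v_cmp_nle    fcmp une -> v_cmp_neq    fcmp uge -> v_cmp_nlt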
187 ; GCN-LABEL: {{^}}fcmp_f16_nge
188 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
189 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
190 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
191 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
192 ; SI: v_cmp_nge_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
193 ; VI: v_cmp_nge_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
194 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
195 ; GCN: buffer_store_dword v[[R_I32]]
196 ; GCN: s_endpgm
197 define void @fcmp_f16_nge(
198 i32 addrspace(1)* %r,
199 half addrspace(1)* %a,
200 half addrspace(1)* %b) {
201 entry:
202 %a.val = load half, half addrspace(1)* %a
203 %b.val = load half, half addrspace(1)* %b
204 %r.val = fcmp ult half %a.val, %b.val
205 %r.val.sext = sext i1 %r.val to i32
206 store i32 %r.val.sext, i32 addrspace(1)* %r
207 ret void
208 }
209
210 ; GCN-LABEL: {{^}}fcmp_f16_nlg
211 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
212 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
213 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
214 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
215 ; SI: v_cmp_nlg_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
216 ; VI: v_cmp_nlg_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
217 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
218 ; GCN: buffer_store_dword v[[R_I32]]
219 ; GCN: s_endpgm
220 define void @fcmp_f16_nlg(
221 i32 addrspace(1)* %r,
222 half addrspace(1)* %a,
223 half addrspace(1)* %b) {
224 entry:
225 %a.val = load half, half addrspace(1)* %a
226 %b.val = load half, half addrspace(1)* %b
227 %r.val = fcmp ueq half %a.val, %b.val
228 %r.val.sext = sext i1 %r.val to i32
229 store i32 %r.val.sext, i32 addrspace(1)* %r
230 ret void
231 }
232
233 ; GCN-LABEL: {{^}}fcmp_f16_ngt
234 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
235 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
236 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
237 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
238 ; SI: v_cmp_ngt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
239 ; VI: v_cmp_ngt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
240 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
241 ; GCN: buffer_store_dword v[[R_I32]]
242 ; GCN: s_endpgm
243 define void @fcmp_f16_ngt(
244 i32 addrspace(1)* %r,
245 half addrspace(1)* %a,
246 half addrspace(1)* %b) {
247 entry:
248 %a.val = load half, half addrspace(1)* %a
249 %b.val = load half, half addrspace(1)* %b
250 %r.val = fcmp ule half %a.val, %b.val
251 %r.val.sext = sext i1 %r.val to i32
252 store i32 %r.val.sext, i32 addrspace(1)* %r
253 ret void
254 }
255
256 ; GCN-LABEL: {{^}}fcmp_f16_nle
257 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
258 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
259 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
260 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
261 ; SI: v_cmp_nle_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
262 ; VI: v_cmp_nle_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
263 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
264 ; GCN: buffer_store_dword v[[R_I32]]
265 ; GCN: s_endpgm
266 define void @fcmp_f16_nle(
267 i32 addrspace(1)* %r,
268 half addrspace(1)* %a,
269 half addrspace(1)* %b) {
270 entry:
271 %a.val = load half, half addrspace(1)* %a
272 %b.val = load half, half addrspace(1)* %b
273 %r.val = fcmp ugt half %a.val, %b.val
274 %r.val.sext = sext i1 %r.val to i32
275 store i32 %r.val.sext, i32 addrspace(1)* %r
276 ret void
277 }
278
279 ; GCN-LABEL: {{^}}fcmp_f16_neq
280 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
281 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
282 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
283 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
284 ; SI: v_cmp_neq_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
285 ; VI: v_cmp_neq_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
286 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
287 ; GCN: buffer_store_dword v[[R_I32]]
288 ; GCN: s_endpgm
289 define void @fcmp_f16_neq(
290 i32 addrspace(1)* %r,
291 half addrspace(1)* %a,
292 half addrspace(1)* %b) {
293 entry:
294 %a.val = load half, half addrspace(1)* %a
295 %b.val = load half, half addrspace(1)* %b
296 %r.val = fcmp une half %a.val, %b.val
297 %r.val.sext = sext i1 %r.val to i32
298 store i32 %r.val.sext, i32 addrspace(1)* %r
299 ret void
300 }
301
302 ; GCN-LABEL: {{^}}fcmp_f16_nlt
303 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
304 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
305 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
306 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
307 ; SI: v_cmp_nlt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
308 ; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
309 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
310 ; GCN: buffer_store_dword v[[R_I32]]
311 ; GCN: s_endpgm
312 define void @fcmp_f16_nlt(
313 i32 addrspace(1)* %r,
314 half addrspace(1)* %a,
315 half addrspace(1)* %b) {
316 entry:
317 %a.val = load half, half addrspace(1)* %a
318 %b.val = load half, half addrspace(1)* %b
319 %r.val = fcmp uge half %a.val, %b.val
320 %r.val.sext = sext i1 %r.val to i32
321 store i32 %r.val.sext, i32 addrspace(1)* %r
322 ret void
323 }
324
325 ; GCN-LABEL: {{^}}fcmp_v2f16_lt
326 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
327 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
328 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
329 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
330 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
331 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
332 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
333 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
334 ; SI: v_cmp_lt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
335 ; SI: v_cmp_lt_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
336 ; VI: v_cmp_lt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
337 ; VI: v_cmp_lt_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
338 ; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
339 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
340 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
341 ; GCN: s_endpgm
342 define void @fcmp_v2f16_lt(
343 <2 x i32> addrspace(1)* %r,
344 <2 x half> addrspace(1)* %a,
345 <2 x half> addrspace(1)* %b) {
346 entry:
347 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
348 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
349 %r.val = fcmp olt <2 x half> %a.val, %b.val
350 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
351 store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
352 ret void
353 }
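; For the vector variants each lane needs its own condition mask: the first
; compare uses the VOPC e32 encoding and implicitly writes vcc, while the
; second uses the VOP3 e64 encoding so it can target an explicit SGPR pair,
; which is why the checks above pair an _e32 with an _e64 compare.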
354
355 ; GCN-LABEL: {{^}}fcmp_v2f16_eq
356 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
357 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
358 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
359 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
360 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
361 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
362 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
363 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
364 ; SI: v_cmp_eq_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
365 ; SI: v_cmp_eq_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
366 ; VI: v_cmp_eq_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
367 ; VI: v_cmp_eq_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
368 ; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
369 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
370 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
371 ; GCN: s_endpgm
372 define void @fcmp_v2f16_eq(
373 <2 x i32> addrspace(1)* %r,
374 <2 x half> addrspace(1)* %a,
375 <2 x half> addrspace(1)* %b) {
376 entry:
377 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
378 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
379 %r.val = fcmp oeq <2 x half> %a.val, %b.val
380 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
381 store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
382 ret void
383 }
384
385 ; GCN-LABEL: {{^}}fcmp_v2f16_le
386 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
387 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
388 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
389 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
390 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
391 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
392 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
393 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
394 ; SI: v_cmp_le_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
395 ; SI: v_cmp_le_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
396 ; VI: v_cmp_le_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
397 ; VI: v_cmp_le_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
398 ; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
399 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
400 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
401 ; GCN: s_endpgm
402 define void @fcmp_v2f16_le(
403 <2 x i32> addrspace(1)* %r,
404 <2 x half> addrspace(1)* %a,
405 <2 x half> addrspace(1)* %b) {
406 entry:
407 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
408 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
409 %r.val = fcmp ole <2 x half> %a.val, %b.val
410 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
411 store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
412 ret void
413 }
414
415 ; GCN-LABEL: {{^}}fcmp_v2f16_gt
416 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
417 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
418 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
419 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
420 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
421 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
422 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
423 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
424 ; SI: v_cmp_gt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
425 ; SI: v_cmp_gt_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
426 ; VI: v_cmp_gt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
427 ; VI: v_cmp_gt_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
428 ; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
429 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
430 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
431 ; GCN: s_endpgm
432 define void @fcmp_v2f16_gt(
433 <2 x i32> addrspace(1)* %r,
434 <2 x half> addrspace(1)* %a,
435 <2 x half> addrspace(1)* %b) {
436 entry:
437 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
438 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
439 %r.val = fcmp ogt <2 x half> %a.val, %b.val
440 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
441 store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
442 ret void
443 }
444
445 ; GCN-LABEL: {{^}}fcmp_v2f16_lg
446 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
447 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
448 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
449 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
450 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
451 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
452 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
453 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
454 ; SI: v_cmp_lg_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
455 ; SI: v_cmp_lg_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
456 ; VI: v_cmp_lg_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
457 ; VI: v_cmp_lg_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
458 ; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
459 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
460 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
461 ; GCN: s_endpgm
462 define void @fcmp_v2f16_lg(
463 <2 x i32> addrspace(1)* %r,
464 <2 x half> addrspace(1)* %a,
465 <2 x half> addrspace(1)* %b) {
466 entry:
467 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
468 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
469 %r.val = fcmp one <2 x half> %a.val, %b.val
470 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
471 store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
472 ret void
473 }
474
475 ; GCN-LABEL: {{^}}fcmp_v2f16_ge
476 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
477 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
478 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
479 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
480 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
481 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
482 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
483 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
484 ; SI: v_cmp_ge_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
485 ; SI: v_cmp_ge_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
486 ; VI: v_cmp_ge_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
487 ; VI: v_cmp_ge_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
488 ; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
489 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
490 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
491 ; GCN: s_endpgm
492 define void @fcmp_v2f16_ge(
493 <2 x i32> addrspace(1)* %r,
494 <2 x half> addrspace(1)* %a,
495 <2 x half> addrspace(1)* %b) {
496 entry:
497 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
498 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
499 %r.val = fcmp oge <2 x half> %a.val, %b.val
500 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
501 store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
502 ret void
503 }
504
505 ; GCN-LABEL: {{^}}fcmp_v2f16_o
506 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
507 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
508 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
509 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
510 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
511 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
512 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
513 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
514 ; SI: v_cmp_o_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
515 ; SI: v_cmp_o_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
516 ; VI: v_cmp_o_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
517 ; VI: v_cmp_o_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
518 ; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
519 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
520 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
521 ; GCN: s_endpgm
522 define void @fcmp_v2f16_o(
523 <2 x i32> addrspace(1)* %r,
524 <2 x half> addrspace(1)* %a,
525 <2 x half> addrspace(1)* %b) {
526 entry:
527 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
528 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
529 %r.val = fcmp ord <2 x half> %a.val, %b.val
530 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
531 store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
532 ret void
533 }
534
535 ; GCN-LABEL: {{^}}fcmp_v2f16_u
536 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
537 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
538 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
539 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
540 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
541 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
542 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
543 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
544 ; SI: v_cmp_u_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
545 ; SI: v_cmp_u_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
546 ; VI: v_cmp_u_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
547 ; VI: v_cmp_u_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
548 ; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
549 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
550 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
551 ; GCN: s_endpgm
552 define void @fcmp_v2f16_u(
553 <2 x i32> addrspace(1)* %r,
554 <2 x half> addrspace(1)* %a,
555 <2 x half> addrspace(1)* %b) {
556 entry:
557 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
558 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
559 %r.val = fcmp uno <2 x half> %a.val, %b.val
560 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
561 store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
562 ret void
563 }
564
565 ; GCN-LABEL: {{^}}fcmp_v2f16_nge
566 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
567 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
568 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
569 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
570 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
571 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
572 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
573 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
574 ; SI: v_cmp_nge_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
575 ; SI: v_cmp_nge_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
576 ; VI: v_cmp_nge_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
577 ; VI: v_cmp_nge_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
578 ; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
579 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
580 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
581 ; GCN: s_endpgm
582 define void @fcmp_v2f16_nge(
583 <2 x i32> addrspace(1)* %r,
584 <2 x half> addrspace(1)* %a,
585 <2 x half> addrspace(1)* %b) {
586 entry:
587 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
588 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
589 %r.val = fcmp ult <2 x half> %a.val, %b.val
590 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
591 store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
592 ret void
593 }
594
595 ; GCN-LABEL: {{^}}fcmp_v2f16_nlg
596 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
597 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
598 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
599 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
600 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
601 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
602 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
603 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
604 ; SI: v_cmp_nlg_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
605 ; SI: v_cmp_nlg_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
606 ; VI: v_cmp_nlg_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
607 ; VI: v_cmp_nlg_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
608 ; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
609 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
610 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
611 ; GCN: s_endpgm
612 define void @fcmp_v2f16_nlg(
613 <2 x i32> addrspace(1)* %r,
614 <2 x half> addrspace(1)* %a,
615 <2 x half> addrspace(1)* %b) {
616 entry:
617 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
618 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
619 %r.val = fcmp ueq <2 x half> %a.val, %b.val
620 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
621 store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
622 ret void
623 }
624
625 ; GCN-LABEL: {{^}}fcmp_v2f16_ngt
626 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
627 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
628 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
629 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
630 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
631 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
632 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
633 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
634 ; SI: v_cmp_ngt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
635 ; SI: v_cmp_ngt_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
636 ; VI: v_cmp_ngt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
637 ; VI: v_cmp_ngt_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
638 ; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
639 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
640 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
641 ; GCN: s_endpgm
642 define void @fcmp_v2f16_ngt(
643 <2 x i32> addrspace(1)* %r,
644 <2 x half> addrspace(1)* %a,
645 <2 x half> addrspace(1)* %b) {
646 entry:
647 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
648 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
649 %r.val = fcmp ule <2 x half> %a.val, %b.val
650 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
651 store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
652 ret void
653 }
654
655 ; GCN-LABEL: {{^}}fcmp_v2f16_nle
656 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
657 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
658 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
659 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
660 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
661 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
662 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
663 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
664 ; SI: v_cmp_nle_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
665 ; SI: v_cmp_nle_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
666 ; VI: v_cmp_nle_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
667 ; VI: v_cmp_nle_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
668 ; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
669 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
670 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
671 ; GCN: s_endpgm
672 define void @fcmp_v2f16_nle(
673 <2 x i32> addrspace(1)* %r,
674 <2 x half> addrspace(1)* %a,
675 <2 x half> addrspace(1)* %b) {
676 entry:
677 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
678 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
679 %r.val = fcmp ugt <2 x half> %a.val, %b.val
680 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
681 store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
682 ret void
683 }
684
685 ; GCN-LABEL: {{^}}fcmp_v2f16_neq
686 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
687 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
688 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
689 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
690 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
691 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
692 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
693 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
694 ; SI: v_cmp_neq_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
695 ; SI: v_cmp_neq_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
696 ; VI: v_cmp_neq_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
697 ; VI: v_cmp_neq_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
698 ; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
699 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
700 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
701 ; GCN: s_endpgm
702 define void @fcmp_v2f16_neq(
703 <2 x i32> addrspace(1)* %r,
704 <2 x half> addrspace(1)* %a,
705 <2 x half> addrspace(1)* %b) {
706 entry:
707 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
708 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
709 %r.val = fcmp une <2 x half> %a.val, %b.val
710 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
711 store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
712 ret void
713 }
714
715 ; GCN-LABEL: {{^}}fcmp_v2f16_nlt
716 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
717 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
718 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
719 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
720 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
721 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
722 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
723 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
724 ; SI: v_cmp_nlt_f32_e32 vcc, v[[A_F32_0]], v[[B_F32_0]]
725 ; SI: v_cmp_nlt_f32_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F32_1]], v[[B_F32_1]]
726 ; VI: v_cmp_nlt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
727 ; VI: v_cmp_nlt_f16_e64 s[{{[0-9]+}}:{{[0-9]+}}], v[[A_F16_1]], v[[B_F16_1]]
728 ; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
729 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
730 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
731 ; GCN: s_endpgm
732 define void @fcmp_v2f16_nlt(
733 <2 x i32> addrspace(1)* %r,
734 <2 x half> addrspace(1)* %a,
735 <2 x half> addrspace(1)* %b) {
736 entry:
737 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
738 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
739 %r.val = fcmp uge <2 x half> %a.val, %b.val
740 %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
741 store <2 x i32> %r.val.sext, <2 x i32> addrspace(1)* %r
742 ret void
743 }
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
2
3 ; Make sure fdiv is promoted to f32.
4
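; In IR terms the promotion is roughly (hypothetical value names):
;   %a.f32 = fpext half %a.val to float
;   %b.f32 = fpext half %b.val to float
;   %r.f32 = fdiv float %a.f32, %b.f32   ; expanded via div_scale/rcp/fma below
;   %r.val = fptrunc float %r.f32 to half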
5 ; GCN-LABEL: {{^}}fdiv_f16
6 ; GCN: v_cvt_f32_f16
7 ; GCN: v_cvt_f32_f16
8 ; GCN: v_div_scale_f32
9 ; GCN-DAG: v_div_scale_f32
10 ; GCN-DAG: v_rcp_f32
11 ; GCN: v_fma_f32
12 ; GCN: v_fma_f32
13 ; GCN: v_mul_f32
14 ; GCN: v_fma_f32
15 ; GCN: v_fma_f32
16 ; GCN: v_fma_f32
17 ; GCN: v_div_fmas_f32
18 ; GCN: v_div_fixup_f32
19 ; GCN: v_cvt_f16_f32
20 define void @fdiv_f16(
21 half addrspace(1)* %r,
22 half addrspace(1)* %a,
23 half addrspace(1)* %b) {
24 entry:
25 %a.val = load half, half addrspace(1)* %a
26 %b.val = load half, half addrspace(1)* %b
27 %r.val = fdiv half %a.val, %b.val
28 store half %r.val, half addrspace(1)* %r
29 ret void
30 }
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
2
3 ; GCN-LABEL: {{^}}fmul_f16
4 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
5 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
6 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
7 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
8 ; SI: v_mul_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
9 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
10 ; VI: v_mul_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
11 ; GCN: buffer_store_short v[[R_F16]]
12 ; GCN: s_endpgm
13 define void @fmul_f16(
14 half addrspace(1)* %r,
15 half addrspace(1)* %a,
16 half addrspace(1)* %b) {
17 entry:
18 %a.val = load half, half addrspace(1)* %a
19 %b.val = load half, half addrspace(1)* %b
20 %r.val = fmul half %a.val, %b.val
21 store half %r.val, half addrspace(1)* %r
22 ret void
23 }
24
25 ; GCN-LABEL: {{^}}fmul_f16_imm_a
26 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
27 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], 0x4200{{$}}
28 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
29 ; SI: v_mul_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
30 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
31 ; VI: v_mul_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]]
32 ; GCN: buffer_store_short v[[R_F16]]
33 ; GCN: s_endpgm
34 define void @fmul_f16_imm_a(
35 half addrspace(1)* %r,
36 half addrspace(1)* %b) {
37 entry:
38 %b.val = load half, half addrspace(1)* %b
39 %r.val = fmul half 3.0, %b.val
40 store half %r.val, half addrspace(1)* %r
41 ret void
42 }
43
44 ; GCN-LABEL: {{^}}fmul_f16_imm_b
45 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
46 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], 0x4400{{$}}
47 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
48 ; SI: v_mul_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
49 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
50 ; VI: v_mul_f16_e32 v[[R_F16:[0-9]+]], 0x4400, v[[A_F16]]
51 ; GCN: buffer_store_short v[[R_F16]]
52 ; GCN: s_endpgm
53 define void @fmul_f16_imm_b(
54 half addrspace(1)* %r,
55 half addrspace(1)* %a) {
56 entry:
57 %a.val = load half, half addrspace(1)* %a
58 %r.val = fmul half %a.val, 4.0
59 store half %r.val, half addrspace(1)* %r
60 ret void
61 }
62
63 ; GCN-LABEL: {{^}}fmul_v2f16
64 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
65 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
66 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
67 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
68 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
69 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
70 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
71 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
72 ; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
73 ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
74 ; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
75 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
76 ; VI: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
77 ; VI: v_mul_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]]
78 ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
79 ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
80 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
81 ; GCN: buffer_store_dword v[[R_V2_F16]]
82 ; GCN: s_endpgm
83 define void @fmul_v2f16(
84 <2 x half> addrspace(1)* %r,
85 <2 x half> addrspace(1)* %a,
86 <2 x half> addrspace(1)* %b) {
87 entry:
88 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
89 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
90 %r.val = fmul <2 x half> %a.val, %b.val
91 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
92 ret void
93 }
94
95 ; GCN-LABEL: {{^}}fmul_v2f16_imm_a
96 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
97 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], 0x4200{{$}}
98 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], 0x4400{{$}}
99 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
100 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
101 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
102 ; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
103 ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
104 ; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
105 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
106 ; VI: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
107 ; VI: v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 0x4400, v[[B_F16_1]]
108 ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
109 ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
110 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
111 ; GCN: buffer_store_dword v[[R_V2_F16]]
112 ; GCN: s_endpgm
113 define void @fmul_v2f16_imm_a(
114 <2 x half> addrspace(1)* %r,
115 <2 x half> addrspace(1)* %b) {
116 entry:
117 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
118 %r.val = fmul <2 x half> <half 3.0, half 4.0>, %b.val
119 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
120 ret void
121 }
122
123 ; GCN-LABEL: {{^}}fmul_v2f16_imm_b
124 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
125 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], 0x4400{{$}}
126 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], 0x4200{{$}}
127 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
128 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
129 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
130 ; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
131 ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
132 ; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
133 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
134 ; VI: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4400, v[[A_V2_F16]]
135 ; VI: v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]]
136 ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
137 ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
138 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
139 ; GCN: buffer_store_dword v[[R_V2_F16]]
140 ; GCN: s_endpgm
141 define void @fmul_v2f16_imm_b(
142 <2 x half> addrspace(1)* %r,
143 <2 x half> addrspace(1)* %a) {
144 entry:
145 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
146 %r.val = fmul <2 x half> %a.val, <half 4.0, half 3.0>
147 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
148 ret void
149 }
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s --check-prefix=SI --check-prefix=FUNC
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s --check-prefix=SI --check-prefix=FUNC
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s --check-prefix=SI --check-prefix=FUNC --check-prefix=GCN
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s --check-prefix=VI --check-prefix=FUNC --check-prefix=GCN
22 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=EG --check-prefix=FUNC
33
44 declare float @llvm.fabs.f32(float) #1
248248 }
249249
250250 ; FUNC-LABEL: {{^}}fp_to_sint_f32_i16:
251 ; SI: v_cvt_i32_f32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}}
252 ; SI: buffer_store_short [[VAL]]
251 ; SI: v_cvt_i32_f32_e32 v[[VAL:[0-9]+]], s{{[0-9]+}}
252 ; VI: v_cvt_f16_f32_e32 v[[IN_F16:[0-9]+]], s{{[0-9]+}}
253 ; VI: v_cvt_i16_f16_e32 v[[VAL:[0-9]+]], v[[IN_F16]]
254 ; SI: buffer_store_short v[[VAL]]
253255 define void @fp_to_sint_f32_i16(i16 addrspace(1)* %out, float %in) #0 {
254256 %sint = fptosi float %in to i16
255257 store i16 %sint, i16 addrspace(1)* %out
239239 }
240240
241241 ; FUNC-LABEL: {{^}}fp_to_uint_f32_to_i16:
242 ; The reason different instructions are used on SI and VI is because for
243 ; SI fp_to_uint is legalized by the type legalizer and for VI it is
244 ; legalized by the dag legalizer and they legalize fp_to_uint differently.
245 ; SI: v_cvt_u32_f32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}}
246 ; VI: v_cvt_i32_f32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}}
247 ; GCN: buffer_store_short [[VAL]]
242 ; SI: v_cvt_u32_f32_e32 v[[VAL:[0-9]+]], s{{[0-9]+}}
243 ; VI: v_cvt_f16_f32_e32 v[[IN_F16:[0-9]+]], s{{[0-9]+}}
244 ; VI: v_cvt_u16_f16_e32 v[[VAL:[0-9]+]], v[[IN_F16]]
245 ; GCN: buffer_store_short v[[VAL]]
248246 define void @fp_to_uint_f32_to_i16(i16 addrspace(1)* %out, float %in) #0 {
249247 %uint = fptoui float %in to i16
250248 store i16 %uint, i16 addrspace(1)* %out
0 ; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
2
3 ; GCN-LABEL: {{^}}fpext_f16_to_f32
4 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
5 ; GCN: v_cvt_f32_f16_e32 v[[R_F32:[0-9]+]], v[[A_F16]]
6 ; GCN: buffer_store_dword v[[R_F32]]
7 ; GCN: s_endpgm
8 define void @fpext_f16_to_f32(
9 float addrspace(1)* %r,
10 half addrspace(1)* %a) {
11 entry:
12 %a.val = load half, half addrspace(1)* %a
13 %r.val = fpext half %a.val to float
14 store float %r.val, float addrspace(1)* %r
15 ret void
16 }
17
18 ; GCN-LABEL: {{^}}fpext_f16_to_f64
19 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
20 ; GCN: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
21 ; GCN: v_cvt_f64_f32_e32 v{{\[}}[[R_F64_0:[0-9]+]]:[[R_F64_1:[0-9]+]]{{\]}}, v[[A_F32]]
22 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_F64_0]]:[[R_F64_1]]{{\]}}
23 ; GCN: s_endpgm
24 define void @fpext_f16_to_f64(
25 double addrspace(1)* %r,
26 half addrspace(1)* %a) {
27 entry:
28 %a.val = load half, half addrspace(1)* %a
29 %r.val = fpext half %a.val to double
30 store double %r.val, double addrspace(1)* %r
31 ret void
32 }
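; There is no single-step f16 -> f64 conversion, so the extend is split in
; two through f32 (v_cvt_f32_f16 followed by v_cvt_f64_f32) on both
; subtargets, as the checks above require.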
33
34 ; GCN-LABEL: {{^}}fpext_v2f16_to_v2f32
35 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
36 ; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
37 ; GCN: v_cvt_f32_f16_e32 v[[R_F32_0:[0-9]+]], v[[A_V2_F16]]
38 ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
39 ; GCN: v_cvt_f32_f16_e32 v[[R_F32_1:[0-9]+]], v[[A_F16_1]]
40 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_F32_0]]:[[R_F32_1]]{{\]}}
41 ; GCN: s_endpgm
42 define void @fpext_v2f16_to_v2f32(
43 <2 x float> addrspace(1)* %r,
44 <2 x half> addrspace(1)* %a) {
45 entry:
46 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
47 %r.val = fpext <2 x half> %a.val to <2 x float>
48 store <2 x float> %r.val, <2 x float> addrspace(1)* %r
49 ret void
50 }
51
52 ; GCN-LABEL: {{^}}fpext_v2f16_to_v2f64
53 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
54 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
55 ; GCN: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
56 ; GCN: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
57 ; GCN: v_cvt_f64_f32_e32 v{{\[}}{{[0-9]+}}:[[R_F64_3:[0-9]+]]{{\]}}, v[[A_F32_1]]
58 ; GCN: v_cvt_f64_f32_e32 v{{\[}}[[R_F64_0:[0-9]+]]:{{[0-9]+}}{{\]}}, v[[A_F32_0]]
59 ; GCN: buffer_store_dwordx4 v{{\[}}[[R_F64_0]]:[[R_F64_3]]{{\]}}
60 ; GCN: s_endpgm
61 define void @fpext_v2f16_to_v2f64(
62 <2 x double> addrspace(1)* %r,
63 <2 x half> addrspace(1)* %a) {
64 entry:
65 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
66 %r.val = fpext <2 x half> %a.val to <2 x double>
67 store <2 x double> %r.val, <2 x double> addrspace(1)* %r
68 ret void
69 }
0 ; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
2
3 ; GCN-LABEL: {{^}}fptosi_f16_to_i16
4 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
5 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
6 ; SI: v_cvt_i32_f32_e32 v[[R_I16:[0-9]+]], v[[A_F32]]
7 ; VI: v_cvt_i16_f16_e32 v[[R_I16:[0-9]+]], v[[A_F16]]
8 ; GCN: buffer_store_short v[[R_I16]]
9 ; GCN: s_endpgm
10 define void @fptosi_f16_to_i16(
11 i16 addrspace(1)* %r,
12 half addrspace(1)* %a) {
13 entry:
14 %a.val = load half, half addrspace(1)* %a
15 %r.val = fptosi half %a.val to i16
16 store i16 %r.val, i16 addrspace(1)* %r
17 ret void
18 }
19
20 ; GCN-LABEL: {{^}}fptosi_f16_to_i32
21 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
22 ; GCN: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
23 ; GCN: v_cvt_i32_f32_e32 v[[R_I32:[0-9]+]], v[[A_F32]]
24 ; GCN: buffer_store_dword v[[R_I32]]
25 ; GCN: s_endpgm
26 define void @fptosi_f16_to_i32(
27 i32 addrspace(1)* %r,
28 half addrspace(1)* %a) {
29 entry:
30 %a.val = load half, half addrspace(1)* %a
31 %r.val = fptosi half %a.val to i32
32 store i32 %r.val, i32 addrspace(1)* %r
33 ret void
34 }
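; Note that even on VI the f16 -> i32 conversion stays on the f32 path
; (cvt_f32_f16 then cvt_i32_f32), which is why the checks above use the
; shared GCN prefix rather than separate SI/VI patterns.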
35
36 ; Need to make sure we promote f16 to f32 when converting f16 to i64. Existing
37 ; test checks code generated for 'i64 = fp_to_sint f32'.
38
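; In IR terms the promoted conversion is roughly (hypothetical names):
;   %x.f32 = fpext half %x to float
;   %r.i64 = fptosi float %x.f32 to i64
; after which the existing f32 -> i64 lowering applies.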
39 ; GCN-LABEL: {{^}}fptosi_f16_to_i64
40 ; GCN: buffer_load_ushort
41 ; GCN: v_cvt_f32_f16_e32
42 ; GCN: s_endpgm
43 define void @fptosi_f16_to_i64(
44 i64 addrspace(1)* %r,
45 half addrspace(1)* %a) {
46 entry:
47 %a.val = load half, half addrspace(1)* %a
48 %r.val = fptosi half %a.val to i64
49 store i64 %r.val, i64 addrspace(1)* %r
50 ret void
51 }
52
53 ; GCN-LABEL: {{^}}fptosi_v2f16_to_v2i16
54 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
55 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
56 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
57 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
58 ; SI: v_cvt_i32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]]
59 ; SI: v_cvt_i32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]]
60 ; VI: v_cvt_i16_f16_e32 v[[R_I16_0:[0-9]+]], v[[A_V2_F16]]
61 ; VI: v_cvt_i16_f16_e32 v[[R_I16_1:[0-9]+]], v[[A_F16_1]]
62 ; GCN: v_and_b32_e32 v[[R_I16_LO:[0-9]+]], 0xffff, v[[R_I16_0]]
63 ; GCN: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]]
64 ; GCN: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_LO]]
65 ; GCN: buffer_store_dword v[[R_V2_I16]]
66 ; GCN: s_endpgm
67 define void @fptosi_v2f16_to_v2i16(
68 <2 x i16> addrspace(1)* %r,
69 <2 x half> addrspace(1)* %a) {
70 entry:
71 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
72 %r.val = fptosi <2 x half> %a.val to <2 x i16>
73 store <2 x i16> %r.val, <2 x i16> addrspace(1)* %r
74 ret void
75 }
76
77 ; GCN-LABEL: {{^}}fptosi_v2f16_to_v2i32
78 ; GCN: buffer_load_dword
79 ; GCN: v_cvt_f32_f16_e32
80 ; GCN: v_cvt_f32_f16_e32
81 ; GCN: v_cvt_i32_f32_e32
82 ; GCN: v_cvt_i32_f32_e32
83 ; GCN: buffer_store_dwordx2
84 ; GCN: s_endpgm
85 define void @fptosi_v2f16_to_v2i32(
86 <2 x i32> addrspace(1)* %r,
87 <2 x half> addrspace(1)* %a) {
88 entry:
89 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
90 %r.val = fptosi <2 x half> %a.val to <2 x i32>
91 store <2 x i32> %r.val, <2 x i32> addrspace(1)* %r
92 ret void
93 }
94
95 ; Need to make sure we promote f16 to f32 when converting f16 to i64. Existing
96 ; test checks code generated for 'i64 = fp_to_sint f32'.
97
98 ; GCN-LABEL: {{^}}fptosi_v2f16_to_v2i64
99 ; GCN: buffer_load_dword
100 ; GCN: v_cvt_f32_f16_e32
101 ; GCN: v_cvt_f32_f16_e32
102 ; GCN: s_endpgm
103 define void @fptosi_v2f16_to_v2i64(
104 <2 x i64> addrspace(1)* %r,
105 <2 x half> addrspace(1)* %a) {
106 entry:
107 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
108 %r.val = fptosi <2 x half> %a.val to <2 x i64>
109 store <2 x i64> %r.val, <2 x i64> addrspace(1)* %r
110 ret void
111 }
0 ; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
2
3 ; GCN-LABEL: {{^}}fptoui_f16_to_i16
4 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
5 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
6 ; SI: v_cvt_u32_f32_e32 v[[R_I16:[0-9]+]], v[[A_F32]]
7 ; VI: v_cvt_u16_f16_e32 v[[R_I16:[0-9]+]], v[[A_F16]]
8 ; GCN: buffer_store_short v[[R_I16]]
9 ; GCN: s_endpgm
10 define void @fptoui_f16_to_i16(
11 i16 addrspace(1)* %r,
12 half addrspace(1)* %a) {
13 entry:
14 %a.val = load half, half addrspace(1)* %a
15 %r.val = fptoui half %a.val to i16
16 store i16 %r.val, i16 addrspace(1)* %r
17 ret void
18 }
19
20 ; GCN-LABEL: {{^}}fptoui_f16_to_i32
21 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
22 ; GCN: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
23 ; GCN: v_cvt_u32_f32_e32 v[[R_I32:[0-9]+]], v[[A_F32]]
24 ; GCN: buffer_store_dword v[[R_I32]]
25 ; GCN: s_endpgm
26 define void @fptoui_f16_to_i32(
27 i32 addrspace(1)* %r,
28 half addrspace(1)* %a) {
29 entry:
30 %a.val = load half, half addrspace(1)* %a
31 %r.val = fptoui half %a.val to i32
32 store i32 %r.val, i32 addrspace(1)* %r
33 ret void
34 }
35
36 ; Need to make sure we promote f16 to f32 when converting f16 to i64. Existing
37 ; test checks code generated for 'i64 = fp_to_uint f32'.
38
39 ; GCN-LABEL: {{^}}fptoui_f16_to_i64
40 ; GCN: buffer_load_ushort
41 ; GCN: v_cvt_f32_f16_e32
42 ; GCN: s_endpgm
43 define void @fptoui_f16_to_i64(
44 i64 addrspace(1)* %r,
45 half addrspace(1)* %a) {
46 entry:
47 %a.val = load half, half addrspace(1)* %a
48 %r.val = fptoui half %a.val to i64
49 store i64 %r.val, i64 addrspace(1)* %r
50 ret void
51 }
52
53 ; GCN-LABEL: {{^}}fptoui_v2f16_to_v2i16
54 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
55 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
56 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
57 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
58 ; SI: v_cvt_u32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]]
59 ; SI: v_cvt_u32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]]
60 ; VI: v_cvt_u16_f16_e32 v[[R_I16_0:[0-9]+]], v[[A_V2_F16]]
61 ; VI: v_cvt_u16_f16_e32 v[[R_I16_1:[0-9]+]], v[[A_F16_1]]
62 ; VI: v_and_b32_e32 v[[R_I16_LO:[0-9]+]], 0xffff, v[[R_I16_0]]
63 ; GCN: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]]
64 ; SI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_0]]
65 ; VI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_LO]]
66 ; GCN: buffer_store_dword v[[R_V2_I16]]
67 ; GCN: s_endpgm
68 define void @fptoui_v2f16_to_v2i16(
69 <2 x i16> addrspace(1)* %r,
70 <2 x half> addrspace(1)* %a) {
71 entry:
72 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
73 %r.val = fptoui <2 x half> %a.val to <2 x i16>
74 store <2 x i16> %r.val, <2 x i16> addrspace(1)* %r
75 ret void
76 }
77
78 ; GCN-LABEL: {{^}}fptoui_v2f16_to_v2i32
79 ; GCN: buffer_load_dword
80 ; GCN: v_cvt_f32_f16_e32
81 ; GCN: v_cvt_f32_f16_e32
82 ; GCN: v_cvt_u32_f32_e32
83 ; GCN: v_cvt_u32_f32_e32
84 ; GCN: buffer_store_dwordx2
85 ; GCN: s_endpgm
86 define void @fptoui_v2f16_to_v2i32(
87 <2 x i32> addrspace(1)* %r,
88 <2 x half> addrspace(1)* %a) {
89 entry:
90 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
91 %r.val = fptoui <2 x half> %a.val to <2 x i32>
92 store <2 x i32> %r.val, <2 x i32> addrspace(1)* %r
93 ret void
94 }
95
96 ; Need to make sure we promote f16 to f32 when converting f16 to i64. Existing
97 ; test checks code generated for 'i64 = fp_to_uint f32'.
98
99 ; GCN-LABEL: {{^}}fptoui_v2f16_to_v2i64
100 ; GCN: buffer_load_dword
101 ; GCN: v_cvt_f32_f16_e32
102 ; GCN: v_cvt_f32_f16_e32
103 ; GCN: s_endpgm
104 define void @fptoui_v2f16_to_v2i64(
105 <2 x i64> addrspace(1)* %r,
106 <2 x half> addrspace(1)* %a) {
107 entry:
108 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
109 %r.val = fptoui <2 x half> %a.val to <2 x i64>
110 store <2 x i64> %r.val, <2 x i64> addrspace(1)* %r
111 ret void
112 }
0 ; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
2
3 ; GCN-LABEL: {{^}}fptrunc_f32_to_f16
4 ; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
5 ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
6 ; GCN: buffer_store_short v[[R_F16]]
7 ; GCN: s_endpgm
8 define void @fptrunc_f32_to_f16(
9 half addrspace(1)* %r,
10 float addrspace(1)* %a) {
11 entry:
12 %a.val = load float, float addrspace(1)* %a
13 %r.val = fptrunc float %a.val to half
14 store half %r.val, half addrspace(1)* %r
15 ret void
16 }
17
18 ; GCN-LABEL: {{^}}fptrunc_f64_to_f16
19 ; GCN: buffer_load_dwordx2 v{{\[}}[[A_F64_0:[0-9]+]]:[[A_F64_1:[0-9]+]]{{\]}}
20 ; GCN: v_cvt_f32_f64_e32 v[[A_F32:[0-9]+]], v{{\[}}[[A_F64_0]]:[[A_F64_1]]{{\]}}
21 ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
22 ; GCN: buffer_store_short v[[R_F16]]
23 ; GCN: s_endpgm
24 define void @fptrunc_f64_to_f16(
25 half addrspace(1)* %r,
26 double addrspace(1)* %a) {
27 entry:
28 %a.val = load double, double addrspace(1)* %a
29 %r.val = fptrunc double %a.val to half
30 store half %r.val, half addrspace(1)* %r
31 ret void
32 }
33
34 ; GCN-LABEL: {{^}}fptrunc_v2f32_to_v2f16
35 ; GCN: buffer_load_dwordx2 v{{\[}}[[A_F32_0:[0-9]+]]:[[A_F32_1:[0-9]+]]{{\]}}
36 ; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]]
37 ; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
38 ; GCN-DAG: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
39 ; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
40 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
41 ; GCN: buffer_store_dword v[[R_V2_F16]]
42 ; GCN: s_endpgm
43 define void @fptrunc_v2f32_to_v2f16(
44 <2 x half> addrspace(1)* %r,
45 <2 x float> addrspace(1)* %a) {
46 entry:
47 %a.val = load <2 x float>, <2 x float> addrspace(1)* %a
48 %r.val = fptrunc <2 x float> %a.val to <2 x half>
49 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
50 ret void
51 }
52
53 ; GCN-LABEL: {{^}}fptrunc_v2f64_to_v2f16
54 ; GCN: buffer_load_dwordx4 v{{\[}}[[A_F64_0:[0-9]+]]:[[A_F64_3:[0-9]+]]{{\]}}
55 ; GCN: v_cvt_f32_f64_e32 v[[A_F32_0:[0-9]+]], v{{\[}}[[A_F64_0]]:{{[0-9]+}}{{\]}}
56 ; GCN: v_cvt_f32_f64_e32 v[[A_F32_1:[0-9]+]], v{{\[}}{{[0-9]+}}:[[A_F64_3]]{{\]}}
57 ; GCN: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]]
58 ; GCN: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
59 ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
60 ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
61 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
62 ; GCN: buffer_store_dword v[[R_V2_F16]]
63 define void @fptrunc_v2f64_to_v2f16(
64 <2 x half> addrspace(1)* %r,
65 <2 x double> addrspace(1)* %a) {
66 entry:
67 %a.val = load <2 x double>, <2 x double> addrspace(1)* %a
68 %r.val = fptrunc <2 x double> %a.val to <2 x half>
69 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
70 ret void
71 }
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
2
3 ; GCN-LABEL: {{^}}fsub_f16
4 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
5 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
6 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
7 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
8 ; SI: v_subrev_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
9 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
10 ; VI: v_subrev_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
11 ; GCN: buffer_store_short v[[R_F16]]
12 ; GCN: s_endpgm
13 define void @fsub_f16(
14 half addrspace(1)* %r,
15 half addrspace(1)* %a,
16 half addrspace(1)* %b) {
17 entry:
18 %a.val = load half, half addrspace(1)* %a
19 %b.val = load half, half addrspace(1)* %b
20 %r.val = fsub half %a.val, %b.val
21 store half %r.val, half addrspace(1)* %r
22 ret void
23 }
24
25 ; GCN-LABEL: {{^}}fsub_f16_imm_a
26 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
27 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], 0x3c00{{$}}
28 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
29 ; SI: v_subrev_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
30 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
31 ; VI: v_sub_f16_e32 v[[R_F16:[0-9]+]], 0x3c00, v[[B_F16]]
32 ; GCN: buffer_store_short v[[R_F16]]
33 ; GCN: s_endpgm
34 define void @fsub_f16_imm_a(
35 half addrspace(1)* %r,
36 half addrspace(1)* %b) {
37 entry:
38 %b.val = load half, half addrspace(1)* %b
39 %r.val = fsub half 1.0, %b.val
40 store half %r.val, half addrspace(1)* %r
41 ret void
42 }
43
44 ; GCN-LABEL: {{^}}fsub_f16_imm_b
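; (Subtracting the constant 2.0 is canonicalized to an fadd of -2.0
; (0xc000), hence the add checks here and in fsub_v2f16_imm_b below.)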
45 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
46 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], 0xc000{{$}}
47 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
48 ; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
49 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
50 ; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], 0xc000, v[[A_F16]]
51 ; GCN: buffer_store_short v[[R_F16]]
52 ; GCN: s_endpgm
53 define void @fsub_f16_imm_b(
54 half addrspace(1)* %r,
55 half addrspace(1)* %a) {
56 entry:
57 %a.val = load half, half addrspace(1)* %a
58 %r.val = fsub half %a.val, 2.0
59 store half %r.val, half addrspace(1)* %r
60 ret void
61 }
62
63 ; GCN-LABEL: {{^}}fsub_v2f16
64 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
65 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
66 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
67 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
68 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
69 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
70 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
71 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
72 ; SI: v_subrev_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
73 ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
74 ; SI: v_subrev_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
75 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
76 ; VI: v_subrev_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
77 ; VI: v_subrev_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]]
78 ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
79 ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
80 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
81 ; GCN: buffer_store_dword v[[R_V2_F16]]
82 ; GCN: s_endpgm
83 define void @fsub_v2f16(
84 <2 x half> addrspace(1)* %r,
85 <2 x half> addrspace(1)* %a,
86 <2 x half> addrspace(1)* %b) {
87 entry:
88 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
89 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
90 %r.val = fsub <2 x half> %a.val, %b.val
91 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
92 ret void
93 }
94
95 ; GCN-LABEL: {{^}}fsub_v2f16_imm_a
96 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
97 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], 0x3c00{{$}}
98 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], 0x4000{{$}}
99 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
100 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
101 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
102 ; SI: v_subrev_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
103 ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
104 ; SI: v_subrev_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
105 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
106 ; VI: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], 0x3c00, v[[B_V2_F16]]
107 ; VI: v_sub_f16_e32 v[[R_F16_1:[0-9]+]], 0x4000, v[[B_F16_1]]
108 ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
109 ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
110 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
111 ; GCN: buffer_store_dword v[[R_V2_F16]]
112 ; GCN: s_endpgm
113 define void @fsub_v2f16_imm_a(
114 <2 x half> addrspace(1)* %r,
115 <2 x half> addrspace(1)* %b) {
116 entry:
117 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
118 %r.val = fsub <2 x half> <half 1.0, half 2.0>, %b.val
119 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
120 ret void
121 }
122
123 ; GCN-LABEL: {{^}}fsub_v2f16_imm_b
124 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
125 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], 0x4000{{$}}
126 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], 0x3c00{{$}}
127 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
128 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
129 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
130 ; SI: v_subrev_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
131 ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
132 ; SI: v_subrev_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
133 ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
134 ; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 0xc000, v[[A_V2_F16]]
135 ; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 0xbc00, v[[A_F16_1]]
136 ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
137 ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
138 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
139 ; GCN: buffer_store_dword v[[R_V2_F16]]
140 ; GCN: s_endpgm
141 define void @fsub_v2f16_imm_b(
142 <2 x half> addrspace(1)* %r,
143 <2 x half> addrspace(1)* %a) {
144 entry:
145 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
146 %r.val = fsub <2 x half> %a.val, <half 2.0, half 1.0>
147 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
148 ret void
149 }
00 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
11 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
3 ; half args should be promoted to float
3 ; half args should be promoted to float for SI and lower.
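; (On SI the promoted f32 value is converted back with v_cvt_f16_f32;
; VI has native 16-bit instructions and keeps the argument as f16.)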
44
55 ; GCN-LABEL: {{^}}load_f16_arg:
66 ; GCN: s_load_dword [[ARG:s[0-9]+]]
7 ; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[ARG]]
7 ; SI: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[ARG]]
8 ; VI: v_trunc_f16_e32 [[CVT:v[0-9]+]], [[ARG]]
89 ; GCN: buffer_store_short [[CVT]]
910 define void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 {
1011 store half %arg, half addrspace(1)* %out
130131
131132 ; GCN-LABEL: {{^}}extload_f16_to_f64_arg:
132133 ; SI: s_load_dword [[ARG:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb{{$}}
134 ; SI: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[ARG]]
133135 ; VI: s_load_dword [[ARG:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c{{$}}
134 ; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[ARG]]
136 ; VI: v_trunc_f16_e32 v[[VARG:[0-9]+]], [[ARG]]
137 ; VI: v_cvt_f32_f16_e32 v[[VARG_F32:[0-9]+]], v[[VARG]]
138 ; VI: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], v[[VARG_F32]]
135139 ; GCN: buffer_store_dwordx2 [[RESULT]]
136140 define void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 {
137141 %ext = fpext half %arg to double
278282
279283 ; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32:
280284 ; GCN: buffer_load_dword [[LOAD:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
285 ; VI: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
281286 ; GCN: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
282 ; GCN: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
287 ; SI: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
283288 ; GCN: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]]
284289 ; GCN: buffer_store_dwordx2 v{{\[}}[[CVT0]]:[[CVT1]]{{\]}}
285290 ; GCN: s_endpgm
386391 ; XSI-NOT: v_cvt_f32_f16
387392
388393 ; XVI: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
394 ; XVI-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}}
389395 ; XVI: v_cvt_f32_f16_e32
390396 ; XVI: v_cvt_f32_f16_e32
391 ; XVI-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}}
392397 ; XVI: v_cvt_f32_f16_e32
393398 ; XVI-NOT: v_cvt_f32_f16
394399
395400 ; GCN: buffer_load_dwordx2 v{{\[}}[[IN_LO:[0-9]+]]:[[IN_HI:[0-9]+]]
401 ; VI: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]]
396402 ; GCN: v_cvt_f32_f16_e32 [[Z32:v[0-9]+]], v[[IN_HI]]
397403 ; GCN: v_cvt_f32_f16_e32 [[X32:v[0-9]+]], v[[IN_LO]]
398 ; GCN: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]]
404 ; SI: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]]
399405 ; GCN: v_cvt_f32_f16_e32 [[Y32:v[0-9]+]], [[Y16]]
400406
401407 ; GCN: v_cvt_f64_f32_e32 [[Z:v\[[0-9]+:[0-9]+\]]], [[Z32]]
600606 ret void
601607 }
602608
603 ; GCN-LABEL: {{^}}fsub_f16:
604 ; GCN: v_subrev_f32_e32
605 ; GCN: s_endpgm
606 define void @fsub_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
607 %b_ptr = getelementptr half, half addrspace(1)* %in, i32 1
608 %a = load half, half addrspace(1)* %in
609 %b = load half, half addrspace(1)* %b_ptr
610 %sub = fsub half %a, %b
611 store half %sub, half addrspace(1)* %out
612 ret void
613 }
614
615609 ; GCN-LABEL: {{^}}test_bitcast_from_half:
616610 ; GCN: buffer_load_ushort [[TMP:v[0-9]+]]
617611 ; GCN: buffer_store_short [[TMP]]
0 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
1
2 declare half @llvm.fabs.f16(half %a)
3 declare i1 @llvm.amdgcn.class.f16(half %a, i32 %b)
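
; The i32 operand of llvm.amdgcn.class.f16 is a mask of IEEE class bits
; (0 = SNaN, 1 = QNaN, 2 = -inf, 3 = -normal, 4 = -subnormal, 5 = -0,
; 6 = +0, 7 = +subnormal, 8 = +normal, 9 = +inf): 1 tests for SNaN, 64
; for +0, 0x1ff for everything except +inf, and 0x3ff for all ten classes.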
4
5 ; GCN-LABEL: {{^}}class_f16
6 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
7 ; GCN: buffer_load_dword v[[B_I32:[0-9]+]]
8 ; VI: v_cmp_class_f16_e32 vcc, v[[A_F16]], v[[B_I32]]
9 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
10 ; GCN: buffer_store_dword v[[R_I32]]
11 ; GCN: s_endpgm
12 define void @class_f16(
13 i32 addrspace(1)* %r,
14 half addrspace(1)* %a,
15 i32 addrspace(1)* %b) {
16 entry:
17 %a.val = load half, half addrspace(1)* %a
18 %b.val = load i32, i32 addrspace(1)* %b
19 %r.val = call i1 @llvm.amdgcn.class.f16(half %a.val, i32 %b.val)
20 %r.val.sext = sext i1 %r.val to i32
21 store i32 %r.val.sext, i32 addrspace(1)* %r
22 ret void
23 }
24
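; fabs and fneg of the class operand should fold into source modifiers on
; v_cmp_class_f16 (|x|, -x, -|x| in the checks below); fsub -0.0, x is the
; canonical IR spelling of fneg.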
25 ; GCN-LABEL: {{^}}class_f16_fabs
26 ; GCN: s_load_dword s[[SA_F16:[0-9]+]]
27 ; GCN: s_load_dword s[[SB_I32:[0-9]+]]
28 ; VI: v_trunc_f16_e32 v[[VA_F16:[0-9]+]], s[[SA_F16]]
29 ; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |v[[VA_F16]]|, s[[SB_I32]]
30 ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]]
31 ; GCN: buffer_store_dword v[[VR_I32]]
32 ; GCN: s_endpgm
33 define void @class_f16_fabs(
34 i32 addrspace(1)* %r,
35 half %a.val,
36 i32 %b.val) {
37 entry:
38 %a.val.fabs = call half @llvm.fabs.f16(half %a.val)
39 %r.val = call i1 @llvm.amdgcn.class.f16(half %a.val.fabs, i32 %b.val)
40 %r.val.sext = sext i1 %r.val to i32
41 store i32 %r.val.sext, i32 addrspace(1)* %r
42 ret void
43 }
44
45 ; GCN-LABEL: {{^}}class_f16_fneg
46 ; GCN: s_load_dword s[[SA_F16:[0-9]+]]
47 ; GCN: s_load_dword s[[SB_I32:[0-9]+]]
48 ; VI: v_trunc_f16_e32 v[[VA_F16:[0-9]+]], s[[SA_F16]]
49 ; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -v[[VA_F16]], s[[SB_I32]]
50 ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]]
51 ; GCN: buffer_store_dword v[[VR_I32]]
52 ; GCN: s_endpgm
53 define void @class_f16_fneg(
54 i32 addrspace(1)* %r,
55 half %a.val,
56 i32 %b.val) {
57 entry:
58 %a.val.fneg = fsub half -0.0, %a.val
59 %r.val = call i1 @llvm.amdgcn.class.f16(half %a.val.fneg, i32 %b.val)
60 %r.val.sext = sext i1 %r.val to i32
61 store i32 %r.val.sext, i32 addrspace(1)* %r
62 ret void
63 }
64
65 ; GCN-LABEL: {{^}}class_f16_fabs_fneg
66 ; GCN: s_load_dword s[[SA_F16:[0-9]+]]
67 ; GCN: s_load_dword s[[SB_I32:[0-9]+]]
68 ; VI: v_trunc_f16_e32 v[[VA_F16:[0-9]+]], s[[SA_F16]]
69 ; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|v[[VA_F16]]|, s[[SB_I32]]
70 ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]]
71 ; GCN: buffer_store_dword v[[VR_I32]]
72 ; GCN: s_endpgm
73 define void @class_f16_fabs_fneg(
74 i32 addrspace(1)* %r,
75 half %a.val,
76 i32 %b.val) {
77 entry:
78 %a.val.fabs = call half @llvm.fabs.f16(half %a.val)
79 %a.val.fabs.fneg = fsub half -0.0, %a.val.fabs
80 %r.val = call i1 @llvm.amdgcn.class.f16(half %a.val.fabs.fneg, i32 %b.val)
81 %r.val.sext = sext i1 %r.val to i32
82 store i32 %r.val.sext, i32 addrspace(1)* %r
83 ret void
84 }
85
86 ; GCN-LABEL: {{^}}class_f16_1
87 ; GCN: s_load_dword s[[SA_F16:[0-9]+]]
88 ; VI: v_trunc_f16_e32 v[[VA_F16:[0-9]+]], s[[SA_F16]]
89 ; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], v[[VA_F16]], 1{{$}}
90 ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]]
91 ; GCN: buffer_store_dword v[[VR_I32]]
92 ; GCN: s_endpgm
93 define void @class_f16_1(
94 i32 addrspace(1)* %r,
95 half %a.val) {
96 entry:
97 %r.val = call i1 @llvm.amdgcn.class.f16(half %a.val, i32 1)
98 %r.val.sext = sext i1 %r.val to i32
99 store i32 %r.val.sext, i32 addrspace(1)* %r
100 ret void
101 }
102
103 ; GCN-LABEL: {{^}}class_f16_64
104 ; GCN: s_load_dword s[[SA_F16:[0-9]+]]
105 ; VI: v_trunc_f16_e32 v[[VA_F16:[0-9]+]], s[[SA_F16]]
106 ; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], v[[VA_F16]], 64{{$}}
107 ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]]
108 ; GCN: buffer_store_dword v[[VR_I32]]
109 ; GCN: s_endpgm
110 define void @class_f16_64(
111 i32 addrspace(1)* %r,
112 half %a.val) {
113 entry:
114 %r.val = call i1 @llvm.amdgcn.class.f16(half %a.val, i32 64)
115 %r.val.sext = sext i1 %r.val to i32
116 store i32 %r.val.sext, i32 addrspace(1)* %r
117 ret void
118 }
119
120 ; GCN-LABEL: {{^}}class_f16_full_mask
121 ; GCN: s_load_dword s[[SA_F16:[0-9]+]]
122 ; VI: v_mov_b32_e32 v[[MASK:[0-9]+]], 0x3ff{{$}}
123 ; VI: v_trunc_f16_e32 v[[VA_F16:[0-9]+]], s[[SA_F16]]
124 ; VI: v_cmp_class_f16_e32 vcc, v[[VA_F16]], v[[MASK]]
125 ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, vcc
126 ; GCN: buffer_store_dword v[[VR_I32]]
127 ; GCN: s_endpgm
128 define void @class_f16_full_mask(
129 i32 addrspace(1)* %r,
130 half %a.val) {
131 entry:
132 %r.val = call i1 @llvm.amdgcn.class.f16(half %a.val, i32 1023)
133 %r.val.sext = sext i1 %r.val to i32
134 store i32 %r.val.sext, i32 addrspace(1)* %r
135 ret void
136 }
137
138 ; GCN-LABEL: {{^}}class_f16_nine_bit_mask
139 ; GCN: s_load_dword s[[SA_F16:[0-9]+]]
140 ; VI: v_mov_b32_e32 v[[MASK:[0-9]+]], 0x1ff{{$}}
141 ; VI: v_trunc_f16_e32 v[[VA_F16:[0-9]+]], s[[SA_F16]]
142 ; VI: v_cmp_class_f16_e32 vcc, v[[VA_F16]], v[[MASK]]
143 ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, vcc
144 ; GCN: buffer_store_dword v[[VR_I32]]
145 ; GCN: s_endpgm
146 define void @class_f16_nine_bit_mask(
147 i32 addrspace(1)* %r,
148 half %a.val) {
149 entry:
150 %r.val = call i1 @llvm.amdgcn.class.f16(half %a.val, i32 511)
151 %r.val.sext = sext i1 %r.val to i32
152 store i32 %r.val.sext, i32 addrspace(1)* %r
153 ret void
154 }
0 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
1
2 declare half @llvm.amdgcn.cos.f16(half %a)
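; llvm.amdgcn.cos.f16 maps directly to v_cos_f16; as with v_cos_f32, the
; hardware expects its input in revolutions (radians scaled by 1/(2*pi)).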
3
4 ; GCN-LABEL: {{^}}cos_f16
5 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
6 ; VI: v_cos_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
7 ; GCN: buffer_store_short v[[R_F16]]
8 ; GCN: s_endpgm
9 define void @cos_f16(
10 half addrspace(1)* %r,
11 half addrspace(1)* %a) {
12 entry:
13 %a.val = load half, half addrspace(1)* %a
14 %r.val = call half @llvm.amdgcn.cos.f16(half %a.val)
15 store half %r.val, half addrspace(1)* %r
16 ret void
17 }
0 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
1
2 declare half @llvm.amdgcn.div.fixup.f16(half %a, half %b, half %c)
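; v_div_fixup_f16 is the final fix-up step of the hardware division
; expansion (div_scale, div_fmas, div_fixup); the intrinsic exposes the
; raw instruction.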
3
4 ; GCN-LABEL: {{^}}div_fixup_f16
5 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
6 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
7 ; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
8 ; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
9 ; GCN: buffer_store_short v[[R_F16]]
10 ; GCN: s_endpgm
11 define void @div_fixup_f16(
12 half addrspace(1)* %r,
13 half addrspace(1)* %a,
14 half addrspace(1)* %b,
15 half addrspace(1)* %c) {
16 entry:
17 %a.val = load half, half addrspace(1)* %a
18 %b.val = load half, half addrspace(1)* %b
19 %c.val = load half, half addrspace(1)* %c
20 %r.val = call half @llvm.amdgcn.div.fixup.f16(half %a.val, half %b.val, half %c.val)
21 store half %r.val, half addrspace(1)* %r
22 ret void
23 }
24
25 ; GCN-LABEL: {{^}}div_fixup_f16_imm_a
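; (0x4200 is 3.0 in half. v_div_fixup_f16 uses the VOP3 encoding, which
; cannot encode a 32-bit literal on VI, so the immediate is first
; materialized with v_mov_b32.)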
26 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
27 ; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
28 ; VI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x4200{{$}}
29 ; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]]
30 ; GCN: buffer_store_short v[[R_F16]]
31 ; GCN: s_endpgm
32 define void @div_fixup_f16_imm_a(
33 half addrspace(1)* %r,
34 half addrspace(1)* %b,
35 half addrspace(1)* %c) {
36 entry:
37 %b.val = load half, half addrspace(1)* %b
38 %c.val = load half, half addrspace(1)* %c
39 %r.val = call half @llvm.amdgcn.div.fixup.f16(half 3.0, half %b.val, half %c.val)
40 store half %r.val, half addrspace(1)* %r
41 ret void
42 }
43
44 ; GCN-LABEL: {{^}}div_fixup_f16_imm_b
45 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
46 ; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
47 ; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x4200{{$}}
48 ; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]]
49 ; GCN: buffer_store_short v[[R_F16]]
50 ; GCN: s_endpgm
51 define void @div_fixup_f16_imm_b(
52 half addrspace(1)* %r,
53 half addrspace(1)* %a,
54 half addrspace(1)* %c) {
55 entry:
56 %a.val = load half, half addrspace(1)* %a
57 %c.val = load half, half addrspace(1)* %c
58 %r.val = call half @llvm.amdgcn.div.fixup.f16(half %a.val, half 3.0, half %c.val)
59 store half %r.val, half addrspace(1)* %r
60 ret void
61 }
62
63 ; GCN-LABEL: {{^}}div_fixup_f16_imm_c
64 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
65 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
66 ; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x4200{{$}}
67 ; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]]
68 ; GCN: buffer_store_short v[[R_F16]]
69 ; GCN: s_endpgm
70 define void @div_fixup_f16_imm_c(
71 half addrspace(1)* %r,
72 half addrspace(1)* %a,
73 half addrspace(1)* %b) {
74 entry:
75 %a.val = load half, half addrspace(1)* %a
76 %b.val = load half, half addrspace(1)* %b
77 %r.val = call half @llvm.amdgcn.div.fixup.f16(half %a.val, half %b.val, half 3.0)
78 store half %r.val, half addrspace(1)* %r
79 ret void
80 }
81
82 ; GCN-LABEL: {{^}}div_fixup_f16_imm_a_imm_b
83 ; VI: v_mov_b32_e32 v[[AB_F16:[0-9]+]], 0x4200{{$}}
84 ; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
85 ; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[AB_F16]], v[[AB_F16]], v[[C_F16]]
86 ; GCN: buffer_store_short v[[R_F16]]
87 ; GCN: s_endpgm
88 define void @div_fixup_f16_imm_a_imm_b(
89 half addrspace(1)* %r,
90 half addrspace(1)* %c) {
91 entry:
92 %c.val = load half, half addrspace(1)* %c
93 %r.val = call half @llvm.amdgcn.div.fixup.f16(half 3.0, half 3.0, half %c.val)
94 store half %r.val, half addrspace(1)* %r
95 ret void
96 }
97
98 ; GCN-LABEL: {{^}}div_fixup_f16_imm_b_imm_c
99 ; VI: v_mov_b32_e32 v[[BC_F16:[0-9]+]], 0x4200{{$}}
100 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
101 ; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[BC_F16]], v[[BC_F16]]
102 ; GCN: buffer_store_short v[[R_F16]]
103 ; GCN: s_endpgm
104 define void @div_fixup_f16_imm_b_imm_c(
105 half addrspace(1)* %r,
106 half addrspace(1)* %a) {
107 entry:
108 %a.val = load half, half addrspace(1)* %a
109 %r.val = call half @llvm.amdgcn.div.fixup.f16(half %a.val, half 3.0, half 3.0)
110 store half %r.val, half addrspace(1)* %r
111 ret void
112 }
113
114 ; GCN-LABEL: {{^}}div_fixup_f16_imm_a_imm_c
115 ; VI: v_mov_b32_e32 v[[AC_F16:[0-9]+]], 0x4200{{$}}
116 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
117 ; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[AC_F16]], v[[B_F16]], v[[AC_F16]]
118 ; GCN: buffer_store_short v[[R_F16]]
119 ; GCN: s_endpgm
120 define void @div_fixup_f16_imm_a_imm_c(