llvm.org GIT mirror llvm / 820b8a5
AMDGPU: Start selecting v_mad_mixhi_f16 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@313814 91177308-0d34-0410-b5e6-96231b3b80d8 Matt Arsenault 2 years ago
6 changed file(s) with 312 addition(s) and 46 deletion(s). Raw diff Collapse all Expand all
774774 return true;
775775 }
776776
777 // FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
777778 bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
778779 const MachineOperand *ClampSrc = isClamp(MI);
779780 if (!ClampSrc || !hasOneNonDBGUseInst(*MRI, ClampSrc->getReg()))
501501 setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
502502 setTargetDAGCombine(ISD::ZERO_EXTEND);
503503 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
504 setTargetDAGCombine(ISD::BUILD_VECTOR);
504505
505506 // All memory operations. Some folding on the pointer operand is done to help
506507 // matching the constant offsets in the addressing modes.
58525853 SDNode *N, DAGCombinerInfo &DCI) const {
58535854 SDValue Vec = N->getOperand(0);
58545855
5855 SelectionDAG &DAG= DCI.DAG;
5856 SelectionDAG &DAG = DCI.DAG;
58565857 if (Vec.getOpcode() == ISD::FNEG && allUsesHaveSourceMods(N)) {
58575858 SDLoc SL(N);
58585859 EVT EltVT = N->getValueType(0);
58655866 return SDValue();
58665867 }
58675868
5869 static bool convertBuildVectorCastElt(SelectionDAG &DAG,
5870 SDValue &Lo, SDValue &Hi) {
5871 if (Hi.getOpcode() == ISD::BITCAST &&
5872 Hi.getOperand(0).getValueType() == MVT::f16 &&
5873 (isa<ConstantSDNode>(Lo) || Lo.isUndef())) {
5874 Lo = DAG.getNode(ISD::BITCAST, SDLoc(Lo), MVT::f16, Lo);
5875 Hi = Hi.getOperand(0);
5876 return true;
5877 }
5878
5879 return false;
5880 }
5881
5882 SDValue SITargetLowering::performBuildVectorCombine(
5883 SDNode *N, DAGCombinerInfo &DCI) const {
5884 SDLoc SL(N);
5885
5886 if (!isTypeLegal(MVT::v2i16))
5887 return SDValue();
5888 SelectionDAG &DAG = DCI.DAG;
5889 EVT VT = N->getValueType(0);
5890
5891 if (VT == MVT::v2i16) {
5892 SDValue Lo = N->getOperand(0);
5893 SDValue Hi = N->getOperand(1);
5894
5895 // v2i16 build_vector (const|undef), (bitcast f16:$x)
5896 // -> bitcast (v2f16 build_vector (const|undef), $x)
5897 if (convertBuildVectorCastElt(DAG, Lo, Hi)) {
5898 SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Lo, Hi });
5899 return DAG.getNode(ISD::BITCAST, SL, VT, NewVec);
5900 }
5901
5902 if (convertBuildVectorCastElt(DAG, Hi, Lo)) {
5903 SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Hi, Lo });
5904 return DAG.getNode(ISD::BITCAST, SL, VT, NewVec);
5905 }
5906 }
5907
5908 return SDValue();
5909 }
58685910
58695911 unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
58705912 const SDNode *N0,
62866328 }
62876329 case ISD::EXTRACT_VECTOR_ELT:
62886330 return performExtractVectorEltCombine(N, DCI);
6331 case ISD::BUILD_VECTOR:
6332 return performBuildVectorCombine(N, DCI);
62896333 }
62906334 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
62916335 }
110110 SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
111111 SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const;
112112 SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
113 SDValue performBuildVectorCombine(SDNode *N, DAGCombinerInfo &DCI) const;
113114
114115 unsigned getFusedOpcode(const SelectionDAG &DAG,
115116 const SDNode *N0, const SDNode *N1) const;
7575
7676 // Clamp modifier is applied after conversion to f16.
7777 def V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
78
79 let ClampLo = 0, ClampHi = 1 in {
7880 def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
81 }
7982 }
8083
8184 let Predicates = [HasMadMix] in {
8790 (V_MAD_MIXLO_F16 $src0_modifiers, $src0,
8891 $src1_modifiers, $src1,
8992 $src2_modifiers, $src2,
90 0,
93 DSTCLAMP.NONE,
9194 (i32 (IMPLICIT_DEF)))
95 >;
96
97 // FIXME: Special case handling for maxhi (especially for clamp)
98 // because dealing with the write to high half of the register is
99 // difficult.
100 def : Pat <
101 (build_vector f16:$elt0, (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
102 (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
103 (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
104 (v2f16 (V_MAD_MIXHI_F16 $src0_modifiers, $src0,
105 $src1_modifiers, $src1,
106 $src2_modifiers, $src2,
107 DSTCLAMP.NONE,
108 $elt0))
109 >;
110
111 def : Pat <
112 (build_vector
113 f16:$elt0,
114 (AMDGPUclamp (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
115 (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
116 (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
117 (v2f16 (V_MAD_MIXHI_F16 $src0_modifiers, $src0,
118 $src1_modifiers, $src1,
119 $src2_modifiers, $src2,
120 DSTCLAMP.ENABLE,
121 $elt0))
122 >;
123
124 def : Pat <
125 (AMDGPUclamp (build_vector
126 (fpround (fmad (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
127 (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
128 (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers)))),
129 (fpround (fmad (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
130 (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
131 (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers)))))),
132 (v2f16 (V_MAD_MIXHI_F16 $hi_src0_modifiers, $hi_src0,
133 $hi_src1_modifiers, $hi_src1,
134 $hi_src2_modifiers, $hi_src2,
135 DSTCLAMP.ENABLE,
136 (V_MAD_MIXLO_F16 $lo_src0_modifiers, $lo_src0,
137 $lo_src1_modifiers, $lo_src1,
138 $lo_src2_modifiers, $lo_src2,
139 DSTCLAMP.ENABLE,
140 (i32 (IMPLICIT_DEF)))))
92141 >;
93142
94143 } // End Predicates = [HasMadMix]
11 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,VI %s
22 ; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s
33
4 ; FIXME: These cases should be able to use v_mad_mixhi_f16 and avoid
5 ; the packing.
6
74 ; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo:
8 ; GFX9: v_mad_mixlo_f16
9 ; GFX9: v_lshl_or_b32
5 ; GFX9: s_waitcnt
6 ; GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2
7 ; GFX9-NEXT: s_setpc_b64
108 define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo(half %src0, half %src1, half %src2) #0 {
119 %src0.ext = fpext half %src0 to float
1210 %src1.ext = fpext half %src1 to float
1816 }
1917
2018 ; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo:
21 ; GFX9: v_mad_mixlo_f16
22 ; GFX9: v_lshl_or_b32
19 ; GFX9: s_waitcnt
20 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x3c00
21 ; GFX9-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2
22 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
23 ; GFX9-NEXT: s_setpc_b64
2324 define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo(half %src0, half %src1, half %src2) #0 {
2425 %src0.ext = fpext half %src0 to float
2526 %src1.ext = fpext half %src1 to float
3132 }
3233
3334 ; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
34 ; GFX9: v_mad_mixlo_f16
35 ; GFX9: v_lshl_or_b32
35 ; GFX9: s_waitcnt
36 ; GFX9-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2
37 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
38 ; GFX9-NEXT: s_setpc_b64
3639 define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo(half %src0, half %src1, half %src2, half %lo) #0 {
3740 %src0.ext = fpext half %src0 to float
3841 %src1.ext = fpext half %src1 to float
4548 }
4649
4750 ; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
48 ; GFX9: v_mad_mixlo_f16 v0, v0, v1, v2
49 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
51 ; GFX9: v_mov_b32_e32 v3, 0
52 ; GFX9-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2
53 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
54 ; GFX9-NEXT: s_setpc_b64
5055 define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack(half %src0, half %src1, half %src2) #0 {
5156 %src0.ext = fpext half %src0 to float
5257 %src1.ext = fpext half %src1 to float
6065 }
6166
6267 ; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
63 ; GFX9: v_mad_mixlo_f16 v0, v0, v1, v2
64 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
68 ; GFX9: v_mov_b32_e32 v3, 0
69 ; GFX9-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2
70 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
71 ; GFX9-NEXT: s_setpc_b64
6572 define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext(half %src0, half %src1, half %src2) #0 {
6673 %src0.ext = fpext half %src0 to float
6774 %src1.ext = fpext half %src1 to float
8996 ret <2 x half> %vec.result
9097 }
9198
92 ; FIXME: Unnecessary junk to pack, and packing undef?
9399 ; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt:
94 ; GFX9: v_mad_mixlo_f16 v0, v0, v1, v2 clamp{{$}}
95 ; GFX9-NEXT: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff{{$}}
96 ; GFX9-NEXT: v_and_b32_e32 [[AND:v[0-9]+]], s6, [[MASK]]
97 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, [[AND]]
100 ; GCN: s_waitcnt
101 ; GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2 clamp{{$}}
98102 ; GFX9-NEXT: s_setpc_b64
99103 define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half %src0, half %src1, half %src2) #0 {
100104 %src0.ext = fpext half %src0 to float
102106 %src2.ext = fpext half %src2 to float
103107 %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
104108 %cvt.result = fptrunc float %result to half
109 %max = call half @llvm.maxnum.f16(half %cvt.result, half 0.0)
110 %clamp = call half @llvm.minnum.f16(half %max, half 1.0)
111 %vec.result = insertelement <2 x half> undef, half %clamp, i32 1
112 ret <2 x half> %vec.result
113 }
114
115
116 ; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
117 ; GCN: s_waitcnt
118 ; GFX9-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2{{$}}
119 ; GFX9-NEXT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, v3
120 ; GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2 clamp{{$}}
121 ; GFX9-NEXT: s_waitcnt vmcnt(0)
122 ; GFX9-NEXT: s_setpc_b64
123 define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use(half %src0, half %src1, half %src2) #0 {
124 %src0.ext = fpext half %src0 to float
125 %src1.ext = fpext half %src1 to float
126 %src2.ext = fpext half %src2 to float
127 %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
128 %cvt.result = fptrunc float %result to half
129 store volatile half %cvt.result, half addrspace(1)* undef
105130 %max = call half @llvm.maxnum.f16(half %cvt.result, half 0.0)
106131 %clamp = call half @llvm.minnum.f16(half %max, half 1.0)
107132 %vec.result = insertelement <2 x half> undef, half %clamp, i32 1
7474 ret half %cvt.result
7575 }
7676
77 ; GCN-LABEL: {{^}}v_mad_mixlo_v2f32:
78 ; GFX9: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel:[1,1,1]
79 ; GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2
80 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
81 ; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0
82 ; GFX9-NEXT: s_setpc_b64
83 define <2 x half> @v_mad_mixlo_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
77 ; FIXME: Should be able to avoid extra register because first
78 ; operation only clobbers relevant lane.
79 ; GCN-LABEL: {{^}}v_mad_mix_v2f32:
80 ; GCN: s_waitcnt
81 ; GFX9-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2{{$}}
82 ; GFX9-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1]{{$}}
83 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
84 ; GFX9-NEXT: s_setpc_b64
85 define <2 x half> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
8486 %src0.ext = fpext <2 x half> %src0 to <2 x float>
8587 %src1.ext = fpext <2 x half> %src1 to <2 x float>
8688 %src2.ext = fpext <2 x half> %src2 to <2 x float>
8991 ret <2 x half> %cvt.result
9092 }
9193
92 ; GCN-LABEL: {{^}}v_mad_mixlo_v3f32:
94 ; GCN-LABEL: {{^}}v_mad_mix_v3f32:
9395 ; GCN: s_waitcnt
9496 ; GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v3, v6
9597 ; GFX9-NEXT: v_mad_mixlo_f16 v1, v1, v4, v7
9698 ; GFX9-NEXT: v_mad_mixlo_f16 v2, v2, v5, v8
9799 ; GFX9-NEXT: s_setpc_b64
98 define <3 x half> @v_mad_mixlo_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
100 define <3 x half> @v_mad_mix_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
99101 %src0.ext = fpext <3 x half> %src0 to <3 x float>
100102 %src1.ext = fpext <3 x half> %src1 to <3 x float>
101103 %src2.ext = fpext <3 x half> %src2 to <3 x float>
104106 ret <3 x half> %cvt.result
105107 }
106108
107 ; GCN-LABEL: {{^}}v_mad_mixlo_v4f32:
108 ; GCN: s_waitcnt
109 ; GFX9-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel:[1,1,1]
110 ; GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v2, v4
111 ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
112 ; GFX9-NEXT: v_mad_mixlo_f16 v4, v1, v3, v5 op_sel:[1,1,1]
113 ; GFX9-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5
114 ; GFX9-NEXT: v_and_b32_e32 v0, v2, v0
115 ; GFX9-NEXT: v_and_b32_e32 v1, v2, v1
116 ; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0
117 ; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1
118 ; GFX9-NEXT: s_setpc_b64
119 define <4 x half> @v_mad_mixlo_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {
109 ; GCN-LABEL: {{^}}v_mad_mix_v4f32:
110 ; GCN: s_waitcnt
111 ; GFX9-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4
112 ; GFX9-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1]
113 ; GFX9-NEXT: v_mad_mixlo_f16 v2, v1, v3, v5
114 ; GFX9-NEXT: v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1]
115 ; GFX9-NEXT: v_mov_b32_e32 v0, v6
116 ; GFX9-NEXT: v_mov_b32_e32 v1, v2
117 ; GFX9-NEXT: s_setpc_b64
118 define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {
120119 %src0.ext = fpext <4 x half> %src0 to <4 x float>
121120 %src1.ext = fpext <4 x half> %src1 to <4 x float>
122121 %src2.ext = fpext <4 x half> %src2 to <4 x float>
127126
128127 ; FIXME: Fold clamp
129128 ; GCN-LABEL: {{^}}v_mad_mix_v2f32_clamp_postcvt:
130 ; GFX9: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel:[1,1,1]
131 ; GFX9: v_mad_mixlo_f16 v0, v0, v1, v2
132 ; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]]
133 ; GFX9: v_pk_max_f16 v0, [[PACKED]], [[PACKED]] clamp{{$}}
129 ; GFX9: v_mad_mixlo_f16 v3, v0, v1, v2 clamp{{$}}
130 ; GFX9-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] clamp{{$}}
131 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
134132 ; GFX9-NEXT: s_setpc_b64
135133 define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
136134 %src0.ext = fpext <2 x half> %src0 to <2 x float>
143141 ret <2 x half> %clamp
144142 }
145143
144 ; FIXME: Should be packed into 2 registers per argument?
145 ; GCN-LABEL: {{^}}v_mad_mix_v3f32_clamp_postcvt:
146 ; GCN: s_waitcnt
147 ; GFX9-NEXT: v_mad_mixlo_f16 v2, v2, v5, v8 clamp
148 ; GFX9-NEXT: v_mad_mixhi_f16 v2, v0, v0, v0 op_sel_hi:[0,0,0] clamp
149 ; GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v3, v6 clamp
150 ; GFX9-NEXT: v_mad_mixhi_f16 v0, v1, v4, v7 clamp
151 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
152 ; GFX9-NEXT: s_setpc_b64
153 define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
154 %src0.ext = fpext <3 x half> %src0 to <3 x float>
155 %src1.ext = fpext <3 x half> %src1 to <3 x float>
156 %src2.ext = fpext <3 x half> %src2 to <3 x float>
157 %result = tail call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %src0.ext, <3 x float> %src1.ext, <3 x float> %src2.ext)
158 %cvt.result = fptrunc <3 x float> %result to <3 x half>
159 %max = call <3 x half> @llvm.maxnum.v3f16(<3 x half> %cvt.result, <3 x half> zeroinitializer)
160 %clamp = call <3 x half> @llvm.minnum.v3f16(<3 x half> %max, <3 x half> <half 1.0, half 1.0, half 1.0>)
161 ret <3 x half> %clamp
162 }
163
164 ; GCN-LABEL: {{^}}v_mad_mix_v4f32_clamp_postcvt:
165 ; GCN: s_waitcnt
166 ; GFX9-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 clamp
167 ; GFX9-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] clamp
168 ; GFX9-NEXT: v_mad_mixlo_f16 v2, v1, v3, v5 clamp
169 ; GFX9-NEXT: v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] clamp
170 ; GFX9-DAG: v_mov_b32_e32 v0, v6
171 ; GFX9-DAG: v_mov_b32_e32 v1, v2
172 ; GFX9: s_setpc_b64
173 define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {
174 %src0.ext = fpext <4 x half> %src0 to <4 x float>
175 %src1.ext = fpext <4 x half> %src1 to <4 x float>
176 %src2.ext = fpext <4 x half> %src2 to <4 x float>
177 %result = tail call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %src0.ext, <4 x float> %src1.ext, <4 x float> %src2.ext)
178 %cvt.result = fptrunc <4 x float> %result to <4 x half>
179 %max = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %cvt.result, <4 x half> zeroinitializer)
180 %clamp = call <4 x half> @llvm.minnum.v4f16(<4 x half> %max, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>)
181 ret <4 x half> %clamp
182 }
183
184 ; GCN-LABEL: {{^}}v_mad_mix_v2f32_clamp_postcvt_lo:
185 ; GCN: s_waitcnt
186 ; GFX9-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 clamp
187 ; GFX9-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1]
188 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
189 ; GFX9-NEXT: s_setpc_b64
190 define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
191 %src0.ext = fpext <2 x half> %src0 to <2 x float>
192 %src1.ext = fpext <2 x half> %src1 to <2 x float>
193 %src2.ext = fpext <2 x half> %src2 to <2 x float>
194 %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
195 %cvt.result = fptrunc <2 x float> %result to <2 x half>
196 %cvt.lo = extractelement <2 x half> %cvt.result, i32 0
197 %max.lo = call half @llvm.maxnum.f16(half %cvt.lo, half 0.0)
198 %clamp.lo = call half @llvm.minnum.f16(half %max.lo, half 1.0)
199 %insert = insertelement <2 x half> %cvt.result, half %clamp.lo, i32 0
200 ret <2 x half> %insert
201 }
202
203 ; GCN-LABEL: {{^}}v_mad_mix_v2f32_clamp_postcvt_hi:
204 ; GCN: s_waitcnt
205 ; GFX9-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2
206 ; GFX9-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] clamp
207 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
208 ; GFX9-NEXT: s_setpc_b64
209 define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
210 %src0.ext = fpext <2 x half> %src0 to <2 x float>
211 %src1.ext = fpext <2 x half> %src1 to <2 x float>
212 %src2.ext = fpext <2 x half> %src2 to <2 x float>
213 %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
214 %cvt.result = fptrunc <2 x float> %result to <2 x half>
215 %cvt.hi = extractelement <2 x half> %cvt.result, i32 1
216 %max.hi = call half @llvm.maxnum.f16(half %cvt.hi, half 0.0)
217 %clamp.hi = call half @llvm.minnum.f16(half %max.hi, half 1.0)
218 %insert = insertelement <2 x half> %cvt.result, half %clamp.hi, i32 1
219 ret <2 x half> %insert
220 }
221
222 ; FIXME: Should be able to use mixlo/mixhi
223 ; GCN-LABEL: {{^}}v_mad_mix_v2f32_clamp_precvt:
224 ; GFX9: v_mad_mix_f32 v3, v0, v1, v2 clamp
225 ; GFX9-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] clamp
226 ; GFX9: v_cvt_f16_f32_e32 v1, v3
227 ; GFX9: v_cvt_f16_f32_e32 v0, v0
228 ; GFX9: v_and_b32_e32 v1, 0xffff, v1
229 ; GFX9: v_lshl_or_b32 v0, v0, 16, v1
230 ; GFX9: s_setpc_b64
231 define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
232 %src0.ext = fpext <2 x half> %src0 to <2 x float>
233 %src1.ext = fpext <2 x half> %src1 to <2 x float>
234 %src2.ext = fpext <2 x half> %src2 to <2 x float>
235 %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
236 %max = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %result, <2 x float> zeroinitializer)
237 %clamp = call <2 x float> @llvm.minnum.v2f32(<2 x float> %max, <2 x float> <float 1.0, float 1.0>)
238 %cvt.result = fptrunc <2 x float> %clamp to <2 x half>
239 ret <2 x half> %cvt.result
240 }
241
242 ; GCN-LABEL: {{^}}v_mad_mix_v3f32_clamp_precvt:
243 ; GFX9: v_mad_mix_f32 v0, v0, v3, v6 clamp
244 ; GFX9: v_mad_mix_f32 v1, v1, v4, v7 clamp
245 ; GFX9: v_mad_mix_f32 v2, v2, v5, v8 clamp
246 ; GFX9: v_cvt_f16_f32
247 ; GFX9: v_cvt_f16_f32
248 ; GFX9: v_cvt_f16_f32
249 define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
250 %src0.ext = fpext <3 x half> %src0 to <3 x float>
251 %src1.ext = fpext <3 x half> %src1 to <3 x float>
252 %src2.ext = fpext <3 x half> %src2 to <3 x float>
253 %result = tail call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %src0.ext, <3 x float> %src1.ext, <3 x float> %src2.ext)
254 %max = call <3 x float> @llvm.maxnum.v3f32(<3 x float> %result, <3 x float> zeroinitializer)
255 %clamp = call <3 x float> @llvm.minnum.v3f32(<3 x float> %max, <3 x float> <float 1.0, float 1.0, float 1.0>)
256 %cvt.result = fptrunc <3 x float> %clamp to <3 x half>
257 ret <3 x half> %cvt.result
258 }
259
260 ; GCN-LABEL: {{^}}v_mad_mix_v4f32_clamp_precvt:
261 ; GFX9: v_mad_mix_f32 v6, v0, v2, v4 op_sel:[1,1,1] clamp
262 ; GFX9: v_mad_mix_f32 v0, v0, v2, v4 clamp
263 ; GFX9: v_mad_mix_f32 v2, v1, v3, v5 op_sel:[1,1,1] clamp
264 ; GFX9: v_mad_mix_f32 v1, v1, v3, v5 clamp
265 ; GFX9: v_cvt_f16_f32
266 ; GFX9: v_cvt_f16_f32
267 ; GFX9: v_cvt_f16_f32
268 ; GFX9: v_cvt_f16_f32
269 define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {
270 %src0.ext = fpext <4 x half> %src0 to <4 x float>
271 %src1.ext = fpext <4 x half> %src1 to <4 x float>
272 %src2.ext = fpext <4 x half> %src2 to <4 x float>
273 %result = tail call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %src0.ext, <4 x float> %src1.ext, <4 x float> %src2.ext)
274 %max = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %result, <4 x float> zeroinitializer)
275 %clamp = call <4 x float> @llvm.minnum.v4f32(<4 x float> %max, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
276 %cvt.result = fptrunc <4 x float> %clamp to <4 x half>
277 ret <4 x half> %cvt.result
278 }
279
146280 declare half @llvm.minnum.f16(half, half) #1
147281 declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>) #1
282 declare <3 x half> @llvm.minnum.v3f16(<3 x half>, <3 x half>) #1
283 declare <4 x half> @llvm.minnum.v4f16(<4 x half>, <4 x half>) #1
148284
149285 declare half @llvm.maxnum.f16(half, half) #1
150286 declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>) #1
287 declare <3 x half> @llvm.maxnum.v3f16(<3 x half>, <3 x half>) #1
288 declare <4 x half> @llvm.maxnum.v4f16(<4 x half>, <4 x half>) #1
151289
152290 declare float @llvm.minnum.f32(float, float) #1
291 declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) #1
292 declare <3 x float> @llvm.minnum.v3f32(<3 x float>, <3 x float>) #1
293 declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) #1
294
153295 declare float @llvm.maxnum.f32(float, float) #1
296 declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #1
297 declare <3 x float> @llvm.maxnum.v3f32(<3 x float>, <3 x float>) #1
298 declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) #1
299
154300 declare float @llvm.fmuladd.f32(float, float, float) #1
155301 declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) #1
156302 declare <3 x float> @llvm.fmuladd.v3f32(<3 x float>, <3 x float>, <3 x float>) #1