llvm.org GIT mirror llvm / 7287fcb
AMDGPU: Start selecting v_mad_mixlo_f16 Also add some tests that should be able to use v_mad_mixhi_f16, but do not yet. This is trickier because we don't really model the partial update of the register done by 16-bit instructions. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@313806 91177308-0d34-0410-b5e6-96231b3b80d8 Matt Arsenault 2 years ago
4 changed file(s) with 306 addition(s) and 0 deletion(s). Raw diff Collapse all Expand all
196196 bool SelectVOP3OpSelMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
197197 SDValue &Clamp) const;
198198 bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods) const;
199 bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
199200
200201 void SelectADD_SUB_I64(SDNode *N);
201202 void SelectUADDO_USUBO(SDNode *N);
19871988 }
19881989
19891990 return false;
1991 }
1992
// Operand matcher for mad-mix sources: forwards In to
// SelectVOP3PMadMixModsImpl to fill in Src and the modifier bits, then wraps
// those bits in an i32 target constant returned through SrcMods.
// Always returns true, so a pattern using this matcher never fails here.
1993 bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
1994 SDValue &SrcMods) const {
1995 unsigned Mods = 0;
// The helper's bool result is not checked; Src and Mods are passed by
// reference and whatever it leaves in them is used below.
1996 SelectVOP3PMadMixModsImpl(In, Src, Mods);
1997 SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
1998 return true;
19901999 }
19912000
19922001 void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
6767 // For 16-bit src arguments their location (hi/lo) are controlled by op_sel_hi.
6868 let isCommutable = 1 in {
6969 def V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;
70
71 // Clamp modifier is applied after conversion to f16.
7072 def V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>>;
7173 def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>>;
7274 }
75
76 let Predicates = [HasMadMix] in {
77
// Select v_mad_mixlo_f16 for an f32 fmad whose result is rounded to f16.
// Each of the three fmad operands is matched through VOP3PMadMixMods, which
// yields the f16 source and its i32 modifier operand; the output passes each
// modifier immediately before its source.
// NOTE(review): the trailing 0 is presumably the clamp operand — confirm
// against the instruction's operand list.
78 def : Pat <
79 (f16 (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
80 (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
81 (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
82 (V_MAD_MIXLO_F16 $src0_modifiers, $src0,
83 $src1_modifiers, $src1,
84 $src2_modifiers, $src2,
85 0)
86 >;
87
88 } // End Predicates = [HasMadMix]
7389
7490 multiclass VOP3P_Real_vi<bits<10> op> {
7591 def _vi : VOP3P_Real<!cast<VOP3P_Pseudo>(NAME), SIEncodingFamily.VI>,
0 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,VI %s
2 ; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s
3
4 ; FIXME: These cases should be able to use v_mad_mixhi_f16 and avoid
5 ; the packing.
6
7 ; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo:
8 ; GFX9: v_mad_mixlo_f16
9 ; GFX9: v_lshl_or_b32
10 define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo(half %src0, half %src1, half %src2) #0 {
11 %src0.ext = fpext half %src0 to float
12 %src1.ext = fpext half %src1 to float
13 %src2.ext = fpext half %src2 to float
14 %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
15 %cvt.result = fptrunc float %result to half
16 %vec.result = insertelement <2 x half> undef, half %cvt.result, i32 1
17 ret <2 x half> %vec.result
18 }
19
20 ; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo:
21 ; GFX9: v_mad_mixlo_f16
22 ; GFX9: v_lshl_or_b32
23 define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo(half %src0, half %src1, half %src2) #0 {
24 %src0.ext = fpext half %src0 to float
25 %src1.ext = fpext half %src1 to float
26 %src2.ext = fpext half %src2 to float
27 %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
28 %cvt.result = fptrunc float %result to half
29 %vec.result = insertelement <2 x half> <half 1.0, half undef>, half %cvt.result, i32 1
30 ret <2 x half> %vec.result
31 }
32
33 ; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
34 ; GFX9: v_mad_mixlo_f16
35 ; GFX9: v_lshl_or_b32
36 define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo(half %src0, half %src1, half %src2, half %lo) #0 {
37 %src0.ext = fpext half %src0 to float
38 %src1.ext = fpext half %src1 to float
39 %src2.ext = fpext half %src2 to float
40 %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
41 %cvt.result = fptrunc float %result to half
42 %vec = insertelement <2 x half> undef, half %lo, i32 0
43 %vec.result = insertelement <2 x half> %vec, half %cvt.result, i32 1
44 ret <2 x half> %vec.result
45 }
46
47 ; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
48 ; GFX9: v_mad_mixlo_f16 v0, v0, v1, v2
49 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
50 define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack(half %src0, half %src1, half %src2) #0 {
51 %src0.ext = fpext half %src0 to float
52 %src1.ext = fpext half %src1 to float
53 %src2.ext = fpext half %src2 to float
54 %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
55 %cvt.result = fptrunc float %result to half
56 %bc = bitcast half %cvt.result to i16
57 %ext = zext i16 %bc to i32
58 %shr = shl i32 %ext, 16
59 ret i32 %shr
60 }
61
62 ; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
63 ; GFX9: v_mad_mixlo_f16 v0, v0, v1, v2
64 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
65 define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext(half %src0, half %src1, half %src2) #0 {
66 %src0.ext = fpext half %src0 to float
67 %src1.ext = fpext half %src1 to float
68 %src2.ext = fpext half %src2 to float
69 %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
70 %cvt.result = fptrunc float %result to half
71 %bc = bitcast half %cvt.result to i16
72 %ext = sext i16 %bc to i32
73 %shr = shl i32 %ext, 16
74 ret i32 %shr
75 }
76
77 ; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt:
78 ; GFX9: v_mad_mix_f32 v0, v0, v1, v2 clamp{{$}}
79 ; GFX9: v_cvt_f16_f32_e32 v0, v0
80 define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt(half %src0, half %src1, half %src2) #0 {
81 %src0.ext = fpext half %src0 to float
82 %src1.ext = fpext half %src1 to float
83 %src2.ext = fpext half %src2 to float
84 %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
85 %max = call float @llvm.maxnum.f32(float %result, float 0.0)
86 %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
87 %cvt.result = fptrunc float %clamp to half
88 %vec.result = insertelement <2 x half> undef, half %cvt.result, i32 1
89 ret <2 x half> %vec.result
90 }
91
92 ; FIXME: Unnecessary junk to pack, and packing undef?
93 ; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt:
94 ; GFX9: v_mad_mixlo_f16 v0, v0, v1, v2 clamp{{$}}
95 ; GFX9-NEXT: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff{{$}}
96 ; GFX9-NEXT: v_and_b32_e32 [[AND:v[0-9]+]], s6, [[MASK]]
97 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, [[AND]]
98 ; GFX9-NEXT: s_setpc_b64
99 define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half %src0, half %src1, half %src2) #0 {
100 %src0.ext = fpext half %src0 to float
101 %src1.ext = fpext half %src1 to float
102 %src2.ext = fpext half %src2 to float
103 %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
104 %cvt.result = fptrunc float %result to half
105 %max = call half @llvm.maxnum.f16(half %cvt.result, half 0.0)
106 %clamp = call half @llvm.minnum.f16(half %max, half 1.0)
107 %vec.result = insertelement <2 x half> undef, half %clamp, i32 1
108 ret <2 x half> %vec.result
109 }
110
111 declare half @llvm.minnum.f16(half, half) #1
112 declare half @llvm.maxnum.f16(half, half) #1
113 declare float @llvm.minnum.f32(float, float) #1
114 declare float @llvm.maxnum.f32(float, float) #1
115 declare float @llvm.fmuladd.f32(float, float, float) #1
116 declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) #1
117
118 attributes #0 = { nounwind }
119 attributes #1 = { nounwind readnone speculatable }
0 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,VI %s
2 ; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s
3
4 ; GCN-LABEL: mixlo_simple:
5 ; GCN: s_waitcnt
6 ; GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[0,0,0]{{$}}
7 ; GFX9-NEXT: s_setpc_b64
8
9 ; CIVI: v_mac_f32_e32
10 ; CIVI: v_cvt_f16_f32_e32
11 define half @mixlo_simple(float %src0, float %src1, float %src2) #0 {
12 %result = call float @llvm.fmuladd.f32(float %src0, float %src1, float %src2)
13 %cvt.result = fptrunc float %result to half
14 ret half %cvt.result
15 }
16
17 ; GCN-LABEL: {{^}}v_mad_mixlo_f16_f16lo_f16lo_f16lo:
18 ; GFX9: v_mad_mixlo_f16 v0, v0, v1, v2{{$}}
19 ; CI: v_mac_f32
20 ; CIVI: v_cvt_f16_f32
21 define half @v_mad_mixlo_f16_f16lo_f16lo_f16lo(half %src0, half %src1, half %src2) #0 {
22 %src0.ext = fpext half %src0 to float
23 %src1.ext = fpext half %src1 to float
24 %src2.ext = fpext half %src2 to float
25 %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
26 %cvt.result = fptrunc float %result to half
27 ret half %cvt.result
28 }
29
30 ; GCN-LABEL: {{^}}v_mad_mixlo_f16_f16lo_f16lo_f32:
31 ; GCN: s_waitcnt
32 ; GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0]{{$}}
33 ; GFX9-NEXT: s_setpc_b64
34
35 ; CIVI: v_mac_f32
36 define half @v_mad_mixlo_f16_f16lo_f16lo_f32(half %src0, half %src1, float %src2) #0 {
37 %src0.ext = fpext half %src0 to float
38 %src1.ext = fpext half %src1 to float
39 %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
40 %cvt.result = fptrunc float %result to half
41 ret half %cvt.result
42 }
43
44 ; GCN-LABEL: {{^}}v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
45 ; GCN: s_waitcnt
46 ; GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp{{$}}
47 ; GFX9-NEXT: s_setpc_b64
48
49 ; CIVI: v_mac_f32_e32 v{{[0-9]}}, v{{[0-9]}}, v{{[0-9]$}}
50 define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %src1, float %src2) #0 {
51 %src0.ext = fpext half %src0 to float
52 %src1.ext = fpext half %src1 to float
53 %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
54 %cvt.result = fptrunc float %result to half
55 %max = call half @llvm.maxnum.f16(half %cvt.result, half 0.0)
56 %clamp = call half @llvm.minnum.f16(half %max, half 1.0)
57 ret half %clamp
58 }
59
60 ; GCN-LABEL: {{^}}v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt:
61 ; GCN: s_waitcnt
62 ; GFX9-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp{{$}}
63 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
64 ; GFX9-NEXT: s_setpc_b64
65
66 ; CIVI: v_mac_f32_e64 v{{[0-9]}}, v{{[0-9]}}, v{{[0-9]}} clamp{{$}}
67 define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt(half %src0, half %src1, float %src2) #0 {
68 %src0.ext = fpext half %src0 to float
69 %src1.ext = fpext half %src1 to float
70 %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2)
71 %max = call float @llvm.maxnum.f32(float %result, float 0.0)
72 %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
73 %cvt.result = fptrunc float %clamp to half
74 ret half %cvt.result
75 }
76
77 ; GCN-LABEL: {{^}}v_mad_mixlo_v2f32:
78 ; GFX9: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel:[1,1,1]
79 ; GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2
80 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
81 ; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0
82 ; GFX9-NEXT: s_setpc_b64
83 define <2 x half> @v_mad_mixlo_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
84 %src0.ext = fpext <2 x half> %src0 to <2 x float>
85 %src1.ext = fpext <2 x half> %src1 to <2 x float>
86 %src2.ext = fpext <2 x half> %src2 to <2 x float>
87 %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
88 %cvt.result = fptrunc <2 x float> %result to <2 x half>
89 ret <2 x half> %cvt.result
90 }
91
92 ; GCN-LABEL: {{^}}v_mad_mixlo_v3f32:
93 ; GCN: s_waitcnt
94 ; GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v3, v6
95 ; GFX9-NEXT: v_mad_mixlo_f16 v1, v1, v4, v7
96 ; GFX9-NEXT: v_mad_mixlo_f16 v2, v2, v5, v8
97 ; GFX9-NEXT: s_setpc_b64
98 define <3 x half> @v_mad_mixlo_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
99 %src0.ext = fpext <3 x half> %src0 to <3 x float>
100 %src1.ext = fpext <3 x half> %src1 to <3 x float>
101 %src2.ext = fpext <3 x half> %src2 to <3 x float>
102 %result = tail call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %src0.ext, <3 x float> %src1.ext, <3 x float> %src2.ext)
103 %cvt.result = fptrunc <3 x float> %result to <3 x half>
104 ret <3 x half> %cvt.result
105 }
106
107 ; GCN-LABEL: {{^}}v_mad_mixlo_v4f32:
108 ; GCN: s_waitcnt
109 ; GFX9-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel:[1,1,1]
110 ; GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v2, v4
111 ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
112 ; GFX9-NEXT: v_mad_mixlo_f16 v4, v1, v3, v5 op_sel:[1,1,1]
113 ; GFX9-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5
114 ; GFX9-NEXT: v_and_b32_e32 v0, v2, v0
115 ; GFX9-NEXT: v_and_b32_e32 v1, v2, v1
116 ; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0
117 ; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1
118 ; GFX9-NEXT: s_setpc_b64
119 define <4 x half> @v_mad_mixlo_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {
120 %src0.ext = fpext <4 x half> %src0 to <4 x float>
121 %src1.ext = fpext <4 x half> %src1 to <4 x float>
122 %src2.ext = fpext <4 x half> %src2 to <4 x float>
123 %result = tail call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %src0.ext, <4 x float> %src1.ext, <4 x float> %src2.ext)
124 %cvt.result = fptrunc <4 x float> %result to <4 x half>
125 ret <4 x half> %cvt.result
126 }
127
128 ; FIXME: Fold clamp
129 ; GCN-LABEL: {{^}}v_mad_mix_v2f32_clamp_postcvt:
130 ; GFX9: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel:[1,1,1]
131 ; GFX9: v_mad_mixlo_f16 v0, v0, v1, v2
132 ; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]]
133 ; GFX9: v_pk_max_f16 v0, [[PACKED]], [[PACKED]] clamp{{$}}
134 ; GFX9-NEXT: s_setpc_b64
135 define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
136 %src0.ext = fpext <2 x half> %src0 to <2 x float>
137 %src1.ext = fpext <2 x half> %src1 to <2 x float>
138 %src2.ext = fpext <2 x half> %src2 to <2 x float>
139 %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext)
140 %cvt.result = fptrunc <2 x float> %result to <2 x half>
141 %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %cvt.result, <2 x half> zeroinitializer)
142 %clamp = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half 1.0>)
143 ret <2 x half> %clamp
144 }
145
146 declare half @llvm.minnum.f16(half, half) #1
147 declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>) #1
148
149 declare half @llvm.maxnum.f16(half, half) #1
150 declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>) #1
151
152 declare float @llvm.minnum.f32(float, float) #1
153 declare float @llvm.maxnum.f32(float, float) #1
154 declare float @llvm.fmuladd.f32(float, float, float) #1
155 declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) #1
156 declare <3 x float> @llvm.fmuladd.v3f32(<3 x float>, <3 x float>, <3 x float>) #1
157 declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) #1
158
159 attributes #0 = { nounwind }
160 attributes #1 = { nounwind readnone speculatable }