llvm.org GIT mirror llvm / 572b727

AMDGPU: Try to use op_sel when selecting packed instructions

Avoids instructions to pack a vector when the source is really a scalar
being broadcast. Also be smarter and look for per-component fneg.

Doesn't yet handle scalar from upper half of register or other swizzles.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@303291 91177308-0d34-0410-b5e6-96231b3b80d8

Matt Arsenault
3 changed files with 298 additions and 4 deletions.
@@ -1706,8 +1706,36 @@
 
   // FIXME: Look for on separate components
   if (Src.getOpcode() == ISD::FNEG) {
-    Mods |= (SISrcMods::NEG | SISrcMods::NEG_HI);
+    Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
     Src = Src.getOperand(0);
+  }
+
+  if (Src.getOpcode() == ISD::BUILD_VECTOR) {
+    unsigned VecMods = Mods;
+
+    SDValue Lo = Src.getOperand(0);
+    SDValue Hi = Src.getOperand(1);
+
+    if (Lo.getOpcode() == ISD::FNEG) {
+      Lo = Lo.getOperand(0);
+      Mods ^= SISrcMods::NEG;
+    }
+
+    if (Hi.getOpcode() == ISD::FNEG) {
+      Hi = Hi.getOperand(0);
+      Mods ^= SISrcMods::NEG_HI;
+    }
+
+    if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
+      // Really a scalar input. Just select from the low half of the register to
+      // avoid packing.
+
+      Src = Lo;
+      SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+      return true;
+    }
+
+    Mods = VecMods;
   }
 
   // Packed instructions do not have abs modifiers.
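One detail worth noting in the hunk above: the existing `Mods |= ...` for a whole-vector fneg becomes `Mods ^= ...`, and the new per-component checks also toggle with `^=`. Toggling lets an outer fneg cancel against fneg on the individual components, which the fma_vector_vector_neg_broadcast_neg_scalar_lo test added below depends on. A tiny sketch of the cancellation, using illustrative bit values rather than the real SISrcMods encoding:

#include <cassert>

// Simplified stand-ins for the SISrcMods bits used above (values are
// illustrative, not the real encoding).
enum : unsigned { NEG = 1u << 0, NEG_HI = 1u << 1 };

int main() {
  // fneg of the whole vector toggles both per-lane neg bits...
  unsigned Mods = 0;
  Mods ^= (NEG | NEG_HI);
  // ...and inner per-component fnegs toggle them back, so a double
  // negation selects with no modifiers at all. With |= instead of ^=,
  // a bit could never be cleared and the negations would not cancel.
  Mods ^= NEG;
  Mods ^= NEG_HI;
  assert(Mods == 0);
  return 0;
}

The test update that follows (commute_add_literal_v2f16) shows the selector change applied to a packed literal operand.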
@@ -287,9 +287,9 @@
 }
 
 ; GCN-LABEL: {{^}}commute_add_literal_v2f16:
-; GFX9: buffer_load_dword [[VAL:v[0-9]+]]
-; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x64006400
-; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[K]], [[VAL]]
+; GFX9-DAG: buffer_load_dword [[VAL:v[0-9]+]]
+; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x6400{{$}}
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[K]], [[VAL]] op_sel_hi:[0,1]{{$}}
 ; GFX9: buffer_store_dword [[REG]]
 
 ; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x6400{{$}}
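The updated checks above drop the materialized packed literal: 0x64006400 is just the half value 0x6400 repeated in both 16-bit halves, so a single v_mov_b32_e32 of 0x6400 with op_sel_hi:[0,1] on the v_pk_add_f16 broadcasts the low half instead. A quick sanity check of that bit pattern; decodeHalf is a hand-rolled, hypothetical binary16 decoder that only handles normal values:

#include <cstdio>

// Decode an IEEE 754 binary16 bit pattern (normal values only).
static float decodeHalf(unsigned bits) {
  int sign = (bits >> 15) & 1;
  int exp  = (bits >> 10) & 0x1f;
  int mant = bits & 0x3ff;
  float v = 1.0f + mant / 1024.0f;
  for (int e = exp - 15; e > 0; --e) v *= 2.0f;
  for (int e = exp - 15; e < 0; ++e) v *= 0.5f;
  return sign ? -v : v;
}

int main() {
  // 0x6400 is half 1024.0; 0x64006400 is that value packed into both
  // halves. With op_sel_hi:[0,1] the single-half literal is broadcast,
  // so the s_mov_b32 of the packed 0x64006400 is no longer needed.
  printf("0x6400 = %g\n", decodeHalf(0x6400)); // prints 1024
  return 0;
}

The newly added test file below exercises the scalar-broadcast and per-component fneg cases end to end.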
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 %s

; GCN-LABEL: {{^}}fma_vector_vector_scalar_lo:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0]{{$}}
define amdgpu_kernel void @fma_vector_vector_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2

  %scalar0.vec = insertelement <2 x half> undef, half %scalar0, i32 0
  %scalar0.broadcast = shufflevector <2 x half> %scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer

  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %scalar0.broadcast)
  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; Apply fneg to broadcasted vector
; GCN-LABEL: {{^}}fma_vector_vector_neg_broadcast_scalar_lo:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_neg_broadcast_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2

  %scalar0.vec = insertelement <2 x half> undef, half %scalar0, i32 0
  %scalar0.broadcast = shufflevector <2 x half> %scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer
  %neg.scalar0.broadcast = fsub <2 x half> <half -0.0, half -0.0>, %scalar0.broadcast

  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.scalar0.broadcast)
  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; Apply fneg before broadcast
; GCN-LABEL: {{^}}fma_vector_vector_neg_scalar_lo:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2

  %neg.scalar0 = fsub half -0.0, %scalar0
  %neg.scalar0.vec = insertelement <2 x half> undef, half %neg.scalar0, i32 0
  %neg.scalar0.broadcast = shufflevector <2 x half> %neg.scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer

  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.scalar0.broadcast)
  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; Apply fneg before and after broadcast; the two negations should cancel out.
; GCN-LABEL: {{^}}fma_vector_vector_neg_broadcast_neg_scalar_lo:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0]{{$}}
define amdgpu_kernel void @fma_vector_vector_neg_broadcast_neg_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2

  %neg.scalar0 = fsub half -0.0, %scalar0
  %neg.scalar0.vec = insertelement <2 x half> undef, half %neg.scalar0, i32 0
  %neg.scalar0.broadcast = shufflevector <2 x half> %neg.scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer
  %neg.neg.scalar0.broadcast = fsub <2 x half> <half -0.0, half -0.0>, %neg.scalar0.broadcast

  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.neg.scalar0.broadcast)
  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; Add scalar, but negate low component
; GCN-LABEL: {{^}}fma_vector_vector_scalar_neg_lo:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_lo:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_scalar_neg_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2

  %neg.scalar0 = fsub half -0.0, %scalar0
  %neg.scalar0.vec = insertelement <2 x half> undef, half %neg.scalar0, i32 0
  %neg.scalar0.scalar0 = insertelement <2 x half> %neg.scalar0.vec, half %scalar0, i32 1
  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.scalar0.scalar0)
  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; Add scalar, but negate high component
; GCN-LABEL: {{^}}fma_vector_vector_scalar_neg_hi:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_hi:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_scalar_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2

  %neg.scalar0 = fsub half -0.0, %scalar0
  %neg.scalar0.vec = insertelement <2 x half> undef, half %scalar0, i32 0
  %scalar0.neg.scalar0 = insertelement <2 x half> %neg.scalar0.vec, half %neg.scalar0, i32 1
  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %scalar0.neg.scalar0)
  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; Apply fneg before broadcast with bitcast
; GCN-LABEL: {{^}}add_vector_neg_bitcast_scalar_lo:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]

; GCN-NOT: pack
; GCN-NOT: and
; GCN-NOT: shl
; GCN-NOT: or

; GCN: v_xor_b32_e32 [[NEG_SCALAR0:v[0-9]+]], 0x8000, [[SCALAR0]]
; GCN-NEXT: v_pk_add_u16 v{{[0-9]+}}, [[VEC0]], [[NEG_SCALAR0]] op_sel_hi:[1,0]{{$}}
define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
bb:
  %vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4
  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
  %neg.scalar0 = fsub half -0.0, %scalar0
  %neg.scalar0.bc = bitcast half %neg.scalar0 to i16

  %neg.scalar0.vec = insertelement <2 x i16> undef, i16 %neg.scalar0.bc, i32 0
  %neg.scalar0.broadcast = shufflevector <2 x i16> %neg.scalar0.vec, <2 x i16> undef, <2 x i32> zeroinitializer

  %result = add <2 x i16> %vec0, %neg.scalar0.broadcast
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}fma_vector_vector_scalar_lo_neg_scalar_hi:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
; GCN: ds_read_u16 [[SCALAR1:v[0-9]+]]

; FIXME: Remove and
; GCN: v_and_b32_e32 [[SCALAR0]], 0xffff, [[SCALAR0]]
; GCN: v_xor_b32_e32 [[SCALAR1]], 0x8000, [[SCALAR1]]
; GCN: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[SCALAR1]], 16, [[SCALAR0]]

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[PACKED]]{{$}}
define amdgpu_kernel void @fma_vector_vector_scalar_lo_neg_scalar_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  %arg2.gep = getelementptr inbounds half, half addrspace(3)* %arg2, i32 2

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4

  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
  %scalar1 = load volatile half, half addrspace(3)* %arg2.gep, align 2

  %neg.scalar1 = fsub half -0.0, %scalar1
  %vec.ins0 = insertelement <2 x half> undef, half %scalar0, i32 0
  %vec2 = insertelement <2 x half> %vec.ins0, half %neg.scalar1, i32 1
  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2)
  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}fma_vector_vector_neg_scalar_lo_scalar_hi:
; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
; GCN: ds_read_u16 [[SCALAR1:v[0-9]+]]

; FIXME: Remove and
; GCN: v_and_b32_e32 [[SCALAR0]], 0xffff, [[SCALAR0]]
; GCN: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[SCALAR1]], 16, [[SCALAR0]]

; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[PACKED]] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
bb:
  %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
  %arg2.gep = getelementptr inbounds half, half addrspace(3)* %arg2, i32 2

  %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4

  %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
  %scalar1 = load volatile half, half addrspace(3)* %arg2.gep, align 2

  %vec.ins0 = insertelement <2 x half> undef, half %scalar0, i32 0
  %vec2 = insertelement <2 x half> %vec.ins0, half %scalar1, i32 1
  %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2

  %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.vec2)
  store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
  ret void
}

declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }