//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This is the parent TargetLowering class for hardware code gen
/// targets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUMachineFunction.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Support/CommandLine.h"

using namespace llvm;

#include "AMDGPUGenCallingConv.inc"

static cl::opt<bool> AMDGPUBypassSlowDiv(
    "amdgpu-bypass-slow-div",
    cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
    cl::init(true));

// Find a larger type to do a load / store of a vector with.
static EVT getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
  unsigned StoreSize = VT.getStoreSizeInBits();
  if (StoreSize <= 32)
    return EVT::getIntegerVT(Ctx, StoreSize);

  assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
  return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
}
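// For example, a 16-bit v2i8 store maps to i16, while a 64-bit v4f16 store
// maps to v2i32; the assert above guarantees every size over 32 bits is a
// whole number of dwords.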

unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
  return DAG.computeKnownBits(Op).countMaxActiveBits();
}

unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
  // In order for this to be a signed 24-bit value, bit 23 must be a sign bit.
  return DAG.ComputeMaxSignificantBits(Op);
}
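// For example, a value known to lie in [-128, 127] reports 8 significant
// bits; callers treat any result <= 24 as fitting a signed 24-bit multiply
// operand, since bits [31:23] are then all copies of the sign bit.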
58
60 const AMDGPUSubtarget &STI)
61 : TargetLowering(TM), Subtarget(&STI) {
62 // Always lower memset, memcpy, and memmove intrinsics to load/store
63 // instructions, rather then generating calls to memset, mempcy or memmove.
67
  // Lower floating point store/load to integer store/load to reduce the number
  // of patterns in tablegen.
  setOperationAction(ISD::LOAD, MVT::f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);

  setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);

  setOperationAction(ISD::LOAD, MVT::v6f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);

  setOperationAction(ISD::LOAD, MVT::v7f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);

  setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v9f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);

  setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);

  setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);

  setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);

  setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);

  setOperationAction(ISD::LOAD, MVT::i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::v3i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);

  setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v3f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);

  setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v8i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::v8f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);

  setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);

  setOperationAction(ISD::LOAD, MVT::i128, Promote);
  AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);

  // TODO: Would be better to consume as directly legal
  setOperationAction(ISD::ATOMIC_LOAD, MVT::f32, Promote);
  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);

  setOperationAction(ISD::ATOMIC_LOAD, MVT::f64, Promote);
  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);

  setOperationAction(ISD::ATOMIC_LOAD, MVT::f16, Promote);
  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);

  setOperationAction(ISD::ATOMIC_LOAD, MVT::bf16, Promote);
  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::bf16, MVT::i16);

  setOperationAction(ISD::ATOMIC_STORE, MVT::f32, Promote);
  AddPromotedToType(ISD::ATOMIC_STORE, MVT::f32, MVT::i32);

  setOperationAction(ISD::ATOMIC_STORE, MVT::f64, Promote);
  AddPromotedToType(ISD::ATOMIC_STORE, MVT::f64, MVT::i64);

  setOperationAction(ISD::ATOMIC_STORE, MVT::f16, Promote);
  AddPromotedToType(ISD::ATOMIC_STORE, MVT::f16, MVT::i16);

  setOperationAction(ISD::ATOMIC_STORE, MVT::bf16, Promote);
  AddPromotedToType(ISD::ATOMIC_STORE, MVT::bf16, MVT::i16);

  // There are no 64-bit extloads. These should be done as a 32-bit extload and
  // an extension to 64-bit.
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::i64, VT,
                     Expand);

  for (MVT VT : MVT::integer_valuetypes()) {
    if (VT == MVT::i64)
      continue;

    for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
      setLoadExtAction(Op, VT, MVT::i1, Promote);
      setLoadExtAction(Op, VT, MVT::i8, Legal);
      setLoadExtAction(Op, VT, MVT::i16, Legal);
      setLoadExtAction(Op, VT, MVT::i32, Expand);
    }
  }
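  // Net effect: i8 and i16 sources extend natively into any wider legal
  // integer type, an i1 source is promoted to i8 first, and extending loads
  // from an i32 source are expanded into a plain 32-bit load plus an explicit
  // extension.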

  for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
    for (auto MemVT :
         {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
      setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, VT, MemVT,
                       Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);

  setOperationAction(ISD::STORE, MVT::f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);

  setOperationAction(ISD::STORE, MVT::v2f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v3f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::STORE, MVT::v4f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::v5f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);

  setOperationAction(ISD::STORE, MVT::v6f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);

  setOperationAction(ISD::STORE, MVT::v7f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);

  setOperationAction(ISD::STORE, MVT::v8f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v9f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);

  setOperationAction(ISD::STORE, MVT::v10f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);

  setOperationAction(ISD::STORE, MVT::v11f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);

  setOperationAction(ISD::STORE, MVT::v12f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);

  setOperationAction(ISD::STORE, MVT::v16f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::v32f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);

  setOperationAction(ISD::STORE, MVT::i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v2i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v2f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::v3i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);

  setOperationAction(ISD::STORE, MVT::v3f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);

  setOperationAction(ISD::STORE, MVT::v4i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v4f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v8i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::v8f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::v16i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);

  setOperationAction(ISD::STORE, MVT::v16f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);

  setOperationAction(ISD::STORE, MVT::i128, Promote);
  AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);

  setTruncStoreAction(MVT::i64, MVT::i1, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);

  setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);

  setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
  setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
  setTruncStoreAction(MVT::v3f32, MVT::v3bf16, Expand);
  setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
  setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
  setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
  setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
  setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
  setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);
  setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
  setTruncStoreAction(MVT::v32f32, MVT::v32bf16, Expand);
  setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);

  setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
  setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Expand);
  setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);

  setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);

  setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
  setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
  setTruncStoreAction(MVT::v3f64, MVT::v3bf16, Expand);
  setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);

  setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
  setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
  setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
  setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
  setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);

  setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
  setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
  setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);

  setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
  setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand);
  setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);

  setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
  setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);

  setOperationAction({ISD::BR_JT, ISD::BRIND}, MVT::Other, Expand);

  // For R600, this is totally unsupported, just custom lower to produce an
  // error.
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);

  // Library functions. These default to Expand, but we have instructions
  // for them.
  setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR,
                      ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM},
                     MVT::f32, Legal);

  setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);

  setOperationAction({ISD::FLOG2, ISD::FLOG, ISD::FLOG10, ISD::FEXP,
                      ISD::FEXP2, ISD::FEXP10}, MVT::f32,
                     Custom);

  setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  if (Subtarget->has16BitInsts())
    setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
  else {
    setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
    setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom);
  }

  setOperationAction({ISD::FLOG10, ISD::FLOG, ISD::FEXP, ISD::FEXP10}, MVT::f16,
                     Custom);

  // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
  // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
  // default unless marked custom/legal.
  setOperationAction(
      ISD::IS_FPCLASS,
      {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16, MVT::v2f32, MVT::v3f32,
       MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
       MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64, MVT::v16f64},
      Custom);

  // Expand to fneg + fadd.
  setOperationAction(ISD::FSUB, MVT::f64, Expand);

  setOperationAction(ISD::EXTRACT_SUBVECTOR,
                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
                      MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
                      MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
                      MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
                     Custom);

  // FIXME: Why is v8f16/v8bf16 missing?
  setOperationAction(
      ISD::CONCAT_VECTORS,
      {MVT::v2f16, MVT::v2bf16, MVT::v2i16, MVT::v4f16, MVT::v4bf16,
       MVT::v4i16, MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32,
       MVT::v4f32, MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32,
       MVT::v6i32, MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32,
       MVT::v9f32, MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32,
       MVT::v11f32, MVT::v12i32, MVT::v12f32, MVT::v16f16, MVT::v16bf16,
       MVT::v16i16, MVT::v16f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
       MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
       MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64,
       MVT::v32i16, MVT::v32f16, MVT::v32bf16},
      Custom);

  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);

  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
  for (MVT VT : ScalarIntVTs) {
    // These should use [SU]DIVREM, so set them to expand
    setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM}, VT,
                       Expand);

    // GPU does not have divrem function for signed or unsigned.
    setOperationAction({ISD::SDIVREM, ISD::UDIVREM}, VT, Custom);

    // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
    setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT, Expand);

    setOperationAction({ISD::BSWAP, ISD::CTTZ, ISD::CTLZ}, VT, Expand);

    // AMDGPU uses ADDC/SUBC/ADDE/SUBE
    setOperationAction({ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Legal);
  }

  // The hardware supports 32-bit FSHR, but not FSHL.
  setOperationAction(ISD::FSHR, MVT::i32, Legal);

  // The hardware supports 32-bit ROTR, but not ROTL.
  setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
  setOperationAction(ISD::ROTR, MVT::i64, Expand);

  setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM, ISD::SDIVREM,
                      ISD::UDIVREM},
                     MVT::i64, Custom);

  setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32,
                     Legal);

  setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ,
                      ISD::CTLZ_ZERO_UNDEF},
                     MVT::i64, Custom);

  for (auto VT : {MVT::i8, MVT::i16})
    setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, VT, Custom);

  static const MVT::SimpleValueType VectorIntTypes[] = {
      MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
      MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};

  for (MVT VT : VectorIntTypes) {
    // Expand the following operations for the current type by default.
    setOperationAction({ISD::ADD,        ISD::AND,     ISD::FP_TO_SINT,
                        ISD::FP_TO_UINT, ISD::MUL,     ISD::MULHU,
                        ISD::MULHS,      ISD::OR,      ISD::SHL,
                        ISD::SRA,        ISD::SRL,     ISD::ROTL,
                        ISD::ROTR,       ISD::SUB,     ISD::SINT_TO_FP,
                        ISD::UINT_TO_FP, ISD::SDIV,    ISD::UDIV,
                        ISD::SREM,       ISD::UREM,    ISD::SMUL_LOHI,
                        ISD::UMUL_LOHI,  ISD::SDIVREM, ISD::UDIVREM,
                        ISD::SELECT,     ISD::VSELECT, ISD::SELECT_CC,
                        ISD::XOR,        ISD::BSWAP,   ISD::CTPOP,
                        ISD::CTTZ,       ISD::CTLZ,    ISD::FP_TO_FP16,
                        ISD::SETCC},
                       VT, Expand);
  }

  static const MVT::SimpleValueType FloatVectorTypes[] = {
      MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
      MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};

  for (MVT VT : FloatVectorTypes) {
    setOperationAction(
        {ISD::FABS,      ISD::FMINNUM,    ISD::FMAXNUM,       ISD::FADD,
         ISD::FCEIL,     ISD::FCOS,       ISD::FDIV,          ISD::FEXP2,
         ISD::FEXP,      ISD::FEXP10,     ISD::FLOG2,         ISD::FREM,
         ISD::FLOG,      ISD::FLOG10,     ISD::FPOW,          ISD::FFLOOR,
         ISD::FTRUNC,    ISD::FMUL,       ISD::FMA,           ISD::FRINT,
         ISD::FNEARBYINT, ISD::FSQRT,     ISD::FSIN,          ISD::FSUB,
         ISD::FNEG,      ISD::SELECT,     ISD::VSELECT,       ISD::SELECT_CC,
         ISD::FCOPYSIGN, ISD::VECTOR_SHUFFLE, ISD::SETCC,
         ISD::FCANONICALIZE, ISD::FROUNDEVEN},
        VT, Expand);
  }

  // This causes using an unrolled select operation rather than expansion with
  // bit operations. This is in general better, but the alternative using BFI
  // instructions may be better if the select sources are SGPRs.
  setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);

  setOperationAction(ISD::SELECT, MVT::v6f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);

  setOperationAction(ISD::SELECT, MVT::v7f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);

  setOperationAction(ISD::SELECT, MVT::v9f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);

  setOperationAction(ISD::SELECT, MVT::v10f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);

  setOperationAction(ISD::SELECT, MVT::v11f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);

  setOperationAction(ISD::SELECT, MVT::v12f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);

  // Disable most libcalls.
  for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I) {
    if (I < RTLIB::ATOMIC_LOAD || I > RTLIB::ATOMIC_FETCH_NAND_16)
      setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
  }

  setJumpIsExpensive(true);

  // FIXME: This is only partially true. If we have to do vector compares, any
  // SGPR pair can be a condition register. If we have a uniform condition, we
  // are better off doing SALU operations, where there is only one SCC. For now,
  // we don't have a way of knowing during instruction selection if a condition
  // will be uniform and we always use vector compares. Assume we are using
  // vector compares until that is fixed.
  setHasMultipleConditionRegisters(true);

  setMinCmpXchgSizeInBits(32);
  setSupportsUnalignedAtomics(false);

  // We want to find all load dependencies for long chains of stores to enable
  // merging into very wide vectors. The problem is with vectors with > 4
  // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
  // vectors are a legal type, even though we have to split the loads
  // usually. When we can more precisely specify load legality per address
  // space, we should be able to make FindBetterChain/MergeConsecutiveStores
  // smarter so that they can figure out what to do in 2 iterations without all
  // N > 4 stores on the same chain.
  GatherAllAliasesMaxDepth = 16;

  // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
  // about these during lowering.
  MaxStoresPerMemcpy = 0xffffffff;
  MaxStoresPerMemmove = 0xffffffff;
  MaxStoresPerMemset = 0xffffffff;

  // The expansion for 64-bit division is enormous.
  if (AMDGPUBypassSlowDiv)
    addBypassSlowDiv(64, 32);

  setTargetDAGCombine({ISD::BITCAST,    ISD::SHL,
                       ISD::SRA,        ISD::SRL,
                       ISD::TRUNCATE,   ISD::MUL,
                       ISD::SMUL_LOHI,  ISD::UMUL_LOHI,
                       ISD::MULHU,      ISD::MULHS,
                       ISD::SELECT,     ISD::SELECT_CC,
                       ISD::STORE,      ISD::FADD,
                       ISD::FSUB,       ISD::FNEG,
                       ISD::FABS,       ISD::AssertZext,
                       ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN});

  setMaxAtomicSizeInBitsSupported(64);
  setMaxDivRemBitWidthSupported(64);
  setMaxLargeFPConvertBitWidthSupported(64);
}

bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
  if (getTargetMachine().Options.NoSignedZerosFPMath)
    return true;

  const auto Flags = Op.getNode()->getFlags();
  if (Flags.hasNoSignedZeros())
    return true;

  return false;
}

//===----------------------------------------------------------------------===//
// Target Information
//===----------------------------------------------------------------------===//

LLVM_READNONE
static bool fnegFoldsIntoOpcode(unsigned Opc) {
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
  case ISD::SELECT:
  case ISD::FSIN:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUNDEVEN:
  case ISD::FCANONICALIZE:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY:
  case AMDGPUISD::FMED3:
    // TODO: handle llvm.amdgcn.fma.legacy
    return true;
  case ISD::BITCAST:
    llvm_unreachable("bitcast is special cased");
  default:
    return false;
  }
}

static bool fnegFoldsIntoOp(const SDNode *N) {
  unsigned Opc = N->getOpcode();
  if (Opc == ISD::BITCAST) {
    // TODO: Is there a benefit to checking the conditions performFNegCombine
    // does? We don't for the other cases.
    SDValue BCSrc = N->getOperand(0);
    if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
      return BCSrc.getNumOperands() == 2 &&
             BCSrc.getOperand(1).getValueSizeInBits() == 32;
    }

    return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
  }

  return fnegFoldsIntoOpcode(Opc);
}

/// \returns true if the operation will definitely need to use a 64-bit
/// encoding, and thus will use a VOP3 encoding regardless of the source
/// modifiers.
LLVM_READONLY
static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
  return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
         VT == MVT::f64;
}

/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for
/// the type for ISD::SELECT.
LLVM_READONLY
static bool selectSupportsSourceMods(const SDNode *N) {
  // TODO: Only applies if select will be vector
  return N->getValueType(0) == MVT::f32;
}

// Most FP instructions support source modifiers, but this could be refined
// slightly.
LLVM_READONLY
static bool hasSourceMods(const SDNode *N) {
  if (isa<MemSDNode>(N))
    return false;

  switch (N->getOpcode()) {
  case ISD::CopyToReg:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::INLINEASM:
  case ISD::INLINEASM_BR:
    return false;

  // TODO: Should really be looking at the users of the bitcast. These are
  // problematic because bitcasts are used to legalize all stores to integer
  // types.
  case ISD::BITCAST:
    return false;
  case ISD::INTRINSIC_WO_CHAIN: {
    switch (N->getConstantOperandVal(0)) {
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
      return false;
    default:
      return true;
    }
  }
  case ISD::SELECT:
    return selectSupportsSourceMods(N);
  default:
    return true;
  }
}

bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
                                                 unsigned CostThreshold) {
  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
  // it is truly free to use a source modifier in all cases. If there are
  // multiple users and each one would be forced into a VOP3 encoding, there
  // will be a code size increase. Try to avoid increasing code size unless we
  // know it will save on the instruction count.
  unsigned NumMayIncreaseSize = 0;
  MVT VT = N->getValueType(0).getScalarType().getSimpleVT();

  assert(!N->use_empty());

  // XXX - Should this limit number of uses to check?
  for (const SDNode *U : N->uses()) {
    if (!hasSourceMods(U))
      return false;

    if (!opMustUseVOP3Encoding(U, VT)) {
      if (++NumMayIncreaseSize > CostThreshold)
        return false;
    }
  }

  return true;
}
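// For example, an fneg whose only users are FMA/FMAD folds for free, since
// those already use VOP3; each user that could otherwise stay VOP2 (e.g. a
// lone FADD) counts against CostThreshold before the fold is allowed.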

EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
                                              ISD::NodeType ExtendKind) const {
  assert(!VT.isVector() && "only scalar expected");

  // Round to the next multiple of 32-bits.
  unsigned Size = VT.getSizeInBits();
  if (Size <= 32)
    return MVT::i32;
  return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
}
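// For example, an i8 or i24 return is widened to i32, while i48 rounds up to
// i64 = 32 * ((48 + 31) / 32).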

MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
  return MVT::i32;
}

bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
  return true;
}

// The backend supports 32 and 64 bit floating point immediates.
// FIXME: Why are we reporting vectors of FP immediates as legal?
bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                        bool ForCodeSize) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
          (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
}

// We don't want to shrink f64 / f32 constants.
bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
}

bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
                                                 ISD::LoadExtType ExtTy,
                                                 EVT NewVT) const {
  // TODO: This may be worth removing. Check regression tests for diffs.
  if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
    return false;

  unsigned NewSize = NewVT.getStoreSizeInBits();

  // If we are reducing to a 32-bit load or a smaller multi-dword load,
  // this is always better.
  if (NewSize >= 32)
    return true;

  EVT OldVT = N->getValueType(0);
  unsigned OldSize = OldVT.getStoreSizeInBits();

  MemSDNode *MN = cast<MemSDNode>(N);
  unsigned AS = MN->getAddressSpace();
  // Do not shrink an aligned scalar load to sub-dword.
  // Scalar engine cannot do sub-dword loads.
  // TODO: Update this for GFX12 which does have scalar sub-dword loads.
  if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
      (AS == AMDGPUAS::CONSTANT_ADDRESS ||
       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
       (isa<LoadSDNode>(N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
        MN->isInvariant())) &&
      AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()))
    return false;

  // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
  // extloads, so doing one requires using a buffer_load. In cases where we
  // still couldn't use a scalar load, using the wider load shouldn't really
  // hurt anything.

  // If the old size already had to be an extload, there's no harm in continuing
  // to reduce the width.
  return (OldSize < 32);
}
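// For example, narrowing an extending i64 load to a 32-bit load is always
// accepted (NewSize >= 32), while shrinking an aligned, uniform 32-bit scalar
// load to 16 bits is rejected because the scalar unit lacks sub-dword loads.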

bool AMDGPUTargetLowering::isLoadBitCastBeneficial(
    EVT LoadTy, EVT CastTy, const SelectionDAG &DAG,
    const MachineMemOperand &MMO) const {

  assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());

  if (LoadTy.getScalarType() == MVT::i32)
    return false;

  unsigned LScalarSize = LoadTy.getScalarSizeInBits();
  unsigned CastScalarSize = CastTy.getScalarSizeInBits();

  if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
    return false;

  unsigned Fast = 0;
  return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                        CastTy, MMO, &Fast) &&
         Fast;
}

// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
// profitable with the expansion for 64-bit since it's generally good to
// speculate things.
bool AMDGPUTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
  return true;
}

bool AMDGPUTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
  return true;
}

bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
  switch (N->getOpcode()) {
  case ISD::EntryToken:
  case ISD::TokenFactor:
    return true;
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrID = N->getConstantOperandVal(0);
    return AMDGPU::isIntrinsicAlwaysUniform(IntrID);
  }
  case ISD::LOAD:
    if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
        AMDGPUAS::CONSTANT_ADDRESS_32BIT)
      return true;
    return false;
  case AMDGPUISD::SETCC: // ballot-style instruction
    return true;
  }
  return false;
}

SDValue AMDGPUTargetLowering::getNegatedExpression(
    SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
    NegatibleCost &Cost, unsigned Depth) const {

  switch (Op.getOpcode()) {
  case ISD::FMA:
  case ISD::FMAD: {
    // Negating a fma is not free if it has users without source mods.
    if (!allUsesHaveSourceMods(Op.getNode()))
      return SDValue();
    break;
  }
  case AMDGPUISD::RCP: {
    SDValue Src = Op.getOperand(0);
    EVT VT = Op.getValueType();
    SDLoc SL(Op);

    SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
                                          ForCodeSize, Cost, Depth + 1);
    if (NegSrc)
      return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
    return SDValue();
  }
  default:
    break;
  }

  return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
                                              ForCodeSize, Cost, Depth);
}

//===---------------------------------------------------------------------===//
// Target Properties
//===---------------------------------------------------------------------===//

bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
  assert(VT.isFloatingPoint());

  // Packed operations do not have a fabs modifier.
  return VT == MVT::f32 || VT == MVT::f64 ||
         (Subtarget->has16BitInsts() && (VT == MVT::f16 || VT == MVT::bf16));
}

bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
  assert(VT.isFloatingPoint());
  // Report this based on the end legalized type.
  VT = VT.getScalarType();
  return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
}

bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT,
                                                        unsigned NumElem,
                                                        unsigned AS) const {
  return true;
}

bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
  // There are few operations which truly have vector input operands. Any vector
  // operation is going to involve operations on each component, and a
  // build_vector will be a copy per element, so it always makes sense to use a
  // build_vector input in place of the extracted element to avoid a copy into a
  // super register.
  //
  // We should probably only do this if all users are extracts only, but this
  // should be the common case.
  return true;
}

bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source.getSizeInBits();
  unsigned DestSize = Dest.getSizeInBits();

  return DestSize < SrcSize && DestSize % 32 == 0;
}

bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (DestSize == 16 && Subtarget->has16BitInsts())
    return SrcSize >= 32;

  return DestSize < SrcSize && DestSize % 32 == 0;
}
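// For example, truncating i64 to i32 is free (it just reads the low 32-bit
// subregister), and with 16-bit instructions a 32-to-16-bit truncate is also
// free; i32 to i8 is not, since 8 is not a multiple of the 32-bit register
// size.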

bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
  unsigned SrcSize = Src->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (SrcSize == 16 && Subtarget->has16BitInsts())
    return DestSize >= 32;

  return SrcSize == 32 && DestSize == 64;
}

bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
  // Any register load of a 64-bit value really requires 2 32-bit moves. For all
  // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
  // this will enable reducing 64-bit operations to 32-bit, which is always
  // good.

  if (Src == MVT::i16)
    return Dest == MVT::i32 || Dest == MVT::i64;

  return Src == MVT::i32 && Dest == MVT::i64;
}

bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
  // limited number of native 64-bit operations. Shrinking an operation to fit
  // in a single 32-bit register should always be helpful. As currently used,
  // this is much less general than the name suggests, and is only used in
  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
  // not profitable, and may actually be harmful.
  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
}
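// For example, narrowing an i64 operation to i32 replaces a 32-bit pair with
// a single 32-bit instruction, but narrowing i32 to i16 buys nothing: a
// 16-bit value still occupies a full 32-bit register.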

bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(
    const SDNode *N, CombineLevel Level) const {
  assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
          N->getOpcode() == ISD::SRL) &&
         "Expected shift op");
  // Always commute pre-type legalization and right shifts.
  // We're looking for shl(or(x,y),z) patterns.
  if (Level < CombineLevel::AfterLegalizeTypes ||
      N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
    return true;

  // If the only user is an i32 right-shift, then don't destroy a BFE pattern.
  if (N->getValueType(0) == MVT::i32 && N->use_size() == 1 &&
      (N->use_begin()->getOpcode() == ISD::SRA ||
       N->use_begin()->getOpcode() == ISD::SRL))
    return false;

  // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
  auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
    if (LHS.getOpcode() != ISD::SHL)
      return false;
    auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
    auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
    auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
    return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
           LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
           RHSLd->getExtensionType() == ISD::ZEXTLOAD;
  };
  SDValue LHS = N->getOperand(0).getOperand(0);
  SDValue RHS = N->getOperand(0).getOperand(1);
  return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
}

//===---------------------------------------------------------------------===//
// TargetLowering Callbacks
//===---------------------------------------------------------------------===//

CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                  bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return CC_AMDGPU;
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
    return CC_AMDGPU_CS_CHAIN;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return CC_AMDGPU_Func;
  case CallingConv::AMDGPU_Gfx:
    return CC_SI_Gfx;
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
  default:
    report_fatal_error("Unsupported calling convention for call");
  }
}

CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                    bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    llvm_unreachable("kernels should not be handled here");
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return RetCC_SI_Shader;
  case CallingConv::AMDGPU_Gfx:
    return RetCC_SI_Gfx;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return RetCC_AMDGPU_Func;
  default:
    report_fatal_error("Unsupported calling convention.");
  }
}

/// The SelectionDAGBuilder will automatically promote function arguments
/// with illegal types. However, this does not work for the AMDGPU targets
/// since the function arguments are stored in memory as these illegal types.
/// In order to handle this properly we need to get the original types sizes
/// from the LLVM IR Function and fix up the ISD::InputArg values before
/// passing them to AnalyzeFormalArguments()

/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
/// input values across multiple registers. Each item in the Ins array
/// represents a single value that will be stored in registers. Ins[x].VT is
/// the value type of the value that will be stored in the register, so
/// whatever SDNode we lower the argument to needs to be this type.
///
/// In order to correctly lower the arguments we need to know the size of each
/// argument. Since Ins[x].VT gives us the size of the register that will
/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
/// for the original function argument so that we can deduce the correct memory
/// type to use for Ins[x]. In most cases the correct memory type will be
/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
/// we have a kernel argument of type v8i8, this argument will be split into
/// 8 parts and each part will be represented by its own item in the Ins array.
/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
/// the argument before it was split. From this, we deduce that the memory type
/// for each individual part is i8. We pass the memory type as LocVT to the
/// calling convention analysis function and the register type (Ins[x].VT) as
/// the ValVT.
void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
    CCState &State, const SmallVectorImpl<ISD::InputArg> &Ins) const {
  const MachineFunction &MF = State.getMachineFunction();
  const Function &Fn = MF.getFunction();
  LLVMContext &Ctx = Fn.getParent()->getContext();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
  const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
  CallingConv::ID CC = Fn.getCallingConv();

  Align MaxAlign = Align(1);
  uint64_t ExplicitArgOffset = 0;
  const DataLayout &DL = Fn.getParent()->getDataLayout();

  unsigned InIndex = 0;

  for (const Argument &Arg : Fn.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *BaseArgTy = Arg.getType();
    Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
    Align Alignment = DL.getValueOrABITypeAlignment(
        IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
    MaxAlign = std::max(Alignment, MaxAlign);
    uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;

    // We're basically throwing away everything passed into us and starting
    // over to get accurate in-memory offsets. The "PartOffset" is completely
    // useless to us as computed in Ins.
    //
    // We also need to figure out what type legalization is trying to do to get
    // the correct memory offsets.

    SmallVector<EVT, 16> ValueVTs;
    SmallVector<uint64_t, 16> Offsets;
    ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);

    for (unsigned Value = 0, NumValues = ValueVTs.size();
         Value != NumValues; ++Value) {
      uint64_t BasePartOffset = Offsets[Value];

      EVT ArgVT = ValueVTs[Value];
      EVT MemVT = ArgVT;
      MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
      unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);

      if (NumRegs == 1) {
        // This argument is not split, so the IR type is the memory type.
        if (ArgVT.isExtended()) {
          // We have an extended type, like i24, so we should just use the
          // register type.
          MemVT = RegisterVT;
        } else {
          MemVT = ArgVT;
        }
      } else if (ArgVT.isVector() && RegisterVT.isVector() &&
                 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
        assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
        // We have a vector value which has been split into a vector with
        // the same scalar type, but fewer elements. This should handle
        // all the floating-point vector types.
        MemVT = RegisterVT;
      } else if (ArgVT.isVector() &&
                 ArgVT.getVectorNumElements() == NumRegs) {
        // This arg has been split so that each element is stored in a separate
        // register.
        MemVT = ArgVT.getScalarType();
      } else if (ArgVT.isExtended()) {
        // We have an extended type, like i65.
        MemVT = RegisterVT;
      } else {
        unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
        assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
        if (RegisterVT.isInteger()) {
          MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
        } else if (RegisterVT.isVector()) {
          assert(!RegisterVT.getScalarType().isFloatingPoint());
          unsigned NumElements = RegisterVT.getVectorNumElements();
          assert(MemoryBits % NumElements == 0);
          // This vector type has been split into another vector type with
          // a different elements size.
          EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
                                           MemoryBits / NumElements);
          MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
        } else {
          llvm_unreachable("cannot deduce memory type.");
        }
      }

      // Convert one element vectors to scalar.
      if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
        MemVT = MemVT.getScalarType();

      // Round up vec3/vec5 argument.
      if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
        assert(MemVT.getVectorNumElements() == 3 ||
               MemVT.getVectorNumElements() == 5 ||
               (MemVT.getVectorNumElements() >= 9 &&
                MemVT.getVectorNumElements() <= 12));
        MemVT = MemVT.getPow2VectorType(State.getContext());
      } else if (!MemVT.isSimple() && !MemVT.isVector()) {
        MemVT = MemVT.getRoundIntegerType(State.getContext());
      }

      unsigned PartOffset = 0;
      for (unsigned i = 0; i != NumRegs; ++i) {
        State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
                                               BasePartOffset + PartOffset,
                                               MemVT.getSimpleVT(),
                                               CCValAssign::Full));
        PartOffset += MemVT.getStoreSize();
      }
    }
  }
}

SDValue AMDGPUTargetLowering::LowerReturn(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SDLoc &DL, SelectionDAG &DAG) const {
  // FIXME: Fails for r600 tests
  //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
  // "wave terminate should not have return values");
  return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
}

//===---------------------------------------------------------------------===//
// Target specific lowering
//===---------------------------------------------------------------------===//

/// Selects the correct CCAssignFn for a given CallingConvention value.
CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                    bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
}

CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                      bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
}

SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
                                                  SelectionDAG &DAG,
                                                  MachineFrameInfo &MFI,
                                                  int ClobberedFI) const {
  SmallVector<SDValue, 8> ArgChains;
  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;

  // Include the original chain at the beginning of the list. When this is
  // used by target LowerCall hooks, this helps legalize find the
  // CALLSEQ_BEGIN node.
  ArgChains.push_back(Chain);

  // Add a chain value for each stack argument that overlaps the clobbered
  // frame index.
  for (SDNode *U : DAG.getEntryNode().getNode()->uses()) {
    if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
        if (FI->getIndex() < 0) {
          int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
          int64_t InLastByte = InFirstByte;
          InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;

          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
            ArgChains.push_back(SDValue(L, 1));
        }
      }
    }
  }

  // Build a tokenfactor for all the chains.
  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
}

SDValue AMDGPUTargetLowering::lowerUnhandledCall(
    CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals,
    StringRef Reason) const {
  SDValue Callee = CLI.Callee;
  SelectionDAG &DAG = CLI.DAG;

  const Function &Fn = DAG.getMachineFunction().getFunction();

  StringRef FuncName("<unknown>");

  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
    FuncName = G->getSymbol();
  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
    FuncName = G->getGlobal()->getName();

  DiagnosticInfoUnsupported NoCalls(
      Fn, Reason + FuncName, CLI.DL.getDebugLoc());
  DAG.getContext()->diagnose(NoCalls);

  if (!CLI.IsTailCall) {
    for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
      InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
  }

  return DAG.getEntryNode();
}

SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
                                        SmallVectorImpl<SDValue> &InVals) const {
  return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
}

SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                      SelectionDAG &DAG) const {
  const Function &Fn = DAG.getMachineFunction().getFunction();

  DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
                                            SDLoc(Op).getDebugLoc());
  DAG.getContext()->diagnose(NoDynamicAlloca);
  auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
  return DAG.getMergeValues(Ops, SDLoc());
}

SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
                                             SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default:
    Op->print(errs(), &DAG);
    llvm_unreachable("Custom lowering code for this "
                     "instruction is not implemented yet!");
    break;
  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
  case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
  case ISD::FREM: return LowerFREM(Op, DAG);
  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
  case ISD::FRINT: return LowerFRINT(Op, DAG);
  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
  case ISD::FROUNDEVEN:
    return LowerFROUNDEVEN(Op, DAG);
  case ISD::FROUND: return LowerFROUND(Op, DAG);
  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
  case ISD::FLOG2:
    return LowerFLOG2(Op, DAG);
  case ISD::FLOG:
  case ISD::FLOG10:
    return LowerFLOGCommon(Op, DAG);
  case ISD::FEXP:
  case ISD::FEXP10:
    return lowerFEXP(Op, DAG);
  case ISD::FEXP2:
    return lowerFEXP2(Op, DAG);
  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
  case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    return LowerFP_TO_INT(Op, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF:
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:
    return LowerCTLZ_CTTZ(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  }
  return Op;
}

void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
                                              SmallVectorImpl<SDValue> &Results,
                                              SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  case ISD::SIGN_EXTEND_INREG:
    // Different parts of legalization seem to interpret which type of
    // sign_extend_inreg is the one to check for custom lowering. The extended
    // from type is what really matters, but some places check for custom
    // lowering of the result type. This results in trying to use
    // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
    // nothing here and let the illegal result integer be handled normally.
    return;
  case ISD::FLOG2:
    if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::FLOG:
  case ISD::FLOG10:
    if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::FEXP2:
    if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::FEXP:
  case ISD::FEXP10:
    if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:
    if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG))
      Results.push_back(Lowered);
    return;
  default:
    return;
  }
}

SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                                 SDValue Op,
                                                 SelectionDAG &DAG) const {

  const DataLayout &DL = DAG.getDataLayout();
  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = G->getGlobal();

  if (!MFI->isModuleEntryFunction()) {
    if (std::optional<uint32_t> Address =
            AMDGPUMachineFunction::getLDSAbsoluteAddress(*GV)) {
      return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
    }
  }

  if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
      G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isModuleEntryFunction() &&
        GV->getName() != "llvm.amdgcn.module.lds") {
      SDLoc DL(Op);
      const Function &Fn = DAG.getMachineFunction().getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
          Fn, "local memory global used by non-kernel function",
          DL.getDebugLoc(), DS_Warning);
      DAG.getContext()->diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
      SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
                                        Trap, DAG.getRoot());
      DAG.setRoot(OutputChain);
      return DAG.getUNDEF(Op.getValueType());
    }

    // XXX: What does the value of G->getOffset() mean?
    assert(G->getOffset() == 0 &&
           "Do not know what to do with a non-zero offset");

    // TODO: We could emit code to handle the initialization somewhere.
    // We ignore the initializer for now and legalize it to allow selection.
    // The initializer will anyway get errored out during assembly emission.
    unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
    return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
  }
  return SDValue();
}

SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SmallVector<SDValue, 8> Args;
  SDLoc SL(Op);

  EVT VT = Op.getValueType();
  if (VT.getVectorElementType().getSizeInBits() < 32) {
    unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
    if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
      unsigned NewNumElt = OpBitSize / 32;
      EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
                                      : EVT::getVectorVT(*DAG.getContext(),
                                                         MVT::i32, NewNumElt);
      for (const SDUse &U : Op->ops()) {
        SDValue In = U.get();
        SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
        if (NewNumElt > 1)
          DAG.ExtractVectorElements(NewIn, Args);
        else
          Args.push_back(NewIn);
      }

      EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                   NewNumElt * Op.getNumOperands());
      SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
      return DAG.getNode(ISD::BITCAST, SL, VT, BV);
    }
  }

  for (const SDUse &U : Op->ops())
    DAG.ExtractVectorElements(U.get(), Args);

  return DAG.getBuildVector(Op.getValueType(), SL, Args);
}

SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
                                                     SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SmallVector<SDValue, 8> Args;
  unsigned Start = Op.getConstantOperandVal(1);
  EVT VT = Op.getValueType();
  EVT SrcVT = Op.getOperand(0).getValueType();

  if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
    unsigned NumElt = VT.getVectorNumElements();
    unsigned NumSrcElt = SrcVT.getVectorNumElements();
    assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");

    // Extract 32-bit registers at a time.
    EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
    EVT NewVT = NumElt == 2
                    ? MVT::i32
                    : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
    SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));

    DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
    if (NumElt == 2)
      Tmp = Args[0];
    else
      Tmp = DAG.getBuildVector(NewVT, SL, Args);

    return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
  }

  DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
                            VT.getVectorNumElements());

  return DAG.getBuildVector(Op.getValueType(), SL, Args);
}

// TODO: Handle fabs too
static SDValue peekFNeg(SDValue Val) {
  if (Val.getOpcode() == ISD::FNEG)
    return Val.getOperand(0);

  return Val;
}

static SDValue peekFPSignOps(SDValue Val) {
  if (Val.getOpcode() == ISD::FNEG)
    Val = Val.getOperand(0);
  if (Val.getOpcode() == ISD::FABS)
    Val = Val.getOperand(0);
  if (Val.getOpcode() == ISD::FCOPYSIGN)
    Val = Val.getOperand(0);
  return Val;
}

SDValue AMDGPUTargetLowering::combineFMinMaxLegacyImpl(
    const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
    SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  switch (CCOpcode) {
  case ISD::SETOEQ:
  case ISD::SETONE:
  case ISD::SETUNE:
  case ISD::SETNE:
  case ISD::SETUEQ:
  case ISD::SETEQ:
  case ISD::SETFALSE:
  case ISD::SETFALSE2:
  case ISD::SETTRUE:
  case ISD::SETTRUE2:
  case ISD::SETUO:
  case ISD::SETO:
    break;
  case ISD::SETULE:
  case ISD::SETULT: {
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
  }
  case ISD::SETOLE:
  case ISD::SETOLT:
  case ISD::SETLE:
  case ISD::SETLT: {
    // Ordered. Assume ordered for undefined.

    // Only do this after legalization to avoid interfering with other combines
    // which might occur.
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with NaN,
    // so permute it based on the compare type the hardware uses.
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
  }
  case ISD::SETUGE:
  case ISD::SETUGT: {
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
  }
  case ISD::SETGT:
  case ISD::SETGE:
  case ISD::SETOGE:
  case ISD::SETOGT: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
  }
  case ISD::SETCC_INVALID:
    llvm_unreachable("Invalid setcc condcode!");
  }
  return SDValue();
}

/// Generate Min/Max node
SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
                                                   SDValue LHS, SDValue RHS,
                                                   SDValue True, SDValue False,
                                                   SDValue CC,
                                                   DAGCombinerInfo &DCI) const {
  if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
    return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);

  SelectionDAG &DAG = DCI.DAG;

  // If we can't directly match this, try to see if we can fold an fneg to
  // match.

  ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
  ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(False);
  SDValue NegTrue = peekFNeg(True);

  // Undo the combine foldFreeOpFromSelect does if it helps us match the
  // fmin/fmax.
  //
  // select (fcmp olt (lhs, K)), (fneg lhs), -K
  // -> fneg (fmin_legacy lhs, K)
  //
  // TODO: Use getNegatedExpression
  if (LHS == NegTrue && CFalse && CRHS) {
    APFloat NegRHS = neg(CRHS->getValueAPF());
    if (NegRHS == CFalse->getValueAPF()) {
      SDValue Combined =
          combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
      if (Combined)
        return DAG.getNode(ISD::FNEG, DL, VT, Combined);
      return SDValue();
    }
  }

  return SDValue();
}

std::pair<SDValue, SDValue>
AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);

  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
  const SDValue One = DAG.getConstant(1, SL, MVT::i32);

  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);

  return std::pair(Lo, Hi);
}
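// For example, an i64 or f64 value comes back as its two i32 halves, letting
// callers synthesize 64-bit operations out of 32-bit pieces (e.g. touching
// only the high word for sign-bit manipulation).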

SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
}

SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
}

// Split a vector type into two parts. The first part is a power of two vector.
// The second part is whatever is left over, and is a scalar if it would
// otherwise be a 1-vector.
std::pair<EVT, EVT>
AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
  EVT LoVT, HiVT;
  EVT EltVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
  LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
  HiVT = NumElts - LoNumElts == 1
             ? EltVT
             : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
  return std::pair(LoVT, HiVT);
}
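// For example, v3i32 splits into {v2i32, i32} and v7f32 into {v4f32, v3f32}:
// PowerOf2Ceil((NumElts + 1) / 2) picks the power-of-two low half, and a
// would-be 1-element high part collapses to a scalar.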

// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
// scalar.
std::pair<SDValue, SDValue>
AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
                                  const EVT &LoVT, const EVT &HiVT,
                                  SelectionDAG &DAG) const {
  assert(LoVT.getVectorNumElements() +
                 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
             N.getValueType().getVectorNumElements() &&
         "More vector elements requested than available!");
  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
                           DAG.getVectorIdxConstant(0, DL));
  SDValue Hi = DAG.getNode(
      HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
      HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
  return std::pair(Lo, Hi);
}

SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
                                              SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  EVT VT = Op.getValueType();
  SDLoc SL(Op);

  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2) {
    SDValue Ops[2];
    std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
    return DAG.getMergeValues(Ops, SL);
  }

  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();

  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();

  EVT LoVT, HiVT;
  EVT LoMemVT, HiMemVT;
  SDValue Lo, Hi;

  std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
  std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
  std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);

  unsigned Size = LoMemVT.getStoreSize();
  Align BaseAlign = Load->getAlign();
  Align HiAlign = commonAlignment(BaseAlign, Size);

  SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
                                  Load->getChain(), BasePtr, SrcValue, LoMemVT,
                                  BaseAlign, Load->getMemOperand()->getFlags());
  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
  SDValue HiLoad =
      DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
                     HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
                     HiMemVT, HiAlign, Load->getMemOperand()->getFlags());

  SDValue Join;
  if (LoVT == HiVT) {
    // This is the case that the vector is power of two so was evenly split.
    Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
  } else {
    Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
                       DAG.getVectorIdxConstant(0, SL));
    Join = DAG.getNode(
        HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
        VT, Join, HiLoad,
        DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
  }

  SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
                                     LoLoad.getValue(1), HiLoad.getValue(1))};

  return DAG.getMergeValues(Ops, SL);
}

SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
                                                     SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  EVT VT = Op.getValueType();
  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();
  SDLoc SL(Op);
  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
  Align BaseAlign = Load->getAlign();
  unsigned NumElements = MemVT.getVectorNumElements();

  // Widen from vec3 to vec4 when the load is at least 8-byte aligned
  // or 16-byte fully dereferenceable. Otherwise, split the vector load.
  if (NumElements != 3 ||
      (BaseAlign < Align(8) &&
       !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
    return SplitVectorLoad(Op, DAG);

  assert(NumElements == 3);

  EVT WideVT =
      EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
  EVT WideMemVT =
      EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
  SDValue WideLoad = DAG.getExtLoad(
      Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
      WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
  return DAG.getMergeValues(
      {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
                   DAG.getVectorIdxConstant(0, SL)),
       WideLoad.getValue(1)},
      SL);
}
1861
1862SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1863 SelectionDAG &DAG) const {
1864 StoreSDNode *Store = cast<StoreSDNode>(Op);
1865 SDValue Val = Store->getValue();
1866 EVT VT = Val.getValueType();
1867
1868 // If this is a 2-element vector, we really want to scalarize and not create
1869 // weird 1-element vectors.
1870 if (VT.getVectorNumElements() == 2)
1871 return scalarizeVectorStore(Store, DAG);
1872
1873 EVT MemVT = Store->getMemoryVT();
1874 SDValue Chain = Store->getChain();
1875 SDValue BasePtr = Store->getBasePtr();
1876 SDLoc SL(Op);
1877
1878 EVT LoVT, HiVT;
1879 EVT LoMemVT, HiMemVT;
1880 SDValue Lo, Hi;
1881
1882 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1883 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1884 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1885
1886 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1887
1888 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1889 Align BaseAlign = Store->getAlign();
1890 unsigned Size = LoMemVT.getStoreSize();
1891 Align HiAlign = commonAlignment(BaseAlign, Size);
1892
1893 SDValue LoStore =
1894 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1895 Store->getMemOperand()->getFlags());
1896 SDValue HiStore =
1897 DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1898 HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1899
1900 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1901}
1902
1903// This is a shortcut for integer division because we have fast i32<->f32
1904// conversions, and fast f32 reciprocal instructions. The fractional part of a
1905// float is enough to accurately represent up to a 24-bit signed integer.
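// A hedged scalar sketch of the unsigned fast path (the DAG nodes below
// implement this, plus a signed variant selected by 'Sign'):
//   float fa = (float)a, fb = (float)b;
//   float fq = truncf(fa * (1.0f / fb));   // quotient estimate
//   float fr = fabsf(-fq * fb + fa);       // remainder magnitude
//   int q = (int)fq + ((fr >= fabsf(fb)) ? 1 : 0);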
1906SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1907 bool Sign) const {
1908 SDLoc DL(Op);
1909 EVT VT = Op.getValueType();
1910 SDValue LHS = Op.getOperand(0);
1911 SDValue RHS = Op.getOperand(1);
1912 MVT IntVT = MVT::i32;
1913 MVT FltVT = MVT::f32;
1914
1915 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1916 if (LHSSignBits < 9)
1917 return SDValue();
1918
1919 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1920 if (RHSSignBits < 9)
1921 return SDValue();
1922
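// Note (sketch): >= 9 sign bits in a 32-bit value leaves at most 24
// significant bits (including the sign), which an f32 significand can
// represent exactly; that is what makes the f32 shortcut safe here.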
1923 unsigned BitSize = VT.getSizeInBits();
1924 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1925 unsigned DivBits = BitSize - SignBits;
1926 if (Sign)
1927 ++DivBits;
1928
1929 ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1930 ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1931
1932 SDValue jq = DAG.getConstant(1, DL, IntVT);
1933
1934 if (Sign) {
1935 // char|short jq = ia ^ ib;
1936 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1937
1938 // jq = jq >> (bitsize - 2)
1939 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1940 DAG.getConstant(BitSize - 2, DL, VT));
1941
1942 // jq = jq | 0x1
1943 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1944 }
1945
1946 // int ia = (int)LHS;
1947 SDValue ia = LHS;
1948
1949 // int ib = (int)RHS;
1950 SDValue ib = RHS;
1951
1952 // float fa = (float)ia;
1953 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1954
1955 // float fb = (float)ib;
1956 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1957
1958 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1959 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1960
1961 // fq = trunc(fq);
1962 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1963
1964 // float fqneg = -fq;
1965 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1966
1967 MachineFunction &MF = DAG.getMachineFunction();
1968
1969 bool UseFmadFtz = false;
1970 if (Subtarget->isGCN()) {
1971 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1972 UseFmadFtz =
1973 MFI->getMode().FP32Denormals != DenormalMode::getPreserveSign();
1974 }
1975
1976 // float fr = mad(fqneg, fb, fa);
1977 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
1978 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
1979 : (unsigned)ISD::FMAD;
1980 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
1981
1982 // int iq = (int)fq;
1983 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1984
1985 // fr = fabs(fr);
1986 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1987
1988 // fb = fabs(fb);
1989 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1990
1991 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1992
1993 // int cv = fr >= fb;
1994 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1995
1996 // jq = (cv ? jq : 0);
1997 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
1998
1999 // dst = iq + jq;
2000 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
2001
2002 // Rem needs compensation; it's easier to recompute it.
2003 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
2004 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
2005
2006 // Truncate to number of bits this divide really is.
2007 if (Sign) {
2008 SDValue InRegSize
2009 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
2010 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
2011 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
2012 } else {
2013 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
2014 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
2015 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
2016 }
2017
2018 return DAG.getMergeValues({ Div, Rem }, DL);
2019}
2020
2021void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
2022 SelectionDAG &DAG,
2023 SmallVectorImpl<SDValue> &Results) const {
2024 SDLoc DL(Op);
2025 EVT VT = Op.getValueType();
2026
2027 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
2028
2029 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2030
2031 SDValue One = DAG.getConstant(1, DL, HalfVT);
2032 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
2033
2034 // HiLo split
2035 SDValue LHS_Lo, LHS_Hi;
2036 SDValue LHS = Op.getOperand(0);
2037 std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);
2038
2039 SDValue RHS_Lo, RHS_Hi;
2040 SDValue RHS = Op.getOperand(1);
2041 std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);
2042
2043 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
2044 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
2045
2046 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2047 LHS_Lo, RHS_Lo);
2048
2049 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
2050 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
2051
2052 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
2053 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
2054 return;
2055 }
2056
2057 if (isTypeLegal(MVT::i64)) {
2058 // The algorithm here is based on ideas from "Software Integer Division",
2059 // Tom Rodeheffer, August 2008.
2060
2061 MachineFunction &MF = DAG.getMachineFunction();
2062 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2063
2064 // Compute denominator reciprocal.
2065 unsigned FMAD =
2066 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2067 : MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()
2068 ? (unsigned)ISD::FMAD
2069 : (unsigned)AMDGPUISD::FMAD_FTZ;
2070
2071 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
2072 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
2073 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
2074 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
2075 Cvt_Lo);
2076 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
2077 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
2078 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
2079 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
2080 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
2081 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
2082 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
2083 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
2084 Mul1);
2085 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
2086 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
2087 SDValue Rcp64 = DAG.getBitcast(VT,
2088 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
2089
2090 SDValue Zero64 = DAG.getConstant(0, DL, VT);
2091 SDValue One64 = DAG.getConstant(1, DL, VT);
2092 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
2093 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
2094
2095 // First round of UNR (Unsigned integer Newton-Raphson).
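// Sketch: with Rcp64 ~= 2**64 / RHS as a 0.64 fixed-point reciprocal, the
// residual is E = (-RHS * Rcp64) mod 2**64 and the Newton-Raphson update
// is Rcp' = Rcp64 + mulhu(Rcp64, E), i.e. Rcp' = Rcp64 * (2 - RHS * Rcp64
// / 2**64) in exact arithmetic.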
2096 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
2097 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
2098 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
2099 SDValue Mulhi1_Lo, Mulhi1_Hi;
2100 std::tie(Mulhi1_Lo, Mulhi1_Hi) =
2101 DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);
2102 SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,
2103 Mulhi1_Lo, Zero1);
2104 SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,
2105 Mulhi1_Hi, Add1_Lo.getValue(1));
2106 SDValue Add1 = DAG.getBitcast(VT,
2107 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
2108
2109 // Second round of UNR.
2110 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
2111 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
2112 SDValue Mulhi2_Lo, Mulhi2_Hi;
2113 std::tie(Mulhi2_Lo, Mulhi2_Hi) =
2114 DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
2115 SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
2116 Mulhi2_Lo, Zero1);
2117 SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
2118 Mulhi2_Hi, Add2_Lo.getValue(1));
2119 SDValue Add2 = DAG.getBitcast(VT,
2120 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
2121
2122 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
2123
2124 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
2125
2126 SDValue Mul3_Lo, Mul3_Hi;
2127 std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
2128 SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
2129 Mul3_Lo, Zero1);
2130 SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
2131 Mul3_Hi, Sub1_Lo.getValue(1));
2132 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
2133 SDValue Sub1 = DAG.getBitcast(VT,
2134 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
2135
2136 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
2137 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
2138 ISD::SETUGE);
2139 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
2140 ISD::SETUGE);
2141 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
2142
2143 // TODO: Here and below, portions of the code could be enclosed in if/endif.
2144 // Currently control flow is unconditional and we have 4 selects after
2145 // potential endif to substitute PHIs.
2146
2147 // if C3 != 0 ...
2148 SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
2149 RHS_Lo, Zero1);
2150 SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
2151 RHS_Hi, Sub1_Lo.getValue(1));
2152 SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2153 Zero, Sub2_Lo.getValue(1));
2154 SDValue Sub2 = DAG.getBitcast(VT,
2155 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
2156
2157 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
2158
2159 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
2160 ISD::SETUGE);
2161 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
2162 ISD::SETUGE);
2163 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
2164
2165 // if (C6 != 0)
2166 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
2167
2168 SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
2169 RHS_Lo, Zero1);
2170 SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2171 RHS_Hi, Sub2_Lo.getValue(1));
2172 SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
2173 Zero, Sub3_Lo.getValue(1));
2174 SDValue Sub3 = DAG.getBitcast(VT,
2175 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
2176
2177 // endif C6
2178 // endif C3
2179
2180 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
2181 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
2182
2183 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
2184 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
2185
2186 Results.push_back(Div);
2187 Results.push_back(Rem);
2188
2189 return;
2190 }
2191
2192 // r600 expansion.
2193 // Get speculative values.
2194 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
2195 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
2196
2197 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
2198 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2199 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2200
2201 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
2202 SDValue DIV_Lo = Zero;
2203
2204 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2205
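// The loop below is classic restoring division on the remaining 32 bits:
// each iteration shifts the partial remainder left by one, brings in the
// next bit of LHS_Lo, and when the remainder reaches RHS subtracts it and
// sets the corresponding quotient bit.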
2206 for (unsigned i = 0; i < halfBitWidth; ++i) {
2207 const unsigned bitPos = halfBitWidth - i - 1;
2208 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2209 // Get value of high bit
2210 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2211 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2212 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2213
2214 // Shift
2215 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2216 // Add LHS high bit
2217 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2218
2219 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2220 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2221
2222 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2223
2224 // Update REM
2225 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2226 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2227 }
2228
2229 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2230 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2231 Results.push_back(DIV);
2232 Results.push_back(REM);
2233}
2234
2235SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2236 SelectionDAG &DAG) const {
2237 SDLoc DL(Op);
2238 EVT VT = Op.getValueType();
2239
2240 if (VT == MVT::i64) {
2241 SmallVector<SDValue, 2> Results;
2242 LowerUDIVREM64(Op, DAG, Results);
2243 return DAG.getMergeValues(Results, DL);
2244 }
2245
2246 if (VT == MVT::i32) {
2247 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2248 return Res;
2249 }
2250
2251 SDValue X = Op.getOperand(0);
2252 SDValue Y = Op.getOperand(1);
2253
2254 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2255 // algorithm used here.
2256
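// In short (a sketch of the steps below): z ~= 2**32 / y is refined with
// one Newton-Raphson step, q = mulhu(x, z) gives a quotient estimate, and
// at most two conditional "q += 1; r -= y" corrections make it exact.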
2257 // Initial estimate of inv(y).
2258 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2259
2260 // One round of UNR.
2261 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2262 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2263 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2264 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2265
2266 // Quotient/remainder estimate.
2267 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2268 SDValue R =
2269 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2270
2271 // First quotient/remainder refinement.
2272 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2273 SDValue One = DAG.getConstant(1, DL, VT);
2274 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2275 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2276 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2277 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2278 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2279
2280 // Second quotient/remainder refinement.
2281 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2282 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2283 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2284 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2285 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2286
2287 return DAG.getMergeValues({Q, R}, DL);
2288}
2289
2290SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2291 SelectionDAG &DAG) const {
2292 SDLoc DL(Op);
2293 EVT VT = Op.getValueType();
2294
2295 SDValue LHS = Op.getOperand(0);
2296 SDValue RHS = Op.getOperand(1);
2297
2298 SDValue Zero = DAG.getConstant(0, DL, VT);
2299 SDValue NegOne = DAG.getConstant(-1, DL, VT);
2300
2301 if (VT == MVT::i32) {
2302 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2303 return Res;
2304 }
2305
2306 if (VT == MVT::i64 &&
2307 DAG.ComputeNumSignBits(LHS) > 32 &&
2308 DAG.ComputeNumSignBits(RHS) > 32) {
2309 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2310
2311 // HiLo split
2312 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2313 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2314 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2315 LHS_Lo, RHS_Lo);
2316 SDValue Res[2] = {
2317 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2318 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2319 };
2320 return DAG.getMergeValues(Res, DL);
2321 }
2322
2323 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2324 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2325 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2326 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2327
2328 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2329 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2330
2331 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2332 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2333
2334 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2335 SDValue Rem = Div.getValue(1);
2336
2337 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2338 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2339
2340 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2341 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2342
2343 SDValue Res[2] = {
2344 Div,
2345 Rem
2346 };
2347 return DAG.getMergeValues(Res, DL);
2348}
2349
2350// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
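// Worked example (illustrative): frem(5.5, 2.0) -> trunc(5.5 / 2.0) = 2.0,
// then fma(-2.0, 2.0, 5.5) = 1.5, matching the fmod-style remainder.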
2351SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2352 SDLoc SL(Op);
2353 EVT VT = Op.getValueType();
2354 auto Flags = Op->getFlags();
2355 SDValue X = Op.getOperand(0);
2356 SDValue Y = Op.getOperand(1);
2357
2358 SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
2359 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
2360 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
2361 // TODO: For f32 use FMAD instead if !hasFastFMA32?
2362 return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
2363}
2364
2365SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2366 SDLoc SL(Op);
2367 SDValue Src = Op.getOperand(0);
2368
2369 // result = trunc(src)
2370 // if (src > 0.0 && src != result)
2371 // result += 1.0
2372
2373 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2374
2375 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2376 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2377
2378 EVT SetCCVT =
2379 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2380
2381 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2382 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2383 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2384
2385 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2386 // TODO: Should this propagate fast-math-flags?
2387 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2388}
2389
2390static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2391 SelectionDAG &DAG) {
2392 const unsigned FractBits = 52;
2393 const unsigned ExpBits = 11;
2394
2395 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2396 Hi,
2397 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2398 DAG.getConstant(ExpBits, SL, MVT::i32));
2399 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2400 DAG.getConstant(1023, SL, MVT::i32));
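// 1023 is the IEEE-754 double exponent bias: ExpPart holds the raw biased
// exponent from bits [62:52], so Exp is the unbiased exponent.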
2401
2402 return Exp;
2403}
2404
2405SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2406 SDLoc SL(Op);
2407 SDValue Src = Op.getOperand(0);
2408
2409 assert(Op.getValueType() == MVT::f64);
2410
2411 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2412
2413 // Extract the upper half, since this is where we will find the sign and
2414 // exponent.
2415 SDValue Hi = getHiHalf64(Src, DAG);
2416
2417 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2418
2419 const unsigned FractBits = 52;
2420
2421 // Extract the sign bit.
2422 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2423 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2424
2425 // Extend back to 64-bits.
2426 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2427 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2428
2429 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2430 const SDValue FractMask
2431 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2432
2433 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2434 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2435 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
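// Sketch: for exponents in [0, 51], (FractMask >> Exp) covers exactly the
// (52 - Exp) fractional bits of the significand, so clearing those bits
// truncates the value toward zero.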
2436
2437 EVT SetCCVT =
2438 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2439
2440 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2441
2442 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2443 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2444
2445 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2446 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2447
2448 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2449}
2450
2451SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
2452 SelectionDAG &DAG) const {
2453 SDLoc SL(Op);
2454 SDValue Src = Op.getOperand(0);
2455
2456 assert(Op.getValueType() == MVT::f64);
2457
2458 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2459 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2460 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2461
2462 // TODO: Should this propagate fast-math-flags?
2463
2464 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2465 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
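// Sketch of the magic-number trick: adding then subtracting
// copysign(2**52, src) forces rounding at the integer position in the
// current (round-to-nearest-even) mode; values whose magnitude exceeds
// the ~2**52 threshold are already integral and are returned unchanged.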
2466
2467 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2468
2469 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2470 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2471
2472 EVT SetCCVT =
2473 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2474 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2475
2476 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2477}
2478
2479SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op,
2480 SelectionDAG &DAG) const {
2481 // FNEARBYINT and FRINT are the same, except in their handling of FP
2482 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2483 // rint, so just treat them as equivalent.
2484 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(),
2485 Op.getOperand(0));
2486}
2487
2488SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2489 auto VT = Op.getValueType();
2490 auto Arg = Op.getOperand(0u);
2491 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg);
2492}
2493
2494// XXX - May require not supporting f32 denormals?
2495
2496// Don't handle v2f16. The extra instructions to scalarize and repack around the
2497// compare and vselect end up producing worse code than scalarizing the whole
2498// operation.
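// Worked example (illustrative): round(2.5) -> t = 2.0 and
// |2.5 - t| = 0.5 >= 0.5, so the result is t + copysign(1.0, 2.5) = 3.0,
// i.e. halfway cases round away from zero as FROUND requires.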
2499SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2500 SDLoc SL(Op);
2501 SDValue X = Op.getOperand(0);
2502 EVT VT = Op.getValueType();
2503
2504 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2505
2506 // TODO: Should this propagate fast-math-flags?
2507
2508 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2509
2510 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2511
2512 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2513 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2514
2515 EVT SetCCVT =
2516 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2517
2518 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2519 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2520 SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero);
2521
2522 SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X);
2523 return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
2524}
2525
2526SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2527 SDLoc SL(Op);
2528 SDValue Src = Op.getOperand(0);
2529
2530 // result = trunc(src);
2531 // if (src < 0.0 && src != result)
2532 // result += -1.0.
2533
2534 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2535
2536 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2537 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2538
2539 EVT SetCCVT =
2540 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2541
2542 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2543 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2544 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2545
2546 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2547 // TODO: Should this propagate fast-math-flags?
2548 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2549}
2550
2551/// Return true if it's known that \p Src can never be an f32 denormal value.
2552static bool valueIsKnownNeverF32Denorm(SDValue Src) {
2553 switch (Src.getOpcode()) {
2554 case ISD::FP_EXTEND:
2555 return Src.getOperand(0).getValueType() == MVT::f16;
2556 case ISD::FP16_TO_FP:
2557 case ISD::FFREXP:
2558 return true;
2559 case ISD::INTRINSIC_WO_CHAIN: {
2560 unsigned IntrinsicID = Src.getConstantOperandVal(0);
2561 switch (IntrinsicID) {
2562 case Intrinsic::amdgcn_frexp_mant:
2563 return true;
2564 default:
2565 return false;
2566 }
2567 }
2568 default:
2569 return false;
2570 }
2571
2572 llvm_unreachable("covered opcode switch");
2573}
2574
2575static bool allowApproxFunc(const SelectionDAG &DAG,
2576 SDNodeFlags Flags) {
2577 if (Flags.hasApproximateFuncs())
2578 return true;
2579 auto &Options = DAG.getTarget().Options;
2580 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
2581}
2582
2583static bool needsDenormHandlingF32(const SelectionDAG &DAG,
2584 SDValue Src,
2585 SDNodeFlags Flags) {
2586 return !valueIsKnownNeverF32Denorm(Src) &&
2587 DAG.getMachineFunction()
2588 .getDenormalMode(APFloat::IEEEsingle())
2589 .Input != DenormalMode::PreserveSign;
2590}
2591
2592SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG,
2593 SDValue Src,
2594 SDNodeFlags Flags) const {
2595 SDLoc SL(Src);
2596 EVT VT = Src.getValueType();
2597 const fltSemantics &Semantics = SelectionDAG::EVTToAPFloatSemantics(VT);
2598 SDValue SmallestNormal =
2599 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2600
2601 // Want to scale denormals up, but negatives and 0 work just as well on the
2602 // scaled path.
2603 SDValue IsLtSmallestNormal = DAG.getSetCC(
2604 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2605 SmallestNormal, ISD::SETOLT);
2606
2607 return IsLtSmallestNormal;
2608}
2609
2610SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src,
2611 SDNodeFlags Flags) const {
2612 SDLoc SL(Src);
2613 EVT VT = Src.getValueType();
2614 const fltSemantics &Semantics = SelectionDAG::EVTToAPFloatSemantics(VT);
2615 SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT);
2616
2617 SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags);
2618 SDValue IsFinite = DAG.getSetCC(
2619 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs,
2620 Inf, ISD::SETOLT);
2621 return IsFinite;
2622}
2623
2624/// If denormal handling is required, return the scaled input to FLOG2, and the
2625/// check for denormal range. Otherwise, return null values.
2626std::pair<SDValue, SDValue>
2627AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc &SL,
2628 SDValue Src, SDNodeFlags Flags) const {
2629 if (!needsDenormHandlingF32(DAG, Src, Flags))
2630 return {};
2631
2632 MVT VT = MVT::f32;
2633 const fltSemantics &Semantics = APFloat::IEEEsingle();
2634 SDValue SmallestNormal =
2635 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2636
2637 SDValue IsLtSmallestNormal = DAG.getSetCC(
2638 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2639 SmallestNormal, ISD::SETOLT);
2640
2641 SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
2642 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2643 SDValue ScaleFactor =
2644 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);
2645
2646 SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);
2647 return {ScaledInput, IsLtSmallestNormal};
2648}
2649
2650SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
2651 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2652 // If we have to handle denormals, scale up the input and adjust the result.
2653
2654 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2655 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
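// Rationale (sketch): log2(x * 2**32) == log2(x) + 32, so scaling a
// denormal input up into the normal range only costs a conditional
// subtract of 32.0 from the hardware log2 result.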
2656
2657 SDLoc SL(Op);
2658 EVT VT = Op.getValueType();
2659 SDValue Src = Op.getOperand(0);
2660 SDNodeFlags Flags = Op->getFlags();
2661
2662 if (VT == MVT::f16) {
2663 // Nothing in half is a denormal when promoted to f32.
2664 assert(!Subtarget->has16BitInsts());
2665 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2666 SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
2667 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2668 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2669 }
2670
2671 auto [ScaledInput, IsLtSmallestNormal] =
2672 getScaledLogInput(DAG, SL, Src, Flags);
2673 if (!ScaledInput)
2674 return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);
2675
2676 SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2677
2678 SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
2679 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2680 SDValue ResultOffset =
2681 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
2682 return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
2683}
2684
2685static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2686 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2687 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags);
2688 return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags);
2689}
2690
2691SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
2692 SelectionDAG &DAG) const {
2693 SDValue X = Op.getOperand(0);
2694 EVT VT = Op.getValueType();
2695 SDNodeFlags Flags = Op->getFlags();
2696 SDLoc DL(Op);
2697
2698 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2699 assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2700
2701 const auto &Options = getTargetMachine().Options;
2702 if (VT == MVT::f16 || Flags.hasApproximateFuncs() ||
2703 Options.ApproxFuncFPMath || Options.UnsafeFPMath) {
2704
2705 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2706 // Log and multiply in f32 is good enough for f16.
2707 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2708 }
2709
2710 SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
2711 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2712 return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
2713 DAG.getTargetConstant(0, DL, MVT::i32), Flags);
2714 }
2715
2716 return Lowered;
2717 }
2718
2719 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, DL, X, Flags);
2720 if (ScaledInput)
2721 X = ScaledInput;
2722
2723 SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags);
2724
2725 SDValue R;
2726 if (Subtarget->hasFastFMAF32()) {
2727 // c+cc are ln(2)/ln(10) to more than 49 bits
2728 const float c_log10 = 0x1.344134p-2f;
2729 const float cc_log10 = 0x1.09f79ep-26f;
2730
2731 // c + cc is ln(2) to more than 49 bits
2732 const float c_log = 0x1.62e42ep-1f;
2733 const float cc_log = 0x1.efa39ep-25f;
2734
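// Extended-precision sketch: the constant is split as c + cc so that
// y * (c + cc) can be evaluated as r = y*c plus the correction
// fma(y, cc, fma(y, c, -r)), recovering low-order bits a single multiply
// would lose.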
2735 SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
2736 SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);
2737
2738 R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
2739 SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
2740 SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
2741 SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags);
2742 R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags);
2743 } else {
2744 // ch+ct is ln(2)/ln(10) to more than 36 bits
2745 const float ch_log10 = 0x1.344000p-2f;
2746 const float ct_log10 = 0x1.3509f6p-18f;
2747
2748 // ch + ct is ln(2) to more than 36 bits
2749 const float ch_log = 0x1.62e000p-1f;
2750 const float ct_log = 0x1.0bfbe8p-15f;
2751
2752 SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT);
2753 SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT);
2754
2755 SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
2756 SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
2757 SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
2758 SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
2759 SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);
2760
2761 SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
2762 SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
2763 SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
2764 R = getMad(DAG, DL, VT, YH, CH, Mad1);
2765 }
2766
2767 const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) &&
2768 (Flags.hasNoInfs() || Options.NoInfsFPMath);
2769
2770 // TODO: Check if known finite from source value.
2771 if (!IsFiniteOnly) {
2772 SDValue IsFinite = getIsFinite(DAG, Y, Flags);
2773 R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
2774 }
2775
2776 if (IsScaled) {
2777 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
2778 SDValue ShiftK =
2779 DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2780 SDValue Shift =
2781 DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
2782 R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
2783 }
2784
2785 return R;
2786}
2787
2788SDValue AMDGPUTargetLowering::LowerFLOG10(SDValue Op, SelectionDAG &DAG) const {
2789 return LowerFLOGCommon(Op, DAG);
2790}
2791
2792// Do f32 fast math expansion for flog2 or flog10. This is accurate enough for a
2793// promoted f16 operation.
2794SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
2795 SelectionDAG &DAG, bool IsLog10,
2796 SDNodeFlags Flags) const {
2797 EVT VT = Src.getValueType();
2798 unsigned LogOp =
2799 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
2800
2801 double Log2BaseInverted =
2802 IsLog10 ? numbers::ln2 / numbers::ln10 : 1.0;
2803
2804 if (VT == MVT::f32) {
2805 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2806 if (ScaledInput) {
2807 SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2808 SDValue ScaledResultOffset =
2809 DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT);
2810
2811 SDValue Zero = DAG.getConstantFP(0.0f, SL, VT);
2812
2813 SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled,
2814 ScaledResultOffset, Zero, Flags);
2815
2816 SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2817
2818 if (Subtarget->hasFastFMAF32())
2819 return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset,
2820 Flags);
2821 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags);
2822 return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset);
2823 }
2824 }
2825
2826 SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
2827 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2828
2829 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
2830 Flags);
2831}
2832
2833SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
2834 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
2835 // If we have to handle denormals, scale up the input and adjust the result.
2836
2837 SDLoc SL(Op);
2838 EVT VT = Op.getValueType();
2839 SDValue Src = Op.getOperand(0);
2840 SDNodeFlags Flags = Op->getFlags();
2841
2842 if (VT == MVT::f16) {
2843 // Nothing in half is a denormal when promoted to f32.
2844 assert(!Subtarget->has16BitInsts());
2845 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2846 SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
2847 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2848 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2849 }
2850
2851 assert(VT == MVT::f32);
2852
2853 if (!needsDenormHandlingF32(DAG, Src, Flags))
2854 return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);
2855
2856 // bool needs_scaling = x < -0x1.f80000p+6f;
2857 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
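// Rationale (sketch): exp2(x) becomes denormal once x < -126 and the
// hardware op may flush it; adding 64 keeps v_exp_f32 in the normal
// range, and the final multiply by 2**-64 undoes the scaling exactly
// since both factors are powers of two.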
2858
2859 // -nextafter(128.0, -1)
2860 SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT);
2861
2862 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2863
2864 SDValue NeedsScaling =
2865 DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT);
2866
2867 SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2868 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2869
2870 SDValue AddOffset =
2871 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero);
2872
2873 SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags);
2874 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags);
2875
2876 SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT);
2877 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2878 SDValue ResultScale =
2879 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One);
2880
2881 return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
2882}
2883
2884SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
2885 SelectionDAG &DAG,
2886 SDNodeFlags Flags) const {
2887 EVT VT = X.getValueType();
2888 const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
2889
2890 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2891 // exp2(M_LOG2E_F * f);
2892 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Log2E, Flags);
2893 return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
2894 : (unsigned)ISD::FEXP2,
2895 SL, VT, Mul, Flags);
2896 }
2897
2898 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2899
2900 SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
2901 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2902
2903 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2904
2905 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2906
2907 SDValue AdjustedX =
2908 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
2909
2910 SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);
2911
2912 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);
2913
2914 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
2915 SDValue AdjustedResult =
2916 DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags);
2917
2918 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2,
2919 Flags);
2920}
2921
2922/// Emit an approx-funcs-appropriate lowering for exp10. Inf/NaN should still be
2923/// handled correctly.
2924SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL,
2925 SelectionDAG &DAG,
2926 SDNodeFlags Flags) const {
2927 const EVT VT = X.getValueType();
2928 const unsigned Exp2Op = VT == MVT::f32 ? AMDGPUISD::EXP : ISD::FEXP2;
2929
2930 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2931 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
2932 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
2933 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
2934
2935 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags);
2936 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
2937 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags);
2938 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
2939 return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1);
2940 }
2941
2942 // bool s = x < -0x1.2f7030p+5f;
2943 // x += s ? 0x1.0p+5f : 0.0f;
2944 // exp10 = exp2(x * 0x1.a92000p+1f) *
2945 // exp2(x * 0x1.4f0978p-11f) *
2946 // (s ? 0x1.9f623ep-107f : 1.0f);
2947
2948 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2949
2950 SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT);
2951 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2952
2953 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT);
2954 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2955 SDValue AdjustedX =
2956 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
2957
2958 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
2959 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
2960
2961 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags);
2962 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
2963 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags);
2964 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
2965
2966 SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags);
2967
2968 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT);
2969 SDValue AdjustedResult =
2970 DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags);
2971
2972 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, MulExps,
2973 Flags);
2974}
2975
2976SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
2977 EVT VT = Op.getValueType();
2978 SDLoc SL(Op);
2979 SDValue X = Op.getOperand(0);
2980 SDNodeFlags Flags = Op->getFlags();
2981 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
2982
2983 if (VT.getScalarType() == MVT::f16) {
2984 // v_exp_f16 (fmul x, log2e)
2985 if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast?
2986 return lowerFEXPUnsafe(X, SL, DAG, Flags);
2987
2988 if (VT.isVector())
2989 return SDValue();
2990
2991 // exp(f16 x) ->
2992 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
2993
2994 // Nothing in half is a denormal when promoted to f32.
2995 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
2996 SDValue Lowered = lowerFEXPUnsafe(Ext, SL, DAG, Flags);
2997 return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
2998 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2999 }
3000
3001 assert(VT == MVT::f32);
3002
3003 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3004 // library behavior. Also, is known-not-daz source sufficient?
3005 if (allowApproxFunc(DAG, Flags)) {
3006 return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
3007 : lowerFEXPUnsafe(X, SL, DAG, Flags);
3008 }
3009
3010 // Algorithm:
3011 //
3012 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3013 //
3014 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3015 // n = 64*m + j, 0 <= j < 64
3016 //
3017 // e^x = 2^((64*m + j + f)/64)
3018 // = (2^m) * (2^(j/64)) * 2^(f/64)
3019 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3020 //
3021 // f = x*(64/ln(2)) - n
3022 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3023 //
3024 // e^x = (2^m) * (2^(j/64)) * e^r
3025 //
3026 // (2^(j/64)) is precomputed
3027 //
3028 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3029 // e^r = 1 + q
3030 //
3031 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3032 //
3033 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
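// In the code below this specializes to: split p = x*log2(e) (or
// x*log2(10)) into a high part PH and a low correction PL, take
// E = roundeven(PH), evaluate exp2((PH - E) + PL), and scale by 2**E
// with ldexp.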
3034 SDNodeFlags FlagsNoContract = Flags;
3035 FlagsNoContract.setAllowContract(false);
3036
3037 SDValue PH, PL;
3038 if (Subtarget->hasFastFMAF32()) {
3039 const float c_exp = numbers::log2ef;
3040 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3041 const float c_exp10 = 0x1.a934f0p+1f;
3042 const float cc_exp10 = 0x1.2f346ep-24f;
3043
3044 SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT);
3045 SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT);
3046
3047 PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags);
3048 SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags);
3049 SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags);
3050 PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags);
3051 } else {
3052 const float ch_exp = 0x1.714000p+0f;
3053 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3054
3055 const float ch_exp10 = 0x1.a92000p+1f;
3056 const float cl_exp10 = 0x1.4f0978p-11f;
3057
3058 SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT);
3059 SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT);
3060
3061 SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
3062 SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
3063 SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
3064 SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt);
3065 SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags);
3066
3067 PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags);
3068
3069 SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags);
3070 SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags);
3071 PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
3072 }
3073
3074 SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags);
3075
3076 // It is unsafe to contract this fsub into the PH multiply.
3077 SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);
3078
3079 SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags);
3080 SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
3081 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags);
3082
3083 SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags);
3084
3085 SDValue UnderflowCheckConst =
3086 DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);
3087
3088 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3089 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
3090 SDValue Underflow =
3091 DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);
3092
3093 R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
3094 const auto &Options = getTargetMachine().Options;
3095
3096 if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) {
3097 SDValue OverflowCheckConst =
3098 DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
3099 SDValue Overflow =
3100 DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT);
3101 SDValue Inf =
3102 DAG.getConstantFP(APFloat::getInf(APFloat::IEEEsingle()), SL, VT);
3103 R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R);
3104 }
3105
3106 return R;
3107}
3108
3109static bool isCtlzOpc(unsigned Opc) {
3110 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
3111}
3112
3113static bool isCttzOpc(unsigned Opc) {
3114 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
3115}
3116
3117SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
3118 SelectionDAG &DAG) const {
3119 auto SL = SDLoc(Op);
3120 auto Arg = Op.getOperand(0u);
3121 auto ResultVT = Op.getValueType();
3122
3123 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3124 return {};
3125
3126 assert(isCtlzOpc(Op.getOpcode()));
3127 assert(ResultVT == Arg.getValueType());
3128
3129 auto const LeadingZeroes = 32u - ResultVT.getFixedSizeInBits();
3130 auto SubVal = DAG.getConstant(LeadingZeroes, SL, MVT::i32);
3131 auto NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
3132 NewOp = DAG.getNode(Op.getOpcode(), SL, MVT::i32, NewOp);
3133 NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, SubVal);
3134 return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
3135}
3136
3137SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
3138 SDLoc SL(Op);
3139 SDValue Src = Op.getOperand(0);
3140
3141 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
3142 bool Ctlz = isCtlzOpc(Op.getOpcode());
3143 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3144
3145 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
3146 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
3147 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3148
3149 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3150 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3151 // (cttz hi:lo) -> (umin (ffbl src), 32)
3152 // (ctlz_zero_undef src) -> (ffbh src)
3153 // (cttz_zero_undef src) -> (ffbl src)
3154
3155 // The 64-bit scalar version produces a 32-bit result:
3156 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3157 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3158 // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
3159 // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
3160 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
3161 if (!ZeroUndef) {
3162 const SDValue ConstVal = DAG.getConstant(
3163 Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
3164 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
3165 }
3166 return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
3167 }
3168
3169 SDValue Lo, Hi;
3170 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3171
3172 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
3173 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
3174
3175 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3176 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3177 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3178 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
3179
3180 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3181 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
3182 if (Ctlz)
3183 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
3184 else
3185 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
3186
3187 SDValue NewOpr;
3188 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
3189 if (!ZeroUndef) {
3190 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
3191 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
3192 }
3193
3194 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
3195}
3196
3197SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
3198 bool Signed) const {
3199 // The regular method of converting a 64-bit integer to float roughly consists of
3200 // 2 steps: normalization and rounding. In fact, after normalization, the
3201 // conversion from a 64-bit integer to a float is essentially the same as the
3202 // one from a 32-bit integer. The only difference is that it has more
3203 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3204 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
3205 // converted into the correct float number. The basic steps for the unsigned
3206 // conversion are illustrated in the following pseudo code:
3207 //
3208 // f32 uitofp(i64 u) {
3209 // i32 hi, lo = split(u);
3210 // // Only count the leading zeros in hi as we have native support of the
3211 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3212 // // reduced to a 32-bit one automatically.
3213 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3214 // u <<= shamt;
3215 // hi, lo = split(u);
3216 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3217 // // convert it as a 32-bit integer and scale the result back.
3218 // return uitofp(hi) * 2^(32 - shamt);
3219 // }
3220 //
3221 // The signed one follows the same principle but uses 'ffbh_i32' to count its
3222 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
3223 // converted instead, followed by negation based on its sign bit.
3224
3225 SDLoc SL(Op);
3226 SDValue Src = Op.getOperand(0);
3227
3228 SDValue Lo, Hi;
3229 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3230 SDValue Sign;
3231 SDValue ShAmt;
3232 if (Signed && Subtarget->isGCN()) {
3233 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3234 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3235 // account. That is, the maximal shift is
3236 // - 32 if Lo and Hi have opposite signs;
3237 // - 33 if Lo and Hi have the same sign.
3238 //
3239 // Or, MaxShAmt = 33 + OppositeSign, where
3240 //
3241 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3242 // - -1 if Lo and Hi have opposite signs; and
3243 // - 0 otherwise.
3244 //
3245 // All in all, ShAmt is calculated as
3246 //
3247 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3248 //
3249 // or
3250 //
3251 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3252 //
3253 // to reduce the critical path.
3254 SDValue OppositeSign = DAG.getNode(
3255 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
3256 DAG.getConstant(31, SL, MVT::i32));
3257 SDValue MaxShAmt =
3258 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3259 OppositeSign);
3260 // Count the leading sign bits.
3261 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
3262 // Unlike the unsigned conversion, the shift should be one bit less to
3263 // preserve the sign bit.
3264 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
3265 DAG.getConstant(1, SL, MVT::i32));
3266 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
3267 } else {
3268 if (Signed) {
3269 // Without 'ffbh_i32', only leading zeros can be counted. Take the
3270 // absolute value first.
3271 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
3272 DAG.getConstant(63, SL, MVT::i64));
3273 SDValue Abs =
3274 DAG.getNode(ISD::XOR, SL, MVT::i64,
3275 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
3276 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
3277 }
3278 // Count the leading zeros.
3279 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
3280 // The shift amount for signed integers is [0, 32].
3281 }
3282 // Normalize the given 64-bit integer.
3283 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
3284 // Split it again.
3285 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
3286 // Calculate the adjust bit for rounding.
3287 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3288 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
3289 DAG.getConstant(1, SL, MVT::i32), Lo);
3290 // Get the 32-bit normalized integer.
3291 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
3292 // Convert the normalized 32-bit integer into f32.
3293 unsigned Opc =
3294 (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3295 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
3296
3297 // Finally, we need to scale the converted floating-point number back, as the
3298 // original 64-bit integer was converted as a 32-bit one.
3299 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3300 ShAmt);
3301 // On GCN, use LDEXP directly.
3302 if (Subtarget->isGCN())
3303 return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
3304
3305 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
3306 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
3307 // exponent is enough to avoid overflowing into the sign bit.
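// Sketch: for a normal f32 value v, adding (ShAmt << 23) to its bit
// pattern increments the biased exponent field (bits [30:23]) by ShAmt,
// which reinterprets as v * 2**ShAmt.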
3308 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
3309 DAG.getConstant(23, SL, MVT::i32));
3310 SDValue IVal =
3311 DAG.getNode(ISD::ADD, SL, MVT::i32,
3312 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
3313 if (Signed) {
3314 // Set the sign bit.
3315 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
3316 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
3317 DAG.getConstant(31, SL, MVT::i32));
3318 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
3319 }
3320 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
3321}
3322
3323SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
3324 bool Signed) const {
3325 SDLoc SL(Op);
3326 SDValue Src = Op.getOperand(0);
3327
3328 SDValue Lo, Hi;
3329 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3330
3331 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
3332 SL, MVT::f64, Hi);
3333
3333
3334 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
3335
3336 SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
3337 DAG.getConstant(32, SL, MVT::i32));
3338 // TODO: Should this propagate fast-math-flags?
3339 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
3340}
3341
3342SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
3343 SelectionDAG &DAG) const {
3344 // TODO: Factor out code common with LowerSINT_TO_FP.
3345 EVT DestVT = Op.getValueType();
3346 SDValue Src = Op.getOperand(0);
3347 EVT SrcVT = Src.getValueType();
3348
3349 if (SrcVT == MVT::i16) {
3350 if (DestVT == MVT::f16)
3351 return Op;
3352 SDLoc DL(Op);
3353
3354 // Promote src to i32
3355 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
3356 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
3357 }
3358
3359 if (DestVT == MVT::bf16) {
3360 SDLoc SL(Op);
3361 SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src);
3362 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3363 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3364 }
3365
3366 if (SrcVT != MVT::i64)
3367 return Op;
3368
3369 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3370 SDLoc DL(Op);
3371
3372 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3373 SDValue FPRoundFlag =
3374 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3375 SDValue FPRound =
3376 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3377
3378 return FPRound;
3379 }
3380
3381 if (DestVT == MVT::f32)
3382 return LowerINT_TO_FP32(Op, DAG, false);
3383
3384 assert(DestVT == MVT::f64);
3385 return LowerINT_TO_FP64(Op, DAG, false);
3386}
3387
3388SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
3389 SelectionDAG &DAG) const {
3390 EVT DestVT = Op.getValueType();
3391
3392 SDValue Src = Op.getOperand(0);
3393 EVT SrcVT = Src.getValueType();
3394
3395 if (SrcVT == MVT::i16) {
3396 if (DestVT == MVT::f16)
3397 return Op;
3398
3399 SDLoc DL(Op);
3400 // Promote src to i32
3401 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
3402 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
3403 }
3404
3405 if (DestVT == MVT::bf16) {
3406 SDLoc SL(Op);
3407 SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src);
3408 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3409 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3410 }
3411
3412 if (SrcVT != MVT::i64)
3413 return Op;
3414
3415 // TODO: Factor out code common with LowerUINT_TO_FP.
3416
3417 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3418 SDLoc DL(Op);
3419 SDValue Src = Op.getOperand(0);
3420
3421 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3422 SDValue FPRoundFlag =
3423 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3424 SDValue FPRound =
3425 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3426
3427 return FPRound;
3428 }
3429
3430 if (DestVT == MVT::f32)
3431 return LowerINT_TO_FP32(Op, DAG, true);
3432
3433 assert(DestVT == MVT::f64);
3434 return LowerINT_TO_FP64(Op, DAG, true);
3435}
3436
3437 SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
3438 bool Signed) const {
3439 SDLoc SL(Op);
3440
3441 SDValue Src = Op.getOperand(0);
3442 EVT SrcVT = Src.getValueType();
3443
3444 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3445
3446 // The basic idea of converting a floating point number into a pair of 32-bit
3447 // integers is illustrated as follows:
3448 //
3449 // tf := trunc(val);
3450 // hif := floor(tf * 2^-32);
3451 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3452 // hi := fptoi(hif);
3453 // lo := fptoi(lof);
3454 //
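// Worked example (illustrative): for val = 2^40 + 7, tf = 2^40 + 7,
// hif = floor(tf * 2^-32) = 256 and lof = tf - 256 * 2^32 = 7, giving
// hi = 256 and lo = 7.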
3455 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
3456 SDValue Sign;
3457 if (Signed && SrcVT == MVT::f32) {
3458 // However, a 32-bit floating point number has only a 23-bit mantissa, which
3459 // is not enough to hold all the significant bits of `lof` if val is
3460 // negative. To avoid the loss of precision, we take the absolute
3461 // value after truncating and flip the result back based on the original
3462 // signedness.
3463 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
3464 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
3465 DAG.getConstant(31, SL, MVT::i32));
3466 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
3467 }
3468
3469 SDValue K0, K1;
3470 if (SrcVT == MVT::f64) {
3471 K0 = DAG.getConstantFP(
3472 llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
3473 SrcVT);
3474 K1 = DAG.getConstantFP(
3475 llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
3476 SrcVT);
3477 } else {
3478 K0 = DAG.getConstantFP(
3479 llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
3480 K1 = DAG.getConstantFP(
3481 llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
3482 }
3483 // TODO: Should this propagate fast-math-flags?
3484 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
3485
3486 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
3487
3488 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
3489
3490 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3491 : ISD::FP_TO_UINT,
3492 SL, MVT::i32, FloorMul);
3493 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
3494
3495 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3496 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
3497
3498 if (Signed && SrcVT == MVT::f32) {
3499 assert(Sign);
3500 // Flip the result based on the signedness, which is either all 0s or 1s.
3501 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3502 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
3503 // r := xor(r, sign) - sign;
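// (Sign is either all zeros or all ones here, so xor-then-subtract either
// leaves the value unchanged or computes its two's complement negation.)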
3504 Result =
3505 DAG.getNode(ISD::SUB, SL, MVT::i64,
3506 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
3507 }
3508
3509 return Result;
3510}
3511
3512 SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
3513 SDLoc DL(Op);
3514 SDValue N0 = Op.getOperand(0);
3515
3516 // Convert to target node to get known bits
3517 if (N0.getValueType() == MVT::f32)
3518 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
3519
3520 if (getTargetMachine().Options.UnsafeFPMath) {
3521 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3522 return SDValue();
3523 }
3524
3525 assert(N0.getSimpleValueType() == MVT::f64);
3526
3527 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3528 const unsigned ExpMask = 0x7ff;
3529 const unsigned ExpBiasf64 = 1023;
3530 const unsigned ExpBiasf16 = 15;
3531 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
3532 SDValue One = DAG.getConstant(1, DL, MVT::i32);
3533 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
3534 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
3535 DAG.getConstant(32, DL, MVT::i64));
3536 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
3537 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
3538 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3539 DAG.getConstant(20, DL, MVT::i64));
3540 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
3541 DAG.getConstant(ExpMask, DL, MVT::i32));
3542 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3543 // add the f16 bias (15) to get the biased exponent for the f16 format.
3544 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
3545 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
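// e.g. for 1.0 the biased f64 exponent is 1023, so E becomes 15 here, exactly
// the f16 bias.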
3546
3547 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3548 DAG.getConstant(8, DL, MVT::i32));
3549 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
3550 DAG.getConstant(0xffe, DL, MVT::i32));
3551
3552 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
3553 DAG.getConstant(0x1ff, DL, MVT::i32));
3554 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
3555
3556 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
3557 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
3558
3559 // (M != 0 ? 0x0200 : 0) | 0x7c00;
3560 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
3561 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
3562 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
3563
3564 // N = M | (E << 12);
3565 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3566 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
3567 DAG.getConstant(12, DL, MVT::i32)));
3568
3569 // B = clamp(1-E, 0, 13);
3570 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
3571 One, E);
3572 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
3573 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
3574 DAG.getConstant(13, DL, MVT::i32));
3575
3576 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3577 DAG.getConstant(0x1000, DL, MVT::i32));
3578
3579 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
3580 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
3581 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
3582 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
3583
3584 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
3585 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
3586 DAG.getConstant(0x7, DL, MVT::i32));
3587 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
3588 DAG.getConstant(2, DL, MVT::i32));
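// Round to nearest even using the two bits just shifted out: round up when
// they are '11' (VLow3 == 3 or 7), or '10' with an odd kept LSB (VLow3 == 6).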
3589 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
3590 One, Zero, ISD::SETEQ);
3591 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
3592 One, Zero, ISD::SETGT);
3593 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
3594 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
3595
3596 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
3597 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
3598 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
3599 I, V, ISD::SETEQ);
3600
3601 // Extract the sign bit.
3602 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3603 DAG.getConstant(16, DL, MVT::i32));
3604 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
3605 DAG.getConstant(0x8000, DL, MVT::i32));
3606
3607 V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
3608 return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
3609}
3610
3611 SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op,
3612 SelectionDAG &DAG) const {
3613 SDValue Src = Op.getOperand(0);
3614 unsigned OpOpcode = Op.getOpcode();
3615 EVT SrcVT = Src.getValueType();
3616 EVT DestVT = Op.getValueType();
3617
3618 // Will be selected natively
3619 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3620 return Op;
3621
3622 if (SrcVT == MVT::bf16) {
3623 SDLoc DL(Op);
3624 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
3625 return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc);
3626 }
3627
3628 // Promote i16 to i32
3629 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3630 SDLoc DL(Op);
3631
3632 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3633 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
3634 }
3635
3636 if (DestVT != MVT::i64)
3637 return Op;
3638
3639 if (SrcVT == MVT::f16 ||
3640 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3641 SDLoc DL(Op);
3642
3643 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3644 unsigned Ext =
3645 OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3646 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
3647 }
3648
3649 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3650 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
3651
3652 return SDValue();
3653}
3654
3655 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
3656 SelectionDAG &DAG) const {
3657 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3658 MVT VT = Op.getSimpleValueType();
3659 MVT ScalarVT = VT.getScalarType();
3660
3661 assert(VT.isVector());
3662
3663 SDValue Src = Op.getOperand(0);
3664 SDLoc DL(Op);
3665
3666 // TODO: Don't scalarize on Evergreen?
3667 unsigned NElts = VT.getVectorNumElements();
3668 SmallVector<SDValue, 8> Args;
3669 DAG.ExtractVectorElements(Src, Args, 0, NElts);
3670
3671 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
3672 for (unsigned I = 0; I < NElts; ++I)
3673 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
3674
3675 return DAG.getBuildVector(VT, DL, Args);
3676}
3677
3678//===----------------------------------------------------------------------===//
3679// Custom DAG optimizations
3680//===----------------------------------------------------------------------===//
3681
3682static bool isU24(SDValue Op, SelectionDAG &DAG) {
3683 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
3684}
3685
3686static bool isI24(SDValue Op, SelectionDAG &DAG) {
3687 EVT VT = Op.getValueType();
3688 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
3689 // as unsigned 24-bit values.
3690 AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
3691}
3692
3693 static SDValue simplifyMul24(SDNode *Node24,
3694 TargetLowering::DAGCombinerInfo &DCI) {
3695 SelectionDAG &DAG = DCI.DAG;
3696 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3697 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
3698
3699 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
3700 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
3701 unsigned NewOpcode = Node24->getOpcode();
3702 if (IsIntrin) {
3703 unsigned IID = Node24->getConstantOperandVal(0);
3704 switch (IID) {
3705 case Intrinsic::amdgcn_mul_i24:
3706 NewOpcode = AMDGPUISD::MUL_I24;
3707 break;
3708 case Intrinsic::amdgcn_mul_u24:
3709 NewOpcode = AMDGPUISD::MUL_U24;
3710 break;
3711 case Intrinsic::amdgcn_mulhi_i24:
3712 NewOpcode = AMDGPUISD::MULHI_I24;
3713 break;
3714 case Intrinsic::amdgcn_mulhi_u24:
3715 NewOpcode = AMDGPUISD::MULHI_U24;
3716 break;
3717 default:
3718 llvm_unreachable("Expected 24-bit mul intrinsic");
3719 }
3720 }
3721
3722 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
3723
3724 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
3725 // the operands to have other uses, but will only perform simplifications that
3726 // involve bypassing some nodes for this user.
3727 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
3728 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
3729 if (DemandedLHS || DemandedRHS)
3730 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
3731 DemandedLHS ? DemandedLHS : LHS,
3732 DemandedRHS ? DemandedRHS : RHS);
3733
3734 // Now try SimplifyDemandedBits which can simplify the nodes used by our
3735 // operands if this node is the only user.
3736 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
3737 return SDValue(Node24, 0);
3738 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
3739 return SDValue(Node24, 0);
3740
3741 return SDValue();
3742}
3743
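// Constant fold a bitfield extract by shifting the field to the top of the
// 32-bit word and back down, so the sign or zero fill falls out naturally;
// e.g. Offset = 4, Width = 8 extracts bits 4..11 of Src0.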
3744template <typename IntTy>
3745 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
3746 uint32_t Width, const SDLoc &DL) {
3747 if (Width + Offset < 32) {
3748 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
3749 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
3750 return DAG.getConstant(Result, DL, MVT::i32);
3751 }
3752
3753 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
3754}
3755
3756static bool hasVolatileUser(SDNode *Val) {
3757 for (SDNode *U : Val->uses()) {
3758 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
3759 if (M->isVolatile())
3760 return true;
3761 }
3762 }
3763
3764 return false;
3765}
3766
3767 bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
3768 // i32 vectors are the canonical memory type.
3769 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
3770 return false;
3771
3772 if (!VT.isByteSized())
3773 return false;
3774
3775 unsigned Size = VT.getStoreSize();
3776
3777 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
3778 return false;
3779
3780 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
3781 return false;
3782
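// Anything remaining, e.g. a 4-byte v4i8 or an 8-byte v8i8, is worth
// rewriting in terms of 32-bit integers.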
3783 return true;
3784}
3785
3786 // Replace a load of an illegal type with a load of a bitcast to a friendlier
3787// type.
3788 SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
3789 DAGCombinerInfo &DCI) const {
3790 if (!DCI.isBeforeLegalize())
3791 return SDValue();
3792
3793 LoadSDNode *LN = cast<LoadSDNode>(N);
3794 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
3795 return SDValue();
3796
3797 SDLoc SL(N);
3798 SelectionDAG &DAG = DCI.DAG;
3799 EVT VT = LN->getMemoryVT();
3800
3801 unsigned Size = VT.getStoreSize();
3802 Align Alignment = LN->getAlign();
3803 if (Alignment < Size && isTypeLegal(VT)) {
3804 unsigned IsFast;
3805 unsigned AS = LN->getAddressSpace();
3806
3807 // Expand unaligned loads earlier than legalization. Due to visitation order
3808 // problems during legalization, the emitted instructions to pack and unpack
3809 // the bytes again are not eliminated in the case of an unaligned copy.
3810 if (!allowsMisalignedMemoryAccesses(
3811 VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
3812 if (VT.isVector())
3813 return SplitVectorLoad(SDValue(LN, 0), DAG);
3814
3815 SDValue Ops[2];
3816 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
3817
3818 return DAG.getMergeValues(Ops, SDLoc(N));
3819 }
3820
3821 if (!IsFast)
3822 return SDValue();
3823 }
3824
3825 if (!shouldCombineMemoryType(VT))
3826 return SDValue();
3827
3828 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3829
3830 SDValue NewLoad
3831 = DAG.getLoad(NewVT, SL, LN->getChain(),
3832 LN->getBasePtr(), LN->getMemOperand());
3833
3834 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
3835 DCI.CombineTo(N, BC, NewLoad.getValue(1));
3836 return SDValue(N, 0);
3837}
3838
3839// Replace store of an illegal type with a store of a bitcast to a friendlier
3840// type.
3841 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3842 DAGCombinerInfo &DCI) const {
3843 if (!DCI.isBeforeLegalize())
3844 return SDValue();
3845
3846 StoreSDNode *SN = cast<StoreSDNode>(N);
3847 if (!SN->isSimple() || !ISD::isNormalStore(SN))
3848 return SDValue();
3849
3850 EVT VT = SN->getMemoryVT();
3851 unsigned Size = VT.getStoreSize();
3852
3853 SDLoc SL(N);
3854 SelectionDAG &DAG = DCI.DAG;
3855 Align Alignment = SN->getAlign();
3856 if (Alignment < Size && isTypeLegal(VT)) {
3857 unsigned IsFast;
3858 unsigned AS = SN->getAddressSpace();
3859
3860 // Expand unaligned stores earlier than legalization. Due to visitation
3861 // order problems during legalization, the emitted instructions to pack and
3862 // unpack the bytes again are not eliminated in the case of an unaligned
3863 // copy.
3864 if (!allowsMisalignedMemoryAccesses(
3865 VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
3866 if (VT.isVector())
3867 return SplitVectorStore(SDValue(SN, 0), DAG);
3868
3869 return expandUnalignedStore(SN, DAG);
3870 }
3871
3872 if (!IsFast)
3873 return SDValue();
3874 }
3875
3876 if (!shouldCombineMemoryType(VT))
3877 return SDValue();
3878
3879 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3880 SDValue Val = SN->getValue();
3881
3882 //DCI.AddToWorklist(Val.getNode());
3883
3884 bool OtherUses = !Val.hasOneUse();
3885 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
3886 if (OtherUses) {
3887 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
3888 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
3889 }
3890
3891 return DAG.getStore(SN->getChain(), SL, CastVal,
3892 SN->getBasePtr(), SN->getMemOperand());
3893}
3894
3895// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
3896// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
3897// issues.
3898 SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
3899 DAGCombinerInfo &DCI) const {
3900 SelectionDAG &DAG = DCI.DAG;
3901 SDValue N0 = N->getOperand(0);
3902
3903 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
3904 // (vt2 (truncate (assertzext vt0:x, vt1)))
3905 if (N0.getOpcode() == ISD::TRUNCATE) {
3906 SDValue N1 = N->getOperand(1);
3907 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
3908 SDLoc SL(N);
3909
3910 SDValue Src = N0.getOperand(0);
3911 EVT SrcVT = Src.getValueType();
3912 if (SrcVT.bitsGE(ExtVT)) {
3913 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
3914 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
3915 }
3916 }
3917
3918 return SDValue();
3919}
3920
3921 SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
3922 SDNode *N, DAGCombinerInfo &DCI) const {
3923 unsigned IID = N->getConstantOperandVal(0);
3924 switch (IID) {
3925 case Intrinsic::amdgcn_mul_i24:
3926 case Intrinsic::amdgcn_mul_u24:
3927 case Intrinsic::amdgcn_mulhi_i24:
3928 case Intrinsic::amdgcn_mulhi_u24:
3929 return simplifyMul24(N, DCI);
3930 case Intrinsic::amdgcn_fract:
3931 case Intrinsic::amdgcn_rsq:
3932 case Intrinsic::amdgcn_rcp_legacy:
3933 case Intrinsic::amdgcn_rsq_legacy:
3934 case Intrinsic::amdgcn_rsq_clamp: {
3935 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
3936 SDValue Src = N->getOperand(1);
3937 return Src.isUndef() ? Src : SDValue();
3938 }
3939 case Intrinsic::amdgcn_frexp_exp: {
3940 // frexp_exp (fneg x) -> frexp_exp x
3941 // frexp_exp (fabs x) -> frexp_exp x
3942 // frexp_exp (fneg (fabs x)) -> frexp_exp x
3943 SDValue Src = N->getOperand(1);
3944 SDValue PeekSign = peekFPSignOps(Src);
3945 if (PeekSign == Src)
3946 return SDValue();
3947 return SDValue(DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), PeekSign),
3948 0);
3949 }
3950 default:
3951 return SDValue();
3952 }
3953}
3954
3955/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
3956/// binary operation \p Opc to it with the corresponding constant operands.
3957 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
3958 DAGCombinerInfo &DCI, const SDLoc &SL,
3959 unsigned Opc, SDValue LHS,
3960 uint32_t ValLo, uint32_t ValHi) const {
3961 SelectionDAG &DAG = DCI.DAG;
3962 SDValue Lo, Hi;
3963 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
3964
3965 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
3966 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
3967
3968 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
3969 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
3970
3971 // Re-visit the ands. It's possible we eliminated one of them and it could
3972 // simplify the vector.
3973 DCI.AddToWorklist(Lo.getNode());
3974 DCI.AddToWorklist(Hi.getNode());
3975
3976 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
3977 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3978}
3979
3980 SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
3981 DAGCombinerInfo &DCI) const {
3982 EVT VT = N->getValueType(0);
3983
3984 ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3985 if (!RHS)
3986 return SDValue();
3987
3988 SDValue LHS = N->getOperand(0);
3989 unsigned RHSVal = RHS->getZExtValue();
3990 if (!RHSVal)
3991 return LHS;
3992
3993 SDLoc SL(N);
3994 SelectionDAG &DAG = DCI.DAG;
3995
3996 switch (LHS->getOpcode()) {
3997 default:
3998 break;
3999 case ISD::ZERO_EXTEND:
4000 case ISD::SIGN_EXTEND:
4001 case ISD::ANY_EXTEND: {
4002 SDValue X = LHS->getOperand(0);
4003
4004 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
4005 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
4006 // Prefer build_vector as the canonical form if packed types are legal.
4007 // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
4008 SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
4009 { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
4010 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
4011 }
4012
4013 // shl (ext x) => zext (shl x), if shift does not overflow int
4014 if (VT != MVT::i64)
4015 break;
4016 KnownBits Known = DAG.computeKnownBits(X);
4017 unsigned LZ = Known.countMinLeadingZeros();
4018 if (LZ < RHSVal)
4019 break;
4020 EVT XVT = X.getValueType();
4021 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
4022 return DAG.getZExtOrTrunc(Shl, SL, VT);
4023 }
4024 }
4025
4026 if (VT != MVT::i64)
4027 return SDValue();
4028
4029 // i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))
4030
4031 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4032 // common case, splitting this into a move and a 32-bit shift is faster and
4033 // the same code size.
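// e.g. (shl i64:x, 40) becomes (build_pair 0, (shl (i32 (trunc x)), 8)).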
4034 if (RHSVal < 32)
4035 return SDValue();
4036
4037 SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
4038
4039 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
4040 SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
4041
4042 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
4043
4044 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
4045 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
4046}
4047
4048 SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
4049 DAGCombinerInfo &DCI) const {
4050 if (N->getValueType(0) != MVT::i64)
4051 return SDValue();
4052
4053 const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
4054 if (!RHS)
4055 return SDValue();
4056
4057 SelectionDAG &DAG = DCI.DAG;
4058 SDLoc SL(N);
4059 unsigned RHSVal = RHS->getZExtValue();
4060
4061 // (sra i64:x, 32) -> build_pair hi_32(x), (sra hi_32(x), 31)
4062 if (RHSVal == 32) {
4063 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
4064 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
4065 DAG.getConstant(31, SL, MVT::i32));
4066
4067 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
4068 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
4069 }
4070
4071 // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
4072 if (RHSVal == 63) {
4073 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
4074 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
4075 DAG.getConstant(31, SL, MVT::i32));
4076 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
4077 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
4078 }
4079
4080 return SDValue();
4081}
4082
4083 SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
4084 DAGCombinerInfo &DCI) const {
4085 auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
4086 if (!RHS)
4087 return SDValue();
4088
4089 EVT VT = N->getValueType(0);
4090 SDValue LHS = N->getOperand(0);
4091 unsigned ShiftAmt = RHS->getZExtValue();
4092 SelectionDAG &DAG = DCI.DAG;
4093 SDLoc SL(N);
4094
4095 // fold (srl (and x, c1 << c2), c2) -> (and (srl x, c2), c1)
4096 // This improves the ability to match BFE patterns in isel.
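// e.g. (srl (and x, 0xff0), 4) becomes (and (srl x, 4), 0xff).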
4097 if (LHS.getOpcode() == ISD::AND) {
4098 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
4099 unsigned MaskIdx, MaskLen;
4100 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
4101 MaskIdx == ShiftAmt) {
4102 return DAG.getNode(
4103 ISD::AND, SL, VT,
4104 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
4105 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
4106 }
4107 }
4108 }
4109
4110 if (VT != MVT::i64)
4111 return SDValue();
4112
4113 if (ShiftAmt < 32)
4114 return SDValue();
4115
4116 // srl i64:x, C for C >= 32
4117 // =>
4118 // build_pair (srl hi_32(x), C - 32), 0
4119 SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
4120
4121 SDValue Hi = getHiHalf64(LHS, DAG);
4122
4123 SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
4124 SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
4125
4126 SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
4127
4128 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
4129}
4130
4131 SDValue AMDGPUTargetLowering::performTruncateCombine(
4132 SDNode *N, DAGCombinerInfo &DCI) const {
4133 SDLoc SL(N);
4134 SelectionDAG &DAG = DCI.DAG;
4135 EVT VT = N->getValueType(0);
4136 SDValue Src = N->getOperand(0);
4137
4138 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
4139 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
4140 SDValue Vec = Src.getOperand(0);
4141 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
4142 SDValue Elt0 = Vec.getOperand(0);
4143 EVT EltVT = Elt0.getValueType();
4144 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
4145 if (EltVT.isFloatingPoint()) {
4146 Elt0 = DAG.getNode(ISD::BITCAST, SL,
4147 EltVT.changeTypeToInteger(), Elt0);
4148 }
4149
4150 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
4151 }
4152 }
4153 }
4154
4155 // Equivalent of above for accessing the high element of a vector as an
4156 // integer operation.
4157 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
4158 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
4159 if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
4160 if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
4161 SDValue BV = stripBitcast(Src.getOperand(0));
4162 if (BV.getOpcode() == ISD::BUILD_VECTOR &&
4163 BV.getValueType().getVectorNumElements() == 2) {
4164 SDValue SrcElt = BV.getOperand(1);
4165 EVT SrcEltVT = SrcElt.getValueType();
4166 if (SrcEltVT.isFloatingPoint()) {
4167 SrcElt = DAG.getNode(ISD::BITCAST, SL,
4168 SrcEltVT.changeTypeToInteger(), SrcElt);
4169 }
4170
4171 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
4172 }
4173 }
4174 }
4175 }
4176
4177 // Partially shrink 64-bit shifts to 32-bit if the result is truncated to 16-bit.
4178 //
4179 // i16 (trunc (srl i64:x, K)), K <= 16 ->
4180 // i16 (trunc (srl (i32 (trunc x), K)))
4181 if (VT.getScalarSizeInBits() < 32) {
4182 EVT SrcVT = Src.getValueType();
4183 if (SrcVT.getScalarSizeInBits() > 32 &&
4184 (Src.getOpcode() == ISD::SRL ||
4185 Src.getOpcode() == ISD::SRA ||
4186 Src.getOpcode() == ISD::SHL)) {
4187 SDValue Amt = Src.getOperand(1);
4188 KnownBits Known = DAG.computeKnownBits(Amt);
4189
4190 // - For left shifts, do the transform as long as the shift
4191 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
4192 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
4193 // losing information stored in the high bits when truncating.
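// e.g. i16 (trunc (srl i64:x, 16)) only needs bits 16..31 of x, which survive
// in (srl (i32 (trunc x)), 16).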
4194 const unsigned MaxCstSize =
4195 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
4196 if (Known.getMaxValue().ule(MaxCstSize)) {
4197 EVT MidVT = VT.isVector() ?
4198 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
4199 VT.getVectorNumElements()) : MVT::i32;
4200
4201 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
4202 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
4203 Src.getOperand(0));
4204 DCI.AddToWorklist(Trunc.getNode());
4205
4206 if (Amt.getValueType() != NewShiftVT) {
4207 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
4208 DCI.AddToWorklist(Amt.getNode());
4209 }
4210
4211 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
4212 Trunc, Amt);
4213 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
4214 }
4215 }
4216 }
4217
4218 return SDValue();
4219}
4220
4221// We need to specifically handle i64 mul here to avoid unnecessary conversion
4222// instructions. If we only match on the legalized i64 mul expansion,
4223// SimplifyDemandedBits will be unable to remove them because there will be
4224// multiple uses due to the separate mul + mulh[su].
4225static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
4226 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
4227 if (Size <= 32) {
4228 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4229 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
4230 }
4231
4232 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4233 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
4234
4235 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
4236 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
4237
4238 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
4239}
4240
4241/// If \p V is an add of a constant 1, returns the other operand. Otherwise
4242/// return SDValue().
4243static SDValue getAddOneOp(const SDNode *V) {
4244 if (V->getOpcode() != ISD::ADD)
4245 return SDValue();
4246
4247 return isOneConstant(V->getOperand(1)) ? V->getOperand(0) : SDValue();
4248}
4249
4250 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
4251 DAGCombinerInfo &DCI) const {
4252 assert(N->getOpcode() == ISD::MUL);
4253 EVT VT = N->getValueType(0);
4254
4255 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4256 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4257 // unnecessarily). isDivergent() is used as an approximation of whether the
4258 // value is in an SGPR.
4259 if (!N->isDivergent())
4260 return SDValue();
4261
4262 unsigned Size = VT.getSizeInBits();
4263 if (VT.isVector() || Size > 64)
4264 return SDValue();
4265
4266 SelectionDAG &DAG = DCI.DAG;
4267 SDLoc DL(N);
4268
4269 SDValue N0 = N->getOperand(0);
4270 SDValue N1 = N->getOperand(1);
4271
4272 // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
4273 // matching.
4274
4275 // mul x, (add y, 1) -> add (mul x, y), x
4276 auto IsFoldableAdd = [](SDValue V) -> SDValue {
4277 SDValue AddOp = getAddOneOp(V.getNode());
4278 if (!AddOp)
4279 return SDValue();
4280
4281 if (V.hasOneUse() || all_of(V->uses(), [](const SDNode *U) -> bool {
4282 return U->getOpcode() == ISD::MUL;
4283 }))
4284 return AddOp;
4285
4286 return SDValue();
4287 };
4288
4289 // FIXME: The selection pattern is not properly checking for commuted
4290 // operands, so we have to place the mul in the LHS
4291 if (SDValue MulOper = IsFoldableAdd(N0)) {
4292 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper);
4293 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1);
4294 }
4295
4296 if (SDValue MulOper = IsFoldableAdd(N1)) {
4297 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper);
4298 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0);
4299 }
4300
4301 // There are i16 integer mul/mad.
4302 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
4303 return SDValue();
4304
4305 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4306 // in the source into any_extends if the result of the mul is truncated. Since
4307 // we can assume the high bits are whatever we want, use the underlying value
4308 // to avoid the unknown high bits from interfering.
4309 if (N0.getOpcode() == ISD::ANY_EXTEND)
4310 N0 = N0.getOperand(0);
4311
4312 if (N1.getOpcode() == ISD::ANY_EXTEND)
4313 N1 = N1.getOperand(0);
4314
4315 SDValue Mul;
4316
4317 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4318 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4319 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4320 Mul = getMul24(DAG, DL, N0, N1, Size, false);
4321 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4322 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4323 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4324 Mul = getMul24(DAG, DL, N0, N1, Size, true);
4325 } else {
4326 return SDValue();
4327 }
4328
4329 // We need to use sext even for MUL_U24, because MUL_U24 is used
4330 // for signed multiply of 8 and 16-bit types.
4331 return DAG.getSExtOrTrunc(Mul, DL, VT);
4332}
4333
4334SDValue
4335 AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
4336 DAGCombinerInfo &DCI) const {
4337 if (N->getValueType(0) != MVT::i32)
4338 return SDValue();
4339
4340 SelectionDAG &DAG = DCI.DAG;
4341 SDLoc DL(N);
4342
4343 SDValue N0 = N->getOperand(0);
4344 SDValue N1 = N->getOperand(1);
4345
4346 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4347 // in the source into any_extends if the result of the mul is truncated. Since
4348 // we can assume the high bits are whatever we want, use the underlying value
4349 // to avoid the unknown high bits from interfering.
4350 if (N0.getOpcode() == ISD::ANY_EXTEND)
4351 N0 = N0.getOperand(0);
4352 if (N1.getOpcode() == ISD::ANY_EXTEND)
4353 N1 = N1.getOperand(0);
4354
4355 // Try to use two fast 24-bit multiplies (one for each half of the result)
4356 // instead of one slow extending multiply.
4357 unsigned LoOpcode, HiOpcode;
4358 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4359 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4360 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4361 LoOpcode = AMDGPUISD::MUL_U24;
4362 HiOpcode = AMDGPUISD::MULHI_U24;
4363 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4364 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4365 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4366 LoOpcode = AMDGPUISD::MUL_I24;
4367 HiOpcode = AMDGPUISD::MULHI_I24;
4368 } else {
4369 return SDValue();
4370 }
4371
4372 SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
4373 SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
4374 DCI.CombineTo(N, Lo, Hi);
4375 return SDValue(N, 0);
4376}
4377
4378 SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
4379 DAGCombinerInfo &DCI) const {
4380 EVT VT = N->getValueType(0);
4381
4382 if (!Subtarget->hasMulI24() || VT.isVector())
4383 return SDValue();
4384
4385 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4386 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4387 // unnecessarily). isDivergent() is used as an approximation of whether the
4388 // value is in an SGPR.
4389 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4390 // valu op anyway)
4391 if (Subtarget->hasSMulHi() && !N->isDivergent())
4392 return SDValue();
4393
4394 SelectionDAG &DAG = DCI.DAG;
4395 SDLoc DL(N);
4396
4397 SDValue N0 = N->getOperand(0);
4398 SDValue N1 = N->getOperand(1);
4399
4400 if (!isI24(N0, DAG) || !isI24(N1, DAG))
4401 return SDValue();
4402
4403 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4404 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4405
4406 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
4407 DCI.AddToWorklist(Mulhi.getNode());
4408 return DAG.getSExtOrTrunc(Mulhi, DL, VT);
4409}
4410
4411 SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
4412 DAGCombinerInfo &DCI) const {
4413 EVT VT = N->getValueType(0);
4414
4415 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
4416 return SDValue();
4417
4418 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4419 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4420 // unnecessarily). isDivergent() is used as an approximation of whether the
4421 // value is in an SGPR.
4422 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4423 // valu op anyway)
4424 if (Subtarget->hasSMulHi() && !N->isDivergent())
4425 return SDValue();
4426
4427 SelectionDAG &DAG = DCI.DAG;
4428 SDLoc DL(N);
4429
4430 SDValue N0 = N->getOperand(0);
4431 SDValue N1 = N->getOperand(1);
4432
4433 if (!isU24(N0, DAG) || !isU24(N1, DAG))
4434 return SDValue();
4435
4436 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4437 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4438
4439 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
4440 DCI.AddToWorklist(Mulhi.getNode());
4441 return DAG.getZExtOrTrunc(Mulhi, DL, VT);
4442}
4443
4444SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
4445 SDValue Op,
4446 const SDLoc &DL,
4447 unsigned Opc) const {
4448 EVT VT = Op.getValueType();
4449 EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
4450 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
4451 LegalVT != MVT::i16))
4452 return SDValue();
4453
4454 if (VT != MVT::i32)
4455 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
4456
4457 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
4458 if (VT != MVT::i32)
4459 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
4460
4461 return FFBX;
4462}
4463
4464// The native instructions return -1 on 0 input. Optimize out a select that
4465// produces -1 on 0.
4466//
4467// TODO: If zero is not undef, we could also do this if the output is compared
4468// against the bitwidth.
4469//
4470// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
4471 SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
4472 SDValue LHS, SDValue RHS,
4473 DAGCombinerInfo &DCI) const {
4474 if (!isNullConstant(Cond.getOperand(1)))
4475 return SDValue();
4476
4477 SelectionDAG &DAG = DCI.DAG;
4478 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
4479 SDValue CmpLHS = Cond.getOperand(0);
4480
4481 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
4482 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
4483 if (CCOpcode == ISD::SETEQ &&
4484 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
4485 RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) {
4486 unsigned Opc =
4487 isCtlzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
4488 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4489 }
4490
4491 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
4492 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
4493 if (CCOpcode == ISD::SETNE &&
4494 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
4495 LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) {
4496 unsigned Opc =
4497 isCtlzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
4498
4499 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4500 }
4501
4502 return SDValue();
4503}
4504
4505 static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
4506 unsigned Op,
4507 const SDLoc &SL,
4508 SDValue Cond,
4509 SDValue N1,
4510 SDValue N2) {
4511 SelectionDAG &DAG = DCI.DAG;
4512 EVT VT = N1.getValueType();
4513
4514 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
4515 N1.getOperand(0), N2.getOperand(0));
4516 DCI.AddToWorklist(NewSelect.getNode());
4517 return DAG.getNode(Op, SL, VT, NewSelect);
4518}
4519
4520// Pull a free FP operation out of a select so it may fold into uses.
4521//
4522// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
4523// select c, (fneg x), k -> fneg (select c, x, (fneg k))
4524//
4525// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
4526// select c, (fabs x), +k -> fabs (select c, x, k)
4527SDValue
4528 AMDGPUTargetLowering::foldFreeOpFromSelect(DAGCombinerInfo &DCI,
4529 SDValue N) const {
4530 SelectionDAG &DAG = DCI.DAG;
4531 SDValue Cond = N.getOperand(0);
4532 SDValue LHS = N.getOperand(1);
4533 SDValue RHS = N.getOperand(2);
4534
4535 EVT VT = N.getValueType();
4536 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
4537 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
4538 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
4539 return SDValue();
4540
4541 return distributeOpThroughSelect(DCI, LHS.getOpcode(),
4542 SDLoc(N), Cond, LHS, RHS);
4543 }
4544
4545 bool Inv = false;
4546 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
4547 std::swap(LHS, RHS);
4548 Inv = true;
4549 }
4550
4551 // TODO: Support vector constants.
4552 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
4553 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
4554 !selectSupportsSourceMods(N.getNode())) {
4555 SDLoc SL(N);
4556 // If one side is an fneg/fabs and the other is a constant, we can push the
4557 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
4558 SDValue NewLHS = LHS.getOperand(0);
4559 SDValue NewRHS = RHS;
4560
4561 // Careful: if the neg can be folded up, don't try to pull it back down.
4562 bool ShouldFoldNeg = true;
4563
4564 if (NewLHS.hasOneUse()) {
4565 unsigned Opc = NewLHS.getOpcode();
4566 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode()))
4567 ShouldFoldNeg = false;
4568 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
4569 ShouldFoldNeg = false;
4570 }
4571
4572 if (ShouldFoldNeg) {
4573 if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
4574 return SDValue();
4575
4576 // We're going to be forced to use a source modifier anyway, there's no
4577 // point to pulling the negate out unless we can get a size reduction by
4578 // negating the constant.
4579 //
4580 // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
4581 // about cheaper constants.
4582 if (NewLHS.getOpcode() == ISD::FABS &&
4583 getConstantNegateCost(CRHS) != NegatibleCost::Cheaper)
4584 return SDValue();
4585
4586 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
4587 return SDValue();
4588
4589 if (LHS.getOpcode() == ISD::FNEG)
4590 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4591
4592 if (Inv)
4593 std::swap(NewLHS, NewRHS);
4594
4595 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
4596 Cond, NewLHS, NewRHS);
4597 DCI.AddToWorklist(NewSelect.getNode());
4598 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
4599 }
4600 }
4601
4602 return SDValue();
4603}
4604
4605 SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
4606 DAGCombinerInfo &DCI) const {
4607 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
4608 return Folded;
4609
4610 SDValue Cond = N->getOperand(0);
4611 if (Cond.getOpcode() != ISD::SETCC)
4612 return SDValue();
4613
4614 EVT VT = N->getValueType(0);
4615 SDValue LHS = Cond.getOperand(0);
4616 SDValue RHS = Cond.getOperand(1);
4617 SDValue CC = Cond.getOperand(2);
4618
4619 SDValue True = N->getOperand(1);
4620 SDValue False = N->getOperand(2);
4621
4622 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
4623 SelectionDAG &DAG = DCI.DAG;
4624 if (DAG.isConstantValueOfAnyType(True) &&
4625 !DAG.isConstantValueOfAnyType(False)) {
4626 // Swap cmp + select pair to move constant to false input.
4627 // This will allow using VOPC cndmasks more often.
4628 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
4629
4630 SDLoc SL(N);
4631 ISD::CondCode NewCC =
4632 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
4633
4634 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
4635 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
4636 }
4637
4638 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
4639 SDValue MinMax
4640 = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
4641 // Revisit this node so we can catch min3/max3/med3 patterns.
4642 //DCI.AddToWorklist(MinMax.getNode());
4643 return MinMax;
4644 }
4645 }
4646
4647 // There's no reason not to do this if the condition has other uses.
4648 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
4649}
4650
4651static bool isInv2Pi(const APFloat &APF) {
4652 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
4653 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
4654 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
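// Each constant above is the bit pattern of 1/(2*pi) ~= 0.15915494 in the
// corresponding floating-point format.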
4655
4656 return APF.bitwiseIsEqual(KF16) ||
4657 APF.bitwiseIsEqual(KF32) ||
4658 APF.bitwiseIsEqual(KF64);
4659}
4660
4661 // 0 and 1.0 / (0.5 * pi) do not have inline immediates, so there is an
4662// additional cost to negate them.
4663 TargetLowering::NegatibleCost
4664 AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const {
4665 if (C->isZero())
4666 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4667
4668 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
4669 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4670
4671 return NegatibleCost::Neutral;
4672}
4673
4674 bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
4675 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
4676 return getConstantNegateCost(C) == NegatibleCost::Expensive;
4677 return false;
4678}
4679
4680 bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const {
4681 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
4682 return getConstantNegateCost(C) == NegatibleCost::Cheaper;
4683 return false;
4684}
4685
4686static unsigned inverseMinMax(unsigned Opc) {
4687 switch (Opc) {
4688 case ISD::FMAXNUM:
4689 return ISD::FMINNUM;
4690 case ISD::FMINNUM:
4691 return ISD::FMAXNUM;
4692 case ISD::FMAXNUM_IEEE:
4693 return ISD::FMINNUM_IEEE;
4694 case ISD::FMINNUM_IEEE:
4695 return ISD::FMAXNUM_IEEE;
4696 case ISD::FMAXIMUM:
4697 return ISD::FMINIMUM;
4698 case ISD::FMINIMUM:
4699 return ISD::FMAXIMUM;
4700 case AMDGPUISD::FMAX_LEGACY:
4701 return AMDGPUISD::FMIN_LEGACY;
4702 case AMDGPUISD::FMIN_LEGACY:
4703 return AMDGPUISD::FMAX_LEGACY;
4704 default:
4705 llvm_unreachable("invalid min/max opcode");
4706 }
4707}
4708
4709/// \return true if it's profitable to try to push an fneg into its source
4710/// instruction.
4711 static bool shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) {
4712 // If the input has multiple uses and we can either fold the negate down, or
4713 // the other uses cannot, give up. This both prevents unprofitable
4714 // transformations and infinite loops: we won't repeatedly try to fold around
4715 // a negate that has no 'good' form.
4716 if (N0.hasOneUse()) {
4717 // This may be able to fold into the source, but at a code size cost. Don't
4718 // fold if the fold into the user is free.
4719 if (allUsesHaveSourceMods(N, 0))
4720 return false;
4721 } else {
4722 if (fnegFoldsIntoOp(N0.getNode()) &&
4723 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
4724 return false;
4725 }
4726
4727 return true;
4728}
4729
4730 SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
4731 DAGCombinerInfo &DCI) const {
4732 SelectionDAG &DAG = DCI.DAG;
4733 SDValue N0 = N->getOperand(0);
4734 EVT VT = N->getValueType(0);
4735
4736 unsigned Opc = N0.getOpcode();
4737
4738 if (!shouldFoldFNegIntoSrc(N, N0))
4739 return SDValue();
4740
4741 SDLoc SL(N);
4742 switch (Opc) {
4743 case ISD::FADD: {
4744 if (!mayIgnoreSignedZero(N0))
4745 return SDValue();
4746
4747 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
4748 SDValue LHS = N0.getOperand(0);
4749 SDValue RHS = N0.getOperand(1);
4750
4751 if (LHS.getOpcode() != ISD::FNEG)
4752 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
4753 else
4754 LHS = LHS.getOperand(0);
4755
4756 if (RHS.getOpcode() != ISD::FNEG)
4757 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4758 else
4759 RHS = RHS.getOperand(0);
4760
4761 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
4762 if (Res.getOpcode() != ISD::FADD)
4763 return SDValue(); // Op got folded away.
4764 if (!N0.hasOneUse())
4765 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4766 return Res;
4767 }
4768 case ISD::FMUL:
4769 case AMDGPUISD::FMUL_LEGACY: {
4770 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
4771 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
4772 SDValue LHS = N0.getOperand(0);
4773 SDValue RHS = N0.getOperand(1);
4774
4775 if (LHS.getOpcode() == ISD::FNEG)
4776 LHS = LHS.getOperand(0);
4777 else if (RHS.getOpcode() == ISD::FNEG)
4778 RHS = RHS.getOperand(0);
4779 else
4780 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4781
4782 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
4783 if (Res.getOpcode() != Opc)
4784 return SDValue(); // Op got folded away.
4785 if (!N0.hasOneUse())
4786 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4787 return Res;
4788 }
4789 case ISD::FMA:
4790 case ISD::FMAD: {
4791 // TODO: handle llvm.amdgcn.fma.legacy
4792 if (!mayIgnoreSignedZero(N0))
4793 return SDValue();
4794
4795 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
4796 SDValue LHS = N0.getOperand(0);
4797 SDValue MHS = N0.getOperand(1);
4798 SDValue RHS = N0.getOperand(2);
4799
4800 if (LHS.getOpcode() == ISD::FNEG)
4801 LHS = LHS.getOperand(0);
4802 else if (MHS.getOpcode() == ISD::FNEG)
4803 MHS = MHS.getOperand(0);
4804 else
4805 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
4806
4807 if (RHS.getOpcode() != ISD::FNEG)
4808 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4809 else
4810 RHS = RHS.getOperand(0);
4811
4812 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
4813 if (Res.getOpcode() != Opc)
4814 return SDValue(); // Op got folded away.
4815 if (!N0.hasOneUse())
4816 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4817 return Res;
4818 }
4819 case ISD::FMAXNUM:
4820 case ISD::FMINNUM:
4821 case ISD::FMAXNUM_IEEE:
4822 case ISD::FMINNUM_IEEE:
4823 case ISD::FMINIMUM:
4824 case ISD::FMAXIMUM:
4825 case AMDGPUISD::FMAX_LEGACY:
4826 case AMDGPUISD::FMIN_LEGACY: {
4827 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
4828 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
4829 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
4830 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
4831
4832 SDValue LHS = N0.getOperand(0);
4833 SDValue RHS = N0.getOperand(1);
4834
4835 // 0 doesn't have a negated inline immediate.
4836 // TODO: This constant check should be generalized to other operations.
4837 if (isConstantCostlierToNegate(RHS))
4838 return SDValue();
4839
4840 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
4841 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4842 unsigned Opposite = inverseMinMax(Opc);
4843
4844 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
4845 if (Res.getOpcode() != Opposite)
4846 return SDValue(); // Op got folded away.
4847 if (!N0.hasOneUse())
4848 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4849 return Res;
4850 }
4851 case AMDGPUISD::FMED3: {
4852 SDValue Ops[3];
4853 for (unsigned I = 0; I < 3; ++I)
4854 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
4855
4856 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
4857 if (Res.getOpcode() != AMDGPUISD::FMED3)
4858 return SDValue(); // Op got folded away.
4859
4860 if (!N0.hasOneUse()) {
4861 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
4862 DAG.ReplaceAllUsesWith(N0, Neg);
4863
4864 for (SDNode *U : Neg->uses())
4865 DCI.AddToWorklist(U);
4866 }
4867
4868 return Res;
4869 }
4870 case ISD::FP_EXTEND:
4871 case ISD::FTRUNC:
4872 case ISD::FRINT:
4873 case ISD::FNEARBYINT: // XXX - Should fround be handled?
4874 case ISD::FROUNDEVEN:
4875 case ISD::FSIN:
4876 case ISD::FCANONICALIZE:
4877 case AMDGPUISD::RCP:
4878 case AMDGPUISD::RCP_LEGACY:
4879 case AMDGPUISD::RCP_IFLAG:
4880 case AMDGPUISD::SIN_HW: {
4881 SDValue CvtSrc = N0.getOperand(0);
4882 if (CvtSrc.getOpcode() == ISD::FNEG) {
4883 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
4884 // (fneg (rcp (fneg x))) -> (rcp x)
4885 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
4886 }
4887
4888 if (!N0.hasOneUse())
4889 return SDValue();
4890
4891 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
4892 // (fneg (rcp x)) -> (rcp (fneg x))
4893 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
4894 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
4895 }
4896 case ISD::FP_ROUND: {
4897 SDValue CvtSrc = N0.getOperand(0);
4898
4899 if (CvtSrc.getOpcode() == ISD::FNEG) {
4900 // (fneg (fp_round (fneg x))) -> (fp_round x)
4901 return DAG.getNode(ISD::FP_ROUND, SL, VT,
4902 CvtSrc.getOperand(0), N0.getOperand(1));
4903 }
4904
4905 if (!N0.hasOneUse())
4906 return SDValue();
4907
4908 // (fneg (fp_round x)) -> (fp_round (fneg x))
4909 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
4910 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
4911 }
4912 case ISD::FP16_TO_FP: {
4913 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
4914 // f16, but legalization of f16 fneg ends up pulling it out of the source.
4915 // Put the fneg back as a legal source operation that can be matched later.
4916 SDLoc SL(N);
4917
4918 SDValue Src = N0.getOperand(0);
4919 EVT SrcVT = Src.getValueType();
4920
4921 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
4922 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
4923 DAG.getConstant(0x8000, SL, SrcVT));
4924 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
4925 }
4926 case ISD::SELECT: {
4927 // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
4928 // TODO: Invert conditions of foldFreeOpFromSelect
4929 return SDValue();
4930 }
4931 case ISD::BITCAST: {
4932 SDLoc SL(N);
4933 SDValue BCSrc = N0.getOperand(0);
4934 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
4935 SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1);
4936 if (HighBits.getValueType().getSizeInBits() != 32 ||
4937 !fnegFoldsIntoOp(HighBits.getNode()))
4938 return SDValue();
4939
4940 // f64 fneg only really needs to operate on the high half of the
4941 // register, so try to force it to an f32 operation to help make use of
4942 // source modifiers.
4943 //
4944 //
4945 // fneg (f64 (bitcast (build_vector x, y))) ->
4946 // f64 (bitcast (build_vector (bitcast i32:x to f32),
4947 // (fneg (bitcast i32:y to f32)))
4948
4949 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits);
4950 SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi);
4951 SDValue CastBack =
4952 DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi);
4953
4954 SmallVector<SDValue, 8> Ops(BCSrc->op_begin(), BCSrc->op_end());
4955 Ops.back() = CastBack;
4956 DCI.AddToWorklist(NegHi.getNode());
4957 SDValue Build =
4958 DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops);
4959 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build);
4960
4961 if (!N0.hasOneUse())
4962 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result));
4963 return Result;
4964 }
4965
4966 if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
4967 BCSrc.hasOneUse()) {
4968 // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
4969 // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)
4970
4971 // TODO: Cast back result for multiple uses is beneficial in some cases.
4972
4973 SDValue LHS =
4974 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));
4975 SDValue RHS =
4976 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2));
4977
4978 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);
4979 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);
4980
4981 return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS,
4982 NegRHS);
4983 }
4984
4985 return SDValue();
4986 }
4987 default:
4988 return SDValue();
4989 }
4990}
4991
4992 SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
4993 DAGCombinerInfo &DCI) const {
4994 SelectionDAG &DAG = DCI.DAG;
4995 SDValue N0 = N->getOperand(0);
4996
4997 if (!N0.hasOneUse())
4998 return SDValue();
4999
5000 switch (N0.getOpcode()) {
5001 case ISD::FP16_TO_FP: {
5002 assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
5003 SDLoc SL(N);
5004 SDValue Src = N0.getOperand(0);
5005 EVT SrcVT = Src.getValueType();
5006
5007 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
5008 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
5009 DAG.getConstant(0x7fff, SL, SrcVT));
5010 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
5011 }
5012 default:
5013 return SDValue();
5014 }
5015}
5016
5017 SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
5018 DAGCombinerInfo &DCI) const {
5019 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
5020 if (!CFP)
5021 return SDValue();
5022
5023 // XXX - Should this flush denormals?
5024 const APFloat &Val = CFP->getValueAPF();
5025 APFloat One(Val.getSemantics(), "1.0");
5026 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
5027}
5028
5029 SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
5030 DAGCombinerInfo &DCI) const {
5031 SelectionDAG &DAG = DCI.DAG;
5032 SDLoc DL(N);
5033
5034 switch(N->getOpcode()) {
5035 default:
5036 break;
5037 case ISD::BITCAST: {
5038 EVT DestVT = N->getValueType(0);
5039
5040 // Push casts through vector builds. This helps avoid emitting a large
5041 // number of copies when materializing floating point vector constants.
5042 //
5043 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
5044 // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
5045 if (DestVT.isVector()) {
5046 SDValue Src = N->getOperand(0);
5047 if (Src.getOpcode() == ISD::BUILD_VECTOR &&
5048 (DCI.getDAGCombineLevel() < AfterLegalizeDAG ||
5049 isOperationLegal(ISD::BUILD_VECTOR, DestVT))) {
5050 EVT SrcVT = Src.getValueType();
5051 unsigned NElts = DestVT.getVectorNumElements();
5052
5053 if (SrcVT.getVectorNumElements() == NElts) {
5054 EVT DestEltVT = DestVT.getVectorElementType();
5055
5056 SmallVector<SDValue, 8> CastedElts;
5057 SDLoc SL(N);
5058 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
5059 SDValue Elt = Src.getOperand(I);
5060 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
5061 }
5062
5063 return DAG.getBuildVector(DestVT, SL, CastedElts);
5064 }
5065 }
5066 }
5067
5068 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
5069 break;
5070
5071 // Fold bitcasts of constants.
5072 //
5073 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
5074 // TODO: Generalize and move to DAGCombiner
5075 SDValue Src = N->getOperand(0);
5076 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
5077 SDLoc SL(N);
5078 uint64_t CVal = C->getZExtValue();
5079 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5080 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5081 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5082 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
5083 }
5084
5085 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
5086 const APInt &Val = C->getValueAPF().bitcastToAPInt();
5087 SDLoc SL(N);
5088 uint64_t CVal = Val.getZExtValue();
5089 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5090 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5091 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5092
5093 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
5094 }
5095
5096 break;
5097 }
5098 case ISD::SHL: {
5099 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5100 break;
5101
5102 return performShlCombine(N, DCI);
5103 }
5104 case ISD::SRL: {
5105 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5106 break;
5107
5108 return performSrlCombine(N, DCI);
5109 }
5110 case ISD::SRA: {
5111 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5112 break;
5113
5114 return performSraCombine(N, DCI);
5115 }
5116 case ISD::TRUNCATE:
5117 return performTruncateCombine(N, DCI);
5118 case ISD::MUL:
5119 return performMulCombine(N, DCI);
5120 case AMDGPUISD::MUL_U24:
5121 case AMDGPUISD::MUL_I24: {
5122 if (SDValue Simplified = simplifyMul24(N, DCI))
5123 return Simplified;
5124 break;
5125 }
5126 case AMDGPUISD::MULHI_I24:
5127 case AMDGPUISD::MULHI_U24:
5128 return simplifyMul24(N, DCI);
5129 case ISD::SMUL_LOHI:
5130 case ISD::UMUL_LOHI:
5131 return performMulLoHiCombine(N, DCI);
5132 case ISD::MULHS:
5133 return performMulhsCombine(N, DCI);
5134 case ISD::MULHU:
5135 return performMulhuCombine(N, DCI);
5136 case ISD::SELECT:
5137 return performSelectCombine(N, DCI);
5138 case ISD::FNEG:
5139 return performFNegCombine(N, DCI);
5140 case ISD::FABS:
5141 return performFAbsCombine(N, DCI);
5142 case AMDGPUISD::BFE_I32:
5143 case AMDGPUISD::BFE_U32: {
5144 assert(!N->getValueType(0).isVector() &&
5145 "Vector handling of BFE not implemented");
5146 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
5147 if (!Width)
5148 break;
5149
5150 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
5151 if (WidthVal == 0)
5152 return DAG.getConstant(0, DL, MVT::i32);
5153
5154 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
5155 if (!Offset)
5156 break;
5157
5158 SDValue BitsFrom = N->getOperand(0);
5159 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
5160
5161 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
5162
5163 if (OffsetVal == 0) {
5164 // This is already sign / zero extended, so try to fold away extra BFEs.
5165 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
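// Example (illustrative): a bfe_i32 of width 8 at offset 0 produces a value
// with 32 - 8 + 1 == 25 sign bits, so if the source operand already has at
// least 25 known sign bits the extract below folds away entirely.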
5166
5167 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
5168 if (OpSignBits >= SignBits)
5169 return BitsFrom;
5170
5171 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
5172 if (Signed) {
5173 // This is a sign_extend_inreg. Replace it to take advantage of existing
5174 // DAG Combines. If not eliminated, we will match back to BFE during
5175 // selection.
5176
5177 // TODO: The sext_inreg of extended types ends up here, although we could
5178 // handle them in a single BFE.
5179 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
5180 DAG.getValueType(SmallVT));
5181 }
5182
5183 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
5184 }
5185
5186 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
5187 if (Signed) {
5188 return constantFoldBFE<int32_t>(DAG,
5189 CVal->getSExtValue(),
5190 OffsetVal,
5191 WidthVal,
5192 DL);
5193 }
5194
5195 return constantFoldBFE<uint32_t>(DAG,
5196 CVal->getZExtValue(),
5197 OffsetVal,
5198 WidthVal,
5199 DL);
5200 }
5201
5202 if ((OffsetVal + WidthVal) >= 32 &&
5203 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
5204 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
5205 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
5206 BitsFrom, ShiftVal);
5207 }
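// Example (illustrative): bfe_i32 x, 24, 8 selects the top byte, so outside
// the SDWA 16/16 special case it reduces to sra x, 24 here, and the unsigned
// form bfe_u32 x, 24, 8 becomes srl x, 24.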
5208
5209 if (BitsFrom.hasOneUse()) {
5210 APInt Demanded = APInt::getBitsSet(32,
5211 OffsetVal,
5212 OffsetVal + WidthVal);
5213
5214 KnownBits Known;
5215 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
5216 !DCI.isBeforeLegalizeOps());
5217 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5218 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
5219 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
5220 DCI.CommitTargetLoweringOpt(TLO);
5221 }
5222 }
5223
5224 break;
5225 }
5226 case ISD::LOAD:
5227 return performLoadCombine(N, DCI);
5228 case ISD::STORE:
5229 return performStoreCombine(N, DCI);
5230 case AMDGPUISD::RCP:
5231 case AMDGPUISD::RCP_IFLAG:
5232 return performRcpCombine(N, DCI);
5233 case ISD::AssertZext:
5234 case ISD::AssertSext:
5235 return performAssertSZExtCombine(N, DCI);
5236 case ISD::INTRINSIC_WO_CHAIN:
5237 return performIntrinsicWOChainCombine(N, DCI);
5238 case AMDGPUISD::FMAD_FTZ: {
5239 SDValue N0 = N->getOperand(0);
5240 SDValue N1 = N->getOperand(1);
5241 SDValue N2 = N->getOperand(2);
5242 EVT VT = N->getValueType(0);
5243
5244 // FMAD_FTZ is an FMAD that flushes denormals to zero.
5245 // We flush the inputs, the intermediate step, and the output.
5246 ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
5247 ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
5248 ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
5249 if (N0CFP && N1CFP && N2CFP) {
5250 const auto FTZ = [](const APFloat &V) {
5251 if (V.isDenormal()) {
5252 APFloat Zero(V.getSemantics(), 0);
5253 return V.isNegative() ? -Zero : Zero;
5254 }
5255 return V;
5256 };
5257
5258 APFloat V0 = FTZ(N0CFP->getValueAPF());
5259 APFloat V1 = FTZ(N1CFP->getValueAPF());
5260 APFloat V2 = FTZ(N2CFP->getValueAPF());
5261 V0.multiply(V1, APFloat::rmNearestTiesToEven);
5262 V0 = FTZ(V0);
5263 V0.add(V2, APFloat::rmNearestTiesToEven);
5264 return DAG.getConstantFP(FTZ(V0), DL, VT);
5265 }
5266 break;
5267 }
5268 }
5269 return SDValue();
5270}
5271
5272//===----------------------------------------------------------------------===//
5273// Helper functions
5274//===----------------------------------------------------------------------===//
5275
5276 SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
5277 const TargetRegisterClass *RC,
5278 Register Reg, EVT VT,
5279 const SDLoc &SL,
5280 bool RawReg) const {
5281 MachineFunction &MF = DAG.getMachineFunction();
5282 MachineRegisterInfo &MRI = MF.getRegInfo();
5283 Register VReg;
5284
5285 if (!MRI.isLiveIn(Reg)) {
5286 VReg = MRI.createVirtualRegister(RC);
5287 MRI.addLiveIn(Reg, VReg);
5288 } else {
5289 VReg = MRI.getLiveInVirtReg(Reg);
5290 }
5291
5292 if (RawReg)
5293 return DAG.getRegister(VReg, VT);
5294
5295 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
5296}
5297
5298// This may be called multiple times, and nothing prevents creating multiple
5299// objects at the same offset. See if we already defined this object.
5300 static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
5301 int64_t Offset) {
5302 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
5303 if (MFI.getObjectOffset(I) == Offset) {
5304 assert(MFI.getObjectSize(I) == Size);
5305 return I;
5306 }
5307 }
5308
5309 return MFI.CreateFixedObject(Size, Offset, true);
5310}
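// Usage sketch (hypothetical values): repeated queries for the same offset
// reuse one fixed frame index instead of creating a duplicate object:
//   int FI0 = getOrCreateFixedStackObject(MFI, 4, 16);
//   int FI1 = getOrCreateFixedStackObject(MFI, 4, 16); // FI1 == FI0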
5311
5312 SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
5313 EVT VT,
5314 const SDLoc &SL,
5315 int64_t Offset) const {
5316 MachineFunction &MF = DAG.getMachineFunction();
5317 MachineFrameInfo &MFI = MF.getFrameInfo();
5318 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
5319
5320 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
5321 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
5322
5323 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
5324 MachineMemOperand::MODereferenceable |
5325 MachineMemOperand::MOInvariant);
5326 }
5327
5328 SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
5329 const SDLoc &SL,
5330 SDValue Chain,
5331 SDValue ArgVal,
5332 int64_t Offset) const {
5333 MachineFunction &MF = DAG.getMachineFunction();
5334 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5335 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
5336
5337 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
5338 // Stores to the argument stack area are relative to the stack pointer.
5339 SDValue SP =
5340 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
5341 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
5342 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
5343 MachineMemOperand::MODereferenceable);
5344 return Store;
5345}
5346
5347 SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
5348 const TargetRegisterClass *RC,
5349 EVT VT, const SDLoc &SL,
5350 const ArgDescriptor &Arg) const {
5351 assert(Arg && "Attempting to load missing argument");
5352
5353 SDValue V = Arg.isRegister() ?
5354 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
5355 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
5356
5357 if (!Arg.isMasked())
5358 return V;
5359
5360 unsigned Mask = Arg.getMask();
5361 unsigned Shift = llvm::countr_zero<unsigned>(Mask);
5362 V = DAG.getNode(ISD::SRL, SL, VT, V,
5363 DAG.getShiftAmountConstant(Shift, VT, SL));
5364 return DAG.getNode(ISD::AND, SL, VT, V,
5365 DAG.getConstant(Mask >> Shift, SL, VT));
5366}
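// Decode sketch for a masked argument (hypothetical mask value): with
// Arg.getMask() == 0x3ff0, countr_zero gives Shift == 4, so the packed
// input is recovered as V = (V >> 4) & 0x3ff by the SRL/AND pair above.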
5367
5368 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5369 uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
5370 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
5371 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
5372 uint64_t ArgOffset =
5373 alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset;
5374 switch (Param) {
5375 case FIRST_IMPLICIT:
5376 return ArgOffset;
5377 case PRIVATE_BASE:
5378 return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
5379 case SHARED_BASE:
5380 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
5381 case QUEUE_PTR:
5382 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
5383 }
5384 llvm_unreachable("unexpected implicit parameter type");
5385}
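// Worked arithmetic (hypothetical sizes): with ExplicitKernArgSize == 36 and
// an 8-byte implicit-argument alignment, alignTo(36, 8) == 40, so
// FIRST_IMPLICIT resolves to ExplicitArgOffset + 40 and the named parameters
// sit at their fixed byte offsets beyond that point.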
5386
5387 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5388 const MachineFunction &MF, const ImplicitParameter Param) const {
5389 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
5390 return getImplicitParameterOffset(MFI->getExplicitKernArgSize(), Param);
5391 }
5392
5393#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
5394
5395const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
5396 switch ((AMDGPUISD::NodeType)Opcode) {
5397 case AMDGPUISD::FIRST_NUMBER: break;
5398 // AMDIL DAG nodes
5399 NODE_NAME_CASE(UMUL);
5400 NODE_NAME_CASE(BRANCH_COND);
5401
5402 // AMDGPU DAG nodes
5403 NODE_NAME_CASE(IF)
5404 NODE_NAME_CASE(ELSE)
5405 NODE_NAME_CASE(LOOP)
5406 NODE_NAME_CASE(CALL)
5407 NODE_NAME_CASE(TC_RETURN)
5408 NODE_NAME_CASE(TC_RETURN_GFX)
5409 NODE_NAME_CASE(TC_RETURN_CHAIN)
5410 NODE_NAME_CASE(TRAP)
5411 NODE_NAME_CASE(RET_GLUE)
5412 NODE_NAME_CASE(WAVE_ADDRESS)
5413 NODE_NAME_CASE(RETURN_TO_EPILOG)
5414 NODE_NAME_CASE(ENDPGM)
5415 NODE_NAME_CASE(ENDPGM_TRAP)
5416 NODE_NAME_CASE(SIMULATED_TRAP)
5417 NODE_NAME_CASE(DWORDADDR)
5418 NODE_NAME_CASE(FRACT)
5419 NODE_NAME_CASE(SETCC)
5420 NODE_NAME_CASE(SETREG)
5421 NODE_NAME_CASE(DENORM_MODE)
5422 NODE_NAME_CASE(FMA_W_CHAIN)
5423 NODE_NAME_CASE(FMUL_W_CHAIN)
5424 NODE_NAME_CASE(CLAMP)
5425 NODE_NAME_CASE(COS_HW)
5426 NODE_NAME_CASE(SIN_HW)
5427 NODE_NAME_CASE(FMAX_LEGACY)
5428 NODE_NAME_CASE(FMIN_LEGACY)
5429 NODE_NAME_CASE(FMAX3)
5430 NODE_NAME_CASE(SMAX3)
5431 NODE_NAME_CASE(UMAX3)
5432 NODE_NAME_CASE(FMIN3)
5433 NODE_NAME_CASE(SMIN3)
5434 NODE_NAME_CASE(UMIN3)
5435 NODE_NAME_CASE(FMED3)
5436 NODE_NAME_CASE(SMED3)
5437 NODE_NAME_CASE(UMED3)
5438 NODE_NAME_CASE(FMAXIMUM3)
5439 NODE_NAME_CASE(FMINIMUM3)
5440 NODE_NAME_CASE(FDOT2)
5441 NODE_NAME_CASE(URECIP)
5442 NODE_NAME_CASE(DIV_SCALE)
5443 NODE_NAME_CASE(DIV_FMAS)
5444 NODE_NAME_CASE(DIV_FIXUP)
5445 NODE_NAME_CASE(FMAD_FTZ)
5446 NODE_NAME_CASE(RCP)
5447 NODE_NAME_CASE(RSQ)
5448 NODE_NAME_CASE(RCP_LEGACY)
5449 NODE_NAME_CASE(RCP_IFLAG)
5450 NODE_NAME_CASE(LOG)
5451 NODE_NAME_CASE(EXP)
5452 NODE_NAME_CASE(FMUL_LEGACY)
5453 NODE_NAME_CASE(RSQ_CLAMP)
5454 NODE_NAME_CASE(FP_CLASS)
5455 NODE_NAME_CASE(DOT4)
5456 NODE_NAME_CASE(CARRY)
5457 NODE_NAME_CASE(BORROW)
5458 NODE_NAME_CASE(BFE_U32)
5459 NODE_NAME_CASE(BFE_I32)
5460 NODE_NAME_CASE(BFI)
5461 NODE_NAME_CASE(BFM)
5462 NODE_NAME_CASE(FFBH_U32)
5463 NODE_NAME_CASE(FFBH_I32)
5464 NODE_NAME_CASE(FFBL_B32)
5465 NODE_NAME_CASE(MUL_U24)
5466 NODE_NAME_CASE(MUL_I24)
5467 NODE_NAME_CASE(MULHI_U24)
5468 NODE_NAME_CASE(MULHI_I24)
5469 NODE_NAME_CASE(MAD_U24)
5470 NODE_NAME_CASE(MAD_I24)
5471 NODE_NAME_CASE(MAD_I64_I32)
5472 NODE_NAME_CASE(MAD_U64_U32)
5473 NODE_NAME_CASE(PERM)
5474 NODE_NAME_CASE(TEXTURE_FETCH)
5475 NODE_NAME_CASE(R600_EXPORT)
5476 NODE_NAME_CASE(CONST_ADDRESS)
5477 NODE_NAME_CASE(REGISTER_LOAD)
5478 NODE_NAME_CASE(REGISTER_STORE)
5479 NODE_NAME_CASE(SAMPLE)
5480 NODE_NAME_CASE(SAMPLEB)
5481 NODE_NAME_CASE(SAMPLED)
5482 NODE_NAME_CASE(SAMPLEL)
5483 NODE_NAME_CASE(CVT_F32_UBYTE0)
5484 NODE_NAME_CASE(CVT_F32_UBYTE1)
5485 NODE_NAME_CASE(CVT_F32_UBYTE2)
5486 NODE_NAME_CASE(CVT_F32_UBYTE3)
5487 NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
5488 NODE_NAME_CASE(CVT_PKNORM_I16_F32)
5489 NODE_NAME_CASE(CVT_PKNORM_U16_F32)
5490 NODE_NAME_CASE(CVT_PK_I16_I32)
5491 NODE_NAME_CASE(CVT_PK_U16_U32)
5492 NODE_NAME_CASE(FP_TO_FP16)
5493 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
5494 NODE_NAME_CASE(CONST_DATA_PTR)
5495 NODE_NAME_CASE(PC_ADD_REL_OFFSET)
5496 NODE_NAME_CASE(LDS)
5497 NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD)
5498 NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD)
5499 NODE_NAME_CASE(DUMMY_CHAIN)
5500 case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
5501 NODE_NAME_CASE(LOAD_D16_HI)
5502 NODE_NAME_CASE(LOAD_D16_LO)
5503 NODE_NAME_CASE(LOAD_D16_HI_I8)
5504 NODE_NAME_CASE(LOAD_D16_HI_U8)
5505 NODE_NAME_CASE(LOAD_D16_LO_I8)
5506 NODE_NAME_CASE(LOAD_D16_LO_U8)
5507 NODE_NAME_CASE(STORE_MSKOR)
5508 NODE_NAME_CASE(LOAD_CONSTANT)
5509 NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
5510 NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
5511 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
5512 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
5513 NODE_NAME_CASE(DS_ORDERED_COUNT)
5514 NODE_NAME_CASE(ATOMIC_CMP_SWAP)
5515 NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
5516 NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
5517 NODE_NAME_CASE(BUFFER_LOAD)
5518 NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
5519 NODE_NAME_CASE(BUFFER_LOAD_USHORT)
5520 NODE_NAME_CASE(BUFFER_LOAD_BYTE)
5521 NODE_NAME_CASE(BUFFER_LOAD_SHORT)
5522 NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
5523 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE)
5524 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
5525 NODE_NAME_CASE(SBUFFER_LOAD)
5526 NODE_NAME_CASE(SBUFFER_LOAD_BYTE)
5527 NODE_NAME_CASE(SBUFFER_LOAD_UBYTE)
5528 NODE_NAME_CASE(SBUFFER_LOAD_SHORT)
5529 NODE_NAME_CASE(SBUFFER_LOAD_USHORT)
5530 NODE_NAME_CASE(BUFFER_STORE)
5531 NODE_NAME_CASE(BUFFER_STORE_BYTE)
5532 NODE_NAME_CASE(BUFFER_STORE_SHORT)
5533 NODE_NAME_CASE(BUFFER_STORE_FORMAT)
5534 NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
5535 NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
5536 NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
5537 NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
5538 NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
5539 NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
5540 NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
5541 NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
5542 NODE_NAME_CASE(BUFFER_ATOMIC_AND)
5543 NODE_NAME_CASE(BUFFER_ATOMIC_OR)
5544 NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
5545 NODE_NAME_CASE(BUFFER_ATOMIC_INC)
5546 NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
5547 NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
5548 NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
5549 NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
5550 NODE_NAME_CASE(BUFFER_ATOMIC_FADD_BF16)
5551 NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
5552 NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
5553 NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32)
5554
5555 case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
5556 }
5557 return nullptr;
5558}
5559
5560 SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
5561 SelectionDAG &DAG, int Enabled,
5562 int &RefinementSteps,
5563 bool &UseOneConstNR,
5564 bool Reciprocal) const {
5565 EVT VT = Operand.getValueType();
5566
5567 if (VT == MVT::f32) {
5568 RefinementSteps = 0;
5569 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
5570 }
5571
5572 // TODO: There is also an f64 rsq instruction, but the documentation is less
5573 // clear on its precision.
5574
5575 return SDValue();
5576}
5577
5578 SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
5579 SelectionDAG &DAG, int Enabled,
5580 int &RefinementSteps) const {
5581 EVT VT = Operand.getValueType();
5582
5583 if (VT == MVT::f32) {
5584 // Reciprocal, < 1 ulp error.
5585 //
5586 // This reciprocal approximation converges to < 0.5 ulp error with one
5587 // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).
5588
5589 RefinementSteps = 0;
5590 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
5591 }
5592
5593 // TODO: There is also an f64 rcp instruction, but the documentation is less
5594 // clear on its precision.
5595
5596 return SDValue();
5597}
5598
5599static unsigned workitemIntrinsicDim(unsigned ID) {
5600 switch (ID) {
5601 case Intrinsic::amdgcn_workitem_id_x:
5602 return 0;
5603 case Intrinsic::amdgcn_workitem_id_y:
5604 return 1;
5605 case Intrinsic::amdgcn_workitem_id_z:
5606 return 2;
5607 default:
5608 llvm_unreachable("not a workitem intrinsic");
5609 }
5610}
5611
5612 void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
5613 const SDValue Op, KnownBits &Known,
5614 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
5615
5616 Known.resetAll(); // Don't know anything.
5617
5618 unsigned Opc = Op.getOpcode();
5619
5620 switch (Opc) {
5621 default:
5622 break;
5623 case AMDGPUISD::CARRY:
5624 case AMDGPUISD::BORROW: {
5625 Known.Zero = APInt::getHighBitsSet(32, 31);
5626 break;
5627 }
5628
5629 case AMDGPUISD::BFE_I32:
5630 case AMDGPUISD::BFE_U32: {
5631 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5632 if (!CWidth)
5633 return;
5634
5635 uint32_t Width = CWidth->getZExtValue() & 0x1f;
5636
5637 if (Opc == AMDGPUISD::BFE_U32)
5638 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
5639
5640 break;
5641 }
5642 case AMDGPUISD::FP_TO_FP16: {
5643 unsigned BitWidth = Known.getBitWidth();
5644
5645 // High bits are zero.
5646 Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
5647 break;
5648 }
5649 case AMDGPUISD::MUL_U24:
5650 case AMDGPUISD::MUL_I24: {
5651 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5652 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5653 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
5654 RHSKnown.countMinTrailingZeros();
5655 Known.Zero.setLowBits(std::min(TrailZ, 32u));
5656 // Skip the extra checks below if all bits are already known zero.
5657 if (TrailZ >= 32)
5658 break;
5659
5660 // Truncate to 24 bits.
5661 LHSKnown = LHSKnown.trunc(24);
5662 RHSKnown = RHSKnown.trunc(24);
5663
5664 if (Opc == AMDGPUISD::MUL_I24) {
5665 unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
5666 unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
5667 unsigned MaxValBits = LHSValBits + RHSValBits;
5668 if (MaxValBits > 32)
5669 break;
5670 unsigned SignBits = 32 - MaxValBits + 1;
5671 bool LHSNegative = LHSKnown.isNegative();
5672 bool LHSNonNegative = LHSKnown.isNonNegative();
5673 bool LHSPositive = LHSKnown.isStrictlyPositive();
5674 bool RHSNegative = RHSKnown.isNegative();
5675 bool RHSNonNegative = RHSKnown.isNonNegative();
5676 bool RHSPositive = RHSKnown.isStrictlyPositive();
5677
5678 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
5679 Known.Zero.setHighBits(SignBits);
5680 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
5681 Known.One.setHighBits(SignBits);
5682 } else {
5683 unsigned LHSValBits = LHSKnown.countMaxActiveBits();
5684 unsigned RHSValBits = RHSKnown.countMaxActiveBits();
5685 unsigned MaxValBits = LHSValBits + RHSValBits;
5686 if (MaxValBits >= 32)
5687 break;
5688 Known.Zero.setBitsFrom(MaxValBits);
5689 }
5690 break;
5691 }
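// Example (illustrative): for mul_u24 with both operands known to be less
// than 2^12, MaxValBits == 24, so bits 24..31 of the 32-bit product are
// reported as known zero.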
5692 case AMDGPUISD::PERM: {
5693 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5694 if (!CMask)
5695 return;
5696
5697 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5698 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5699 unsigned Sel = CMask->getZExtValue();
5700
5701 for (unsigned I = 0; I < 32; I += 8) {
5702 unsigned SelBits = Sel & 0xff;
5703 if (SelBits < 4) {
5704 SelBits *= 8;
5705 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5706 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5707 } else if (SelBits < 7) {
5708 SelBits = (SelBits & 3) * 8;
5709 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5710 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5711 } else if (SelBits == 0x0c) {
5712 Known.Zero |= 0xFFull << I;
5713 } else if (SelBits > 0x0c) {
5714 Known.One |= 0xFFull << I;
5715 }
5716 Sel >>= 8;
5717 }
5718 break;
5719 }
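// Selector model used above (restating the code, not hardware docs): each
// byte of Sel picks one result byte; values 0-3 select a byte of operand 1,
// values 4-6 select a byte of operand 0, exactly 0x0c yields a known 0x00
// byte, anything above 0x0c yields a known 0xff byte, and 0x07-0x0b leave
// the byte unknown.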
5720 case AMDGPUISD::BUFFER_LOAD_UBYTE: {
5721 Known.Zero.setHighBits(24);
5722 break;
5723 }
5724 case AMDGPUISD::BUFFER_LOAD_USHORT: {
5725 Known.Zero.setHighBits(16);
5726 break;
5727 }
5728 case AMDGPUISD::LDS: {
5729 auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
5730 Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
5731
5732 Known.Zero.setHighBits(16);
5733 Known.Zero.setLowBits(Log2(Alignment));
5734 break;
5735 }
5736 case AMDGPUISD::SMIN3:
5737 case AMDGPUISD::SMAX3:
5738 case AMDGPUISD::SMED3:
5739 case AMDGPUISD::UMIN3:
5740 case AMDGPUISD::UMAX3:
5741 case AMDGPUISD::UMED3: {
5742 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
5743 if (Known2.isUnknown())
5744 break;
5745
5746 KnownBits Known1 = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5747 if (Known1.isUnknown())
5748 break;
5749
5750 KnownBits Known0 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5751 if (Known0.isUnknown())
5752 break;
5753
5754 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
5755 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
5756 Known.One = Known0.One & Known1.One & Known2.One;
5757 break;
5758 }
5759 case ISD::INTRINSIC_WO_CHAIN: {
5760 unsigned IID = Op.getConstantOperandVal(0);
5761 switch (IID) {
5762 case Intrinsic::amdgcn_workitem_id_x:
5763 case Intrinsic::amdgcn_workitem_id_y:
5764 case Intrinsic::amdgcn_workitem_id_z: {
5765 unsigned MaxValue = Subtarget->getMaxWorkitemID(
5766 DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID));
5767 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
5768 break;
5769 }
5770 default:
5771 break;
5772 }
5773 }
5774 }
5775}
5776
5777 unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
5778 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
5779 unsigned Depth) const {
5780 switch (Op.getOpcode()) {
5781 case AMDGPUISD::BFE_I32: {
5782 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5783 if (!Width)
5784 return 1;
5785
5786 unsigned SignBits = 32 - Width->getZExtValue() + 1;
5787 if (!isNullConstant(Op.getOperand(1)))
5788 return SignBits;
5789
5790 // TODO: Could probably figure something out with non-0 offsets.
5791 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
5792 return std::max(SignBits, Op0SignBits);
5793 }
5794
5795 case AMDGPUISD::BFE_U32: {
5796 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5797 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
5798 }
5799
5800 case AMDGPUISD::CARRY:
5801 case AMDGPUISD::BORROW:
5802 return 31;
5803 case AMDGPUISD::BUFFER_LOAD_BYTE:
5804 return 25;
5805 case AMDGPUISD::BUFFER_LOAD_SHORT:
5806 return 17;
5807 case AMDGPUISD::BUFFER_LOAD_UBYTE:
5808 return 24;
5809 case AMDGPUISD::BUFFER_LOAD_USHORT:
5810 return 16;
5811 case AMDGPUISD::FP_TO_FP16:
5812 return 16;
5813 case AMDGPUISD::SMIN3:
5814 case AMDGPUISD::SMAX3:
5815 case AMDGPUISD::SMED3:
5816 case AMDGPUISD::UMIN3:
5817 case AMDGPUISD::UMAX3:
5818 case AMDGPUISD::UMED3: {
5819 unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(2), Depth + 1);
5820 if (Tmp2 == 1)
5821 return 1; // Early out.
5822
5823 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
5824 if (Tmp1 == 1)
5825 return 1; // Early out.
5826
5827 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
5828 if (Tmp0 == 1)
5829 return 1; // Early out.
5830
5831 return std::min(Tmp0, std::min(Tmp1, Tmp2));
5832 }
5833 default:
5834 return 1;
5835 }
5836}
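// Example (illustrative): bfe_i32 x, 0, 8 sign-extends an 8-bit field and so
// reports at least 32 - 8 + 1 == 25 sign bits; a zero-extending ubyte load
// reports 32 - 8 == 24, matching the constants in the switch above.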
5837
5838 unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
5839 GISelKnownBits &Analysis, Register R,
5840 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
5841 unsigned Depth) const {
5842 const MachineInstr *MI = MRI.getVRegDef(R);
5843 if (!MI)
5844 return 1;
5845
5846 // TODO: Check range metadata on MMO.
5847 switch (MI->getOpcode()) {
5848 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
5849 return 25;
5850 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
5851 return 17;
5852 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
5853 return 24;
5854 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
5855 return 16;
5856 case AMDGPU::G_AMDGPU_SMED3:
5857 case AMDGPU::G_AMDGPU_UMED3: {
5858 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
5859 unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1);
5860 if (Tmp2 == 1)
5861 return 1;
5862 unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1);
5863 if (Tmp1 == 1)
5864 return 1;
5865 unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1);
5866 if (Tmp0 == 1)
5867 return 1;
5868 return std::min(Tmp0, std::min(Tmp1, Tmp2));
5869 }
5870 default:
5871 return 1;
5872 }
5873}
5874
5875 bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
5876 const SelectionDAG &DAG,
5877 bool SNaN,
5878 unsigned Depth) const {
5879 unsigned Opcode = Op.getOpcode();
5880 switch (Opcode) {
5881 case AMDGPUISD::FMIN_LEGACY:
5882 case AMDGPUISD::FMAX_LEGACY: {
5883 if (SNaN)
5884 return true;
5885
5886 // TODO: Can check no nans on one of the operands for each one, but which
5887 // one?
5888 return false;
5889 }
5890 case AMDGPUISD::FMUL_LEGACY:
5891 case AMDGPUISD::CVT_PKRTZ_F16_F32: {
5892 if (SNaN)
5893 return true;
5894 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
5895 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
5896 }
5897 case AMDGPUISD::FMED3:
5898 case AMDGPUISD::FMIN3:
5899 case AMDGPUISD::FMAX3:
5900 case AMDGPUISD::FMINIMUM3:
5901 case AMDGPUISD::FMAXIMUM3:
5902 case AMDGPUISD::FMAD_FTZ: {
5903 if (SNaN)
5904 return true;
5905 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
5906 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
5907 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
5908 }
5909 case AMDGPUISD::CVT_F32_UBYTE0:
5910 case AMDGPUISD::CVT_F32_UBYTE1:
5911 case AMDGPUISD::CVT_F32_UBYTE2:
5912 case AMDGPUISD::CVT_F32_UBYTE3:
5913 return true;
5914
5915 case AMDGPUISD::RCP:
5916 case AMDGPUISD::RSQ:
5917 case AMDGPUISD::RCP_LEGACY:
5918 case AMDGPUISD::RSQ_CLAMP: {
5919 if (SNaN)
5920 return true;
5921
5922 // TODO: Need an is-known-positive check.
5923 return false;
5924 }
5925 case ISD::FLDEXP:
5926 case AMDGPUISD::FRACT: {
5927 if (SNaN)
5928 return true;
5929 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
5930 }
5931 case AMDGPUISD::DIV_SCALE:
5932 case AMDGPUISD::DIV_FMAS:
5933 case AMDGPUISD::DIV_FIXUP:
5934 // TODO: Refine on operands.
5935 return SNaN;
5936 case AMDGPUISD::SIN_HW:
5937 case AMDGPUISD::COS_HW: {
5938 // TODO: Need a check for infinity.
5939 return SNaN;
5940 }
5941 case ISD::INTRINSIC_WO_CHAIN: {
5942 unsigned IntrinsicID = Op.getConstantOperandVal(0);
5943 // TODO: Handle more intrinsics
5944 switch (IntrinsicID) {
5945 case Intrinsic::amdgcn_cubeid:
5946 return true;
5947
5948 case Intrinsic::amdgcn_frexp_mant: {
5949 if (SNaN)
5950 return true;
5951 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
5952 }
5953 case Intrinsic::amdgcn_cvt_pkrtz: {
5954 if (SNaN)
5955 return true;
5956 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
5957 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
5958 }
5959 case Intrinsic::amdgcn_rcp:
5960 case Intrinsic::amdgcn_rsq:
5961 case Intrinsic::amdgcn_rcp_legacy:
5962 case Intrinsic::amdgcn_rsq_legacy:
5963 case Intrinsic::amdgcn_rsq_clamp: {
5964 if (SNaN)
5965 return true;
5966
5967 // TODO: Need an is-known-positive check.
5968 return false;
5969 }
5970 case Intrinsic::amdgcn_trig_preop:
5971 case Intrinsic::amdgcn_fdot2:
5972 // TODO: Refine on operand
5973 return SNaN;
5974 case Intrinsic::amdgcn_fma_legacy:
5975 if (SNaN)
5976 return true;
5977 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
5978 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
5979 DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
5980 default:
5981 return false;
5982 }
5983 }
5984 default:
5985 return false;
5986 }
5987}
5988
5989 bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
5990 Register N0, Register N1) const {
5991 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
5992}
5993
5994 TargetLowering::AtomicExpansionKind
5995 AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
5996 switch (RMW->getOperation()) {
5997 case AtomicRMWInst::Nand:
5998 case AtomicRMWInst::FAdd:
5999 case AtomicRMWInst::FSub:
6000 case AtomicRMWInst::FMax:
6001 case AtomicRMWInst::FMin:
6002 return AtomicExpansionKind::CmpXChg;
6003 case AtomicRMWInst::Xchg: {
6004 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
6005 unsigned ValSize = DL.getTypeSizeInBits(RMW->getType());
6006 if (ValSize == 32 || ValSize == 64)
6007 return AtomicExpansionKind::None;
6008 return AtomicExpansionKind::CmpXChg;
6009 }
6010 default: {
6011 if (auto *IntTy = dyn_cast<IntegerType>(RMW->getType())) {
6012 unsigned Size = IntTy->getBitWidth();
6013 if (Size == 32 || Size == 64)
6014 return AtomicExpansionKind::None;
6015 }
6016
6017 return AtomicExpansionKind::CmpXChg;
6018 }
6019 }
6020}
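// Effect sketch (illustrative IR): per the switch above, an 'atomicrmw nand'
// is expanded by the AtomicExpand pass into a compare-exchange loop, while a
// 32- or 64-bit integer 'atomicrmw xchg' or 'atomicrmw add' is left intact
// for native selection.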
6021
6022/// Whether it is profitable to sink the operands of an
6023/// Instruction I to the basic block of I.
6024/// This helps source modifiers (like abs and neg) be used more often.
6025 bool AMDGPUTargetLowering::shouldSinkOperands(
6026 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
6027 using namespace PatternMatch;
6028
6029 for (auto &Op : I->operands()) {
6030 // Ensure we are not already sinking this operand.
6031 if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); }))
6032 continue;
6033
6034 if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
6035 Ops.push_back(&Op);
6036 }
6037
6038 return !Ops.empty();
6039}
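// Example (illustrative): if t = fneg(x) is defined in another block and
// feeds I, reporting it here lets CodeGenPrepare sink the fneg next to I,
// where instruction selection can fold it into a source modifier (for
// instance a negated source operand on a VALU op) instead of a separate op.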
unsigned const MachineRegisterInfo * MRI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static LLVM_READONLY bool hasSourceMods(const MachineInstr &MI)
static bool isInv2Pi(const APFloat &APF)
static LLVM_READONLY bool opMustUseVOP3Encoding(const MachineInstr &MI, const MachineRegisterInfo &MRI)
returns true if the operation will definitely need to use a 64-bit encoding, and thus will use a VOP3...
static unsigned inverseMinMax(unsigned Opc)
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static unsigned workitemIntrinsicDim(unsigned ID)
static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size, int64_t Offset)
static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, uint32_t Width, const SDLoc &DL)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static SDValue getAddOneOp(const SDNode *V)
If V is an add of a constant 1, returns the other operand.
#define NODE_NAME_CASE(node)
static LLVM_READONLY bool selectSupportsSourceMods(const SDNode *N)
Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the type for ISD::SELECT.
static cl::opt< bool > AMDGPUBypassSlowDiv("amdgpu-bypass-slow-div", cl::desc("Skip 64-bit divide for dynamic 32-bit values"), cl::init(true))
static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL, SDValue N0, SDValue N1, unsigned Size, bool Signed)
static bool fnegFoldsIntoOp(const SDNode *N)
static bool isI24(SDValue Op, SelectionDAG &DAG)
static bool isCttzOpc(unsigned Opc)
static bool isU24(SDValue Op, SelectionDAG &DAG)
static SDValue peekFPSignOps(SDValue Val)
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI, unsigned Op, const SDLoc &SL, SDValue Cond, SDValue N1, SDValue N2)
static SDValue peekFNeg(SDValue Val)
static SDValue simplifyMul24(SDNode *Node24, TargetLowering::DAGCombinerInfo &DCI)
static bool isCtlzOpc(unsigned Opc)
static LLVM_READNONE bool fnegFoldsIntoOpcode(unsigned Opc)
static bool hasVolatileUser(SDNode *Val)
Interface definition of the TargetLowering class that is common to all AMD GPUs.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU promote alloca to vector or LDS
Function Alias Analysis Results
block Block Frequency Analysis
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_READNONE
Definition: Compiler.h:220
#define LLVM_READONLY
Definition: Compiler.h:227
static cl::opt< unsigned > CostThreshold("dfa-cost-threshold", cl::desc("Maximum cost accepted for the transformation"), cl::Hidden, cl::init(50))
static Error getAddrSpace(StringRef R, unsigned &AddrSpace)
Definition: DataLayout.cpp:266
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Provides analysis for querying information about KnownBits during GISel passes.
IRTranslator LLVM IR MI
static LVOptions Options
Definition: LVOptions.cpp:25
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
LLVMContext & Context
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
const char LLVMTargetMachineRef TM
const SmallVectorImpl< MachineOperand > & Cond
#define CH(x, y, z)
Definition: SHA256.cpp:34
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool Enabled
Definition: Statistic.cpp:46
Value * RHS
Value * LHS
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSAbsoluteAddress(const GlobalValue &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool hasFminFmaxLegacy() const
Align getAlignmentForImplicitArgPtr() const
bool hasMadMacF32Insts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool has16BitInsts() const
bool hasFastFMAF32() const
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
static const AMDGPUSubtarget & get(const MachineFunction &MF)
bool hasInv2PiInlineImm() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
Generate Min/Max node.
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const
EVT getTypeForExtReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const override
Return the type that should be used to zero or sign extend a zeroext/signext integer return value.
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
bool storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT, unsigned NumElem, unsigned AS) const override
Return true if it is expected to be cheaper to do a store of vector constant with the given size and ...
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool shouldCombineMemoryType(EVT VT) const
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isTruncateFree(EVT Src, EVT Dest) const override
bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override
SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const
TargetLowering::NegatibleCost getConstantNegateCost(const ConstantFPSDNode *C) const
SDValue LowerFLOGUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, bool IsLog10, SDNodeFlags Flags) const
SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool isSDNodeAlwaysUniform(const SDNode *N) const override
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const
Split a vector store into multiple scalar stores.
SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
SDValue LowerFLOG10(SDValue Op, SelectionDAG &DAG) const
SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isConstantCheaperToNegate(SDValue N) const
bool isReassocProfitable(MachineRegisterInfo &MRI, Register N0, Register N1) const override
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
bool isConstantCostlierToNegate(SDValue N) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const
SDValue lowerFEXP10Unsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
Emit approx-funcs appropriate lowering for exp10.
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const
SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isSelectSupported(SelectSupportKind) const override
bool isZExtFree(Type *Src, Type *Dest) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
SDValue lowerFEXP2(SDValue Op, SelectionDAG &DAG) const
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const
SDValue getIsLtSmallestNormal(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool mayIgnoreSignedZero(SDValue Op) const
SDValue getIsFinite(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const final
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtType, EVT ExtVT) const override
Return true if it is profitable to reduce a load to a smaller type.
MVT getVectorIdxTy(const DataLayout &) const override
Returns the type to be used for the index operand of: ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT...
std::pair< SDValue, SDValue > splitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HighVT, SelectionDAG &DAG) const
Split a vector value into two parts of types LoVT and HiVT.
SDValue LowerFLOGCommon(SDValue Op, SelectionDAG &DAG) const
SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, SDValue N) const
bool shouldSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Whether it is profitable to sink the operands of an Instruction I to the basic block of I.
SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const
bool isFAbsFree(EVT VT) const override
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
SDValue loadStackInputValue(SelectionDAG &DAG, EVT VT, const SDLoc &SL, int64_t Offset) const
Similar to CreateLiveInRegister, except value maybe loaded from a stack slot rather than passed in a ...
bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue LowerFLOG2(SDValue Op, SelectionDAG &DAG) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
unsigned computeNumSignBitsForTargetInstr(GISelKnownBits &Analysis, Register R, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
static SDValue stripBitcast(SDValue Val)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const override
Return a reciprocal estimate value for the input operand.
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI)
SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
std::pair< SDValue, SDValue > getScaledLogInput(SelectionDAG &DAG, const SDLoc SL, SDValue Op, SDNodeFlags Flags) const
If denormal handling is required return the scaled input to FLOG2, and the check for denormal range.
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
SDValue LowerFROUNDEVEN(SDValue Op, SelectionDAG &DAG) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue lowerFEXPUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue lowerCTLZResults(SDValue Op, SelectionDAG &DAG) const
SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG, bool Signed) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const
SDValue performIntrinsicWOChainCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const
SDValue performMulLoHiCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results) const
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
std::pair< EVT, EVT > getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const
Split a vector type into two parts.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue combineFMinMaxLegacyImpl(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
bool bitwiseIsEqual(const APFloat &RHS) const
Definition: APFloat.h:1260
opStatus add(const APFloat &RHS, roundingMode RM)
Definition: APFloat.h:1042
const fltSemantics & getSemantics() const
Definition: APFloat.h:1303
opStatus multiply(const APFloat &RHS, roundingMode RM)
Definition: APFloat.h:1060
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition: APFloat.h:1026
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition: APFloat.h:966
Class for arbitrary precision integers.
Definition: APInt.h:76
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1491
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1370
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition: APInt.h:1364
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:236
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition: APInt.h:1128
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:284
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:274
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition: APInt.h:1367
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:748
@ FAdd
*p = old + v
Definition: Instructions.h:785
@ FSub
*p = old - v
Definition: Instructions.h:788
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:796
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:792
@ Nand
*p = ~(old & v)
Definition: Instructions.h:770
BinOp getOperation() const
Definition: Instructions.h:845
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
LLVMContext & getContext() const
void addLoc(const CCValAssign &V)
static CCValAssign getCustomMem(unsigned ValNo, MVT ValVT, int64_t Offset, MVT LocVT, LocInfo HTP)
const APFloat & getValueAPF() const
bool isNegative() const
Return true if the value is negative.
uint64_t getZExtValue() const
This class represents an Operation in the Expression.
bool print(raw_ostream &OS, DIDumpOptions DumpOpts, const DWARFExpression *Expr, DWARFUnit *U) const
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
Diagnostic information for unsupported feature in backend.
iterator_range< arg_iterator > args()
Definition: Function.h:842
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:264
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:87
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Machine Value Type.
static auto integer_fixedlen_vector_valuetypes()
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
static auto integer_valuetypes()
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
int getObjectIndexBegin() const
Return the minimum frame object index.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Representation of each machine instruction.
Definition: MachineInstr.h:69
A description of a memory reference used in the backend.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOInvariant
The memory access always returns the same value (or traps).
Flags getFlags() const
Return the raw flags of the source value,.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isSimple() const
Returns true if the memory operation is neither atomic or volatile.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
LLVMContext & getContext() const
Get the global data context.
Definition: Module.h:301
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:293
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
iterator_range< use_iterator > uses()
SDNodeFlags getFlags() const
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
op_iterator op_end() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
unsigned getNumOperands() const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:225
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:551
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS)
Helper function to make it easier to build Select's if you just have operands and don't want to check...
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:478
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:828
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:472
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getRegister(unsigned Reg, EVT VT)
bool isConstantValueOfAnyType(SDValue N) const
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:473
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:676
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
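As a hedged example of how this query is typically used (the threshold is illustrative): a 32-bit value whose sign bit is replicated into at least the top nine bit positions behaves as a signed 24-bit quantity.
  // Hypothetical check: Op (i32) is representable in 24 signed bits when
  // at least 32 - 24 + 1 = 9 copies of the sign bit are known.
  bool FitsSigned24 = DAG.ComputeNumSignBits(Op) >= 9;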
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:469
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
Definition: SelectionDAG.h:799
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
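A minimal sketch of a typical guard (hypothetical use; Op is an i32 SDValue):
  // True when bits [0, 16) of Op are known zero, i.e. the value is a
  // whole multiple of 65536.
  bool LowHalfZero =
      DAG.MaskedValueIsZero(Op, APInt::getLowBitsSet(32, 16));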
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
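A brief sketch (assuming Ptr addresses the start of an in-memory object), combining this with the TypeSize::getFixed helper listed further down:
  // Pointer to the second 32-bit word of the object; the add carries
  // the no-wrap flags appropriate for in-object addressing.
  SDValue WordPtr = DAG.getObjectPtrOffset(SL, Ptr, TypeSize::getFixed(4));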
LLVMContext * getContext() const
Definition: SelectionDAG.h:485
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:560
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL, bool LegalTypes=true)
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:554
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
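For example, a sketch of the common i64 decomposition (V is an i64 SDValue in a live DAG):
  // Low and high i32 halves of a 64-bit scalar, via EXTRACT_ELEMENT.
  auto [Lo, Hi] = DAG.SplitScalar(V, DL, MVT::i32, MVT::i32);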
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp convert the backend supports.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
SelectSupportKind
Enum that describes what type of support for selects the target has.
virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *=nullptr) const
Determine if the target supports unaligned memory accesses.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL, bool LegalTypes=true) const
Returns the type for the shift amount of a shift opcode.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setSupportsUnalignedAtomics(bool UnalignedSupported)
Sets whether unaligned atomic operations are supported.
void setLibcallName(RTLIB::Libcall Call, const char *Name)
Rename the default libcall routine name for the specified libcall.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const
Return true if it is profitable to reduce a load to a smaller type.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
unsigned GatherAllAliasesMaxDepth
Depth that GatherAllAliases should continue looking for chain dependencies when trying to find a more...
NegatibleCost
Enum that specifies when a float negation is beneficial.
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
void setHasMultipleConditionRegisters(bool hasManyRegs=true)
Tells the code generator that the target has multiple (allocatable) condition registers that can be u...
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:76
TargetOptions Options
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:342
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
bool isIntrinsicAlwaysUniform(unsigned IntrID)
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:249
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:245
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is no...
Definition: CallingConv.h:47
@ SPIR_KERNEL
Used for SPIR kernel functions.
Definition: CallingConv.h:144
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:751
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:724
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:251
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:560
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:715
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1248
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:270
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:488
@ FMAXNUM_IEEE
Definition: ISDOpcodes.h:986
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:240
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1038
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:784
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:484
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:791
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:544
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:391
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:256
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:914
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:904
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:230
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:940
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:775
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:723
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:931
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1059
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1063
@ FCANONICALIZE
Returns the platform-specific canonical encoding of a floating-point number.
Definition: ISDOpcodes.h:501
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition: ISDOpcodes.h:508
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:728
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1244
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition: ISDOpcodes.h:223
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:652
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:706
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:601
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:574
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:985
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition: ISDOpcodes.h:47
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:536
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:203
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:781
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:743
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:972
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1048
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:799
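The SHL/SRA pair it describes can be sketched in plain C++ (relying on arithmetic right shift of a signed 32-bit integer):
  // sign_extend_inreg(X, i8) on an i32 value: move bit 7 up to bit 31,
  // then shift back down so it is replicated through bits 8..31.
  int32_t SignExtend8In32(int32_t X) {
    return (int32_t)((uint32_t)X << 24) >> 24;
  }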
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:675
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:889
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:737
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:304
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
Definition: ISDOpcodes.h:1104
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:991
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:837
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:681
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1215
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:184
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:280
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:525
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:945
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:870
@ INLINEASM
INLINEASM - Represents an inline asm block.
Definition: ISDOpcodes.h:1101
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:787
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:494
@ AssertZext
Definition: ISDOpcodes.h:62
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:192
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:516
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1530
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1510
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
m_Intrinsic_Ty< Opnd0 >::Ty m_FAbs(const Opnd0 &Op0)
Libcall
RTLIB::Libcall enum - This enum defines all of the runtime library calls the backend can emit.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
constexpr double ln2
Definition: MathExtras.h:33
constexpr double ln10
Definition: MathExtras.h:34
constexpr float log2ef
Definition: MathExtras.h:50
constexpr double log2e
Definition: MathExtras.h:35
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:456
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
bool getAlign(const Function &F, unsigned index, unsigned &align)
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:372
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition: bit.h:281
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:138
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:143
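A worked example of the Hi_32/Lo_32 pair (the immediate is illustrative):
  // 0x1122334455667788 splits into a high and a low 32-bit half.
  uint64_t Imm = 0x1122334455667788ULL;
  uint32_t Hi = Hi_32(Imm); // 0x11223344
  uint32_t Lo = Lo_32(Imm); // 0x55667788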
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
CombineLevel
Definition: DAGCombine.h:15
@ AfterLegalizeDAG
Definition: DAGCombine.h:19
@ AfterLegalizeTypes
Definition: DAGCombine.h:17
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
DWARFExpression::Operation Op
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition: Analysis.cpp:79
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
@ DS_Warning
bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
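A small sketch of what "satisfies both" means here, treating the offset as an alignment constraint on the displaced address:
  // A base known to be 8-byte aligned, accessed 4 bytes in, is only
  // guaranteed 4-byte alignment at the new address.
  Align AtOffset = commonAlignment(Align(8), 4); // == Align(4)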
APFloat neg(APFloat X)
Returns the negated value of the argument.
Definition: APFloat.h:1387
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition: Alignment.h:208
bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:249
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:230
static const fltSemantics & IEEEdouble() LLVM_READNONE
Definition: APFloat.cpp:250
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:247
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
MCRegister getRegister() const
unsigned getStackOffset() const
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
static constexpr DenormalMode getPreserveSign()
Extended Value Type.
Definition: ValueTypes.h:34
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:380
EVT getPow2VectorType(LLVMContext &Context) const
Widens the length of the given vector EVT up to the nearest power of 2 and returns that type.
Definition: ValueTypes.h:462
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:73
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:120
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:146
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:233
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition: ValueTypes.h:415
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:455
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:397
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:366
EVT getRoundIntegerType(LLVMContext &Context) const
Rounds the bit-width of the given integer EVT up to the nearest power of two (and at least to eight),...
Definition: ValueTypes.h:404
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:167
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:313
bool bitsGE(EVT VT) const
Return true if this has no less bits than VT.
Definition: ValueTypes.h:282
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
bool isExtended() const
Test if the given EVT is extended (as opposed to being simple).
Definition: ValueTypes.h:141
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition: ValueTypes.h:298
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition: KnownBits.h:104
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition: KnownBits.h:238
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:63
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition: KnownBits.h:157
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:40
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:71
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition: KnownBits.h:292
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:244
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition: KnownBits.h:141
bool isStrictlyPositive() const
Returns true if this value is known to be positive.
Definition: KnownBits.h:110
bool isNegative() const
Returns true if this value is known to be negative.
Definition: KnownBits.h:101
unsigned countMaxSignificantBits() const
Returns the maximum number of bits needed to represent all possible signed values with these known bi...
Definition: KnownBits.h:265
This class contains a discriminated union of information about pointers in memory operands,...
bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
These are IR-level optimization flags that may be propagated to SDNodes.
void setAllowContract(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...