LLVM 19.0.0git
AArch64ISelLowering.cpp
1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the AArch64TargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64ISelLowering.h"
15#include "AArch64ExpandImm.h"
18#include "AArch64RegisterInfo.h"
19#include "AArch64Subtarget.h"
22#include "llvm/ADT/APFloat.h"
23#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/ArrayRef.h"
25#include "llvm/ADT/STLExtras.h"
26#include "llvm/ADT/SmallSet.h"
28#include "llvm/ADT/Statistic.h"
29#include "llvm/ADT/StringRef.h"
30#include "llvm/ADT/Twine.h"
59#include "llvm/IR/Attributes.h"
60#include "llvm/IR/Constants.h"
61#include "llvm/IR/DataLayout.h"
62#include "llvm/IR/DebugLoc.h"
64#include "llvm/IR/Function.h"
66#include "llvm/IR/GlobalValue.h"
67#include "llvm/IR/IRBuilder.h"
68#include "llvm/IR/Instruction.h"
71#include "llvm/IR/Intrinsics.h"
72#include "llvm/IR/IntrinsicsAArch64.h"
73#include "llvm/IR/Module.h"
75#include "llvm/IR/Type.h"
76#include "llvm/IR/Use.h"
77#include "llvm/IR/Value.h"
83#include "llvm/Support/Debug.h"
92#include <algorithm>
93#include <bitset>
94#include <cassert>
95#include <cctype>
96#include <cstdint>
97#include <cstdlib>
98#include <iterator>
99#include <limits>
100#include <optional>
101#include <tuple>
102#include <utility>
103#include <vector>
104
105using namespace llvm;
106using namespace llvm::PatternMatch;
107
108#define DEBUG_TYPE "aarch64-lower"
109
110STATISTIC(NumTailCalls, "Number of tail calls");
111STATISTIC(NumShiftInserts, "Number of vector shift inserts");
112STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
113
114// FIXME: The necessary dtprel relocations don't seem to be supported
115// well in the GNU bfd and gold linkers at the moment. Therefore, by
116// default, for now, fall back to GeneralDynamic code generation.
118 "aarch64-elf-ldtls-generation", cl::Hidden,
119 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
120 cl::init(false));
121
122static cl::opt<bool>
123EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
124 cl::desc("Enable AArch64 logical imm instruction "
125 "optimization"),
126 cl::init(true));
127
128// Temporary option added for the purpose of testing functionality added
129// to DAGCombiner.cpp in D92230. It is expected that this can be removed
130// in future when both implementations will be based off MGATHER rather
131// than the GLD1 nodes added for the SVE gather load intrinsics.
132static cl::opt<bool>
133EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
134 cl::desc("Combine extends of AArch64 masked "
135 "gather intrinsics"),
136 cl::init(true));
137
138static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
139 cl::desc("Combine ext and trunc to TBL"),
140 cl::init(true));
141
142// All of the XOR, OR and CMP operations use ALU ports, and the data dependency
143// becomes the bottleneck after this transform on high-end CPUs. This maximum
144// leaf-node limit guards that the cmp+ccmp transform stays profitable.
145static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
146 cl::desc("Maximum of xors"));
147
148/// Value type used for condition codes.
149static const MVT MVT_CC = MVT::i32;
150
151static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
152 AArch64::X3, AArch64::X4, AArch64::X5,
153 AArch64::X6, AArch64::X7};
154static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
155 AArch64::Q3, AArch64::Q4, AArch64::Q5,
156 AArch64::Q6, AArch64::Q7};
157
159
161
162static inline EVT getPackedSVEVectorVT(EVT VT) {
163 switch (VT.getSimpleVT().SimpleTy) {
164 default:
165 llvm_unreachable("unexpected element type for vector");
166 case MVT::i8:
167 return MVT::nxv16i8;
168 case MVT::i16:
169 return MVT::nxv8i16;
170 case MVT::i32:
171 return MVT::nxv4i32;
172 case MVT::i64:
173 return MVT::nxv2i64;
174 case MVT::f16:
175 return MVT::nxv8f16;
176 case MVT::f32:
177 return MVT::nxv4f32;
178 case MVT::f64:
179 return MVT::nxv2f64;
180 case MVT::bf16:
181 return MVT::nxv8bf16;
182 }
183}
184
185// NOTE: Currently there's only a need to return integer vector types. If this
186// changes then just add an extra "type" parameter.
188 switch (EC.getKnownMinValue()) {
189 default:
190 llvm_unreachable("unexpected element count for vector");
191 case 16:
192 return MVT::nxv16i8;
193 case 8:
194 return MVT::nxv8i16;
195 case 4:
196 return MVT::nxv4i32;
197 case 2:
198 return MVT::nxv2i64;
199 }
200}
201
203 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
204 "Expected scalable predicate vector type!");
205 switch (VT.getVectorMinNumElements()) {
206 default:
207 llvm_unreachable("unexpected element count for vector");
208 case 2:
209 return MVT::nxv2i64;
210 case 4:
211 return MVT::nxv4i32;
212 case 8:
213 return MVT::nxv8i16;
214 case 16:
215 return MVT::nxv16i8;
216 }
217}
218
219/// Returns true if VT's elements occupy the lowest bit positions of its
220/// associated register class without any intervening space.
221///
222/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
223/// same register class, but only nxv8f16 can be treated as a packed vector.
224static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
226 "Expected legal vector type!");
227 return VT.isFixedLengthVector() ||
229}
230
231// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
232// predicate and end with a passthru value matching the result type.
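// Illustrative example (see AArch64ISelLowering.h for the full list): nodes such
// as AArch64ISD::FCEIL_MERGE_PASSTHRU take (predicate, source, passthru) and
// leave the inactive lanes equal to the passthru operand.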
233static bool isMergePassthruOpcode(unsigned Opc) {
234 switch (Opc) {
235 default:
236 return false;
266 return true;
267 }
268}
269
270// Returns true if inactive lanes are known to be zeroed by construction.
272 switch (Op.getOpcode()) {
273 default:
274 return false;
275 // We guarantee i1 splat_vectors to zero the other lanes
279 return true;
281 switch (Op.getConstantOperandVal(0)) {
282 default:
283 return false;
284 case Intrinsic::aarch64_sve_ptrue:
285 case Intrinsic::aarch64_sve_pnext:
286 case Intrinsic::aarch64_sve_cmpeq:
287 case Intrinsic::aarch64_sve_cmpne:
288 case Intrinsic::aarch64_sve_cmpge:
289 case Intrinsic::aarch64_sve_cmpgt:
290 case Intrinsic::aarch64_sve_cmphs:
291 case Intrinsic::aarch64_sve_cmphi:
292 case Intrinsic::aarch64_sve_cmpeq_wide:
293 case Intrinsic::aarch64_sve_cmpne_wide:
294 case Intrinsic::aarch64_sve_cmpge_wide:
295 case Intrinsic::aarch64_sve_cmpgt_wide:
296 case Intrinsic::aarch64_sve_cmplt_wide:
297 case Intrinsic::aarch64_sve_cmple_wide:
298 case Intrinsic::aarch64_sve_cmphs_wide:
299 case Intrinsic::aarch64_sve_cmphi_wide:
300 case Intrinsic::aarch64_sve_cmplo_wide:
301 case Intrinsic::aarch64_sve_cmpls_wide:
302 case Intrinsic::aarch64_sve_fcmpeq:
303 case Intrinsic::aarch64_sve_fcmpne:
304 case Intrinsic::aarch64_sve_fcmpge:
305 case Intrinsic::aarch64_sve_fcmpgt:
306 case Intrinsic::aarch64_sve_fcmpuo:
307 case Intrinsic::aarch64_sve_facgt:
308 case Intrinsic::aarch64_sve_facge:
309 case Intrinsic::aarch64_sve_whilege:
310 case Intrinsic::aarch64_sve_whilegt:
311 case Intrinsic::aarch64_sve_whilehi:
312 case Intrinsic::aarch64_sve_whilehs:
313 case Intrinsic::aarch64_sve_whilele:
314 case Intrinsic::aarch64_sve_whilelo:
315 case Intrinsic::aarch64_sve_whilels:
316 case Intrinsic::aarch64_sve_whilelt:
317 case Intrinsic::aarch64_sve_match:
318 case Intrinsic::aarch64_sve_nmatch:
319 case Intrinsic::aarch64_sve_whilege_x2:
320 case Intrinsic::aarch64_sve_whilegt_x2:
321 case Intrinsic::aarch64_sve_whilehi_x2:
322 case Intrinsic::aarch64_sve_whilehs_x2:
323 case Intrinsic::aarch64_sve_whilele_x2:
324 case Intrinsic::aarch64_sve_whilelo_x2:
325 case Intrinsic::aarch64_sve_whilels_x2:
326 case Intrinsic::aarch64_sve_whilelt_x2:
327 return true;
328 }
329 }
330}
331
333 const AArch64Subtarget &STI)
334 : TargetLowering(TM), Subtarget(&STI) {
335 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
336 // we have to make something up. Arbitrarily, choose ZeroOrOne.
338 // When comparing vectors the result sets the different elements in the
339 // vector to all-one or all-zero.
341
342 // Set up the register classes.
343 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
344 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
345
346 if (Subtarget->hasLS64()) {
347 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
348 setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
350 }
351
352 if (Subtarget->hasFPARMv8()) {
353 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
354 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
355 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
356 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
357 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
358 }
359
360 if (Subtarget->hasNEON()) {
361 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
362 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
363 // Someone set us up the NEON.
364 addDRTypeForNEON(MVT::v2f32);
365 addDRTypeForNEON(MVT::v8i8);
366 addDRTypeForNEON(MVT::v4i16);
367 addDRTypeForNEON(MVT::v2i32);
368 addDRTypeForNEON(MVT::v1i64);
369 addDRTypeForNEON(MVT::v1f64);
370 addDRTypeForNEON(MVT::v4f16);
371 addDRTypeForNEON(MVT::v4bf16);
372
373 addQRTypeForNEON(MVT::v4f32);
374 addQRTypeForNEON(MVT::v2f64);
375 addQRTypeForNEON(MVT::v16i8);
376 addQRTypeForNEON(MVT::v8i16);
377 addQRTypeForNEON(MVT::v4i32);
378 addQRTypeForNEON(MVT::v2i64);
379 addQRTypeForNEON(MVT::v8f16);
380 addQRTypeForNEON(MVT::v8bf16);
381 }
382
383 if (Subtarget->hasSVEorSME()) {
384 // Add legal sve predicate types
385 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
386 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
387 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
388 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
389 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
390
391 // Add legal sve data types
392 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
393 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
394 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
395 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
396
397 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
398 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
399 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
400 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
401 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
402 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
403
404 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
405 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
406 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
407
408 if (Subtarget->useSVEForFixedLengthVectors()) {
411 addRegisterClass(VT, &AArch64::ZPRRegClass);
412
415 addRegisterClass(VT, &AArch64::ZPRRegClass);
416 }
417 }
418
419 if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
420 addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
421 setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
422 setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);
423
424 setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
425 setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
426 }
427
428 // Compute derived properties from the register classes
430
431 // Provide all sorts of operation actions
470
474
478
480
481 // Custom lowering hooks are needed for XOR
482 // to fold it into CSINC/CSINV.
485
486 // Virtually no operation on f128 is legal, but LLVM can't expand them when
487 // there's a valid register class, so we need custom operations in most cases.
511 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
512 // aren't handled.
513
514 // Lowering for many of the conversions is actually specified by the non-f128
515 // type. The LowerXXX function will be trivial when f128 isn't involved.
540 if (Subtarget->hasFPARMv8()) {
543 }
546 if (Subtarget->hasFPARMv8()) {
549 }
552
557
558 // Variable arguments.
563
564 // Variable-sized objects.
567
568 // Lowering Funnel Shifts to EXTR
573
575
576 // Constant pool entries
578
579 // BlockAddress
581
582 // AArch64 lacks both left-rotate and popcount instructions.
588 }
589
590 // AArch64 doesn't have i32 MULH{S|U}.
593
594 // AArch64 doesn't have {U|S}MUL_LOHI.
599
600 if (Subtarget->hasCSSC()) {
604
606
610
613
618
623 } else {
627
630
633 }
634
640 }
647
648 // Custom lower Add/Sub/Mul with overflow.
661
670
679 if (Subtarget->hasFullFP16()) {
682 } else {
685 }
686
687 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
695 setOperationAction(Op, MVT::f16, Promote);
696 setOperationAction(Op, MVT::v4f16, Expand);
697 setOperationAction(Op, MVT::v8f16, Expand);
698 setOperationAction(Op, MVT::bf16, Promote);
699 setOperationAction(Op, MVT::v4bf16, Expand);
700 setOperationAction(Op, MVT::v8bf16, Expand);
701 }
702
703 auto LegalizeNarrowFP = [this](MVT ScalarVT) {
704 for (auto Op : {
708 ISD::FADD,
709 ISD::FSUB,
710 ISD::FMUL,
711 ISD::FDIV,
712 ISD::FMA,
742 })
743 setOperationAction(Op, ScalarVT, Promote);
744
745 for (auto Op : {ISD::FNEG, ISD::FABS})
746 setOperationAction(Op, ScalarVT, Legal);
747
748 // Round-to-integer operations need custom lowering for fp16, as Promote doesn't work
749 // because the result type is integer.
753 setOperationAction(Op, ScalarVT, Custom);
754
755 // promote v4f16 to v4f32 when that is known to be safe.
756 auto V4Narrow = MVT::getVectorVT(ScalarVT, 4);
757 setOperationPromotedToType(ISD::FADD, V4Narrow, MVT::v4f32);
758 setOperationPromotedToType(ISD::FSUB, V4Narrow, MVT::v4f32);
759 setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
760 setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
761 setOperationPromotedToType(ISD::FCEIL, V4Narrow, MVT::v4f32);
762 setOperationPromotedToType(ISD::FFLOOR, V4Narrow, MVT::v4f32);
763 setOperationPromotedToType(ISD::FROUND, V4Narrow, MVT::v4f32);
764 setOperationPromotedToType(ISD::FTRUNC, V4Narrow, MVT::v4f32);
765 setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32);
766 setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32);
767 setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
768
778
779 auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
801 };
802
803 if (!Subtarget->hasFullFP16()) {
804 LegalizeNarrowFP(MVT::f16);
805 }
806 LegalizeNarrowFP(MVT::bf16);
809
810 // AArch64 has implementations of a lot of rounding-like FP operations.
811 for (auto Op :
822 for (MVT Ty : {MVT::f32, MVT::f64})
824 if (Subtarget->hasFullFP16())
825 setOperationAction(Op, MVT::f16, Legal);
826 }
827
828 // Basic strict FP operations are legal
831 for (MVT Ty : {MVT::f32, MVT::f64})
833 if (Subtarget->hasFullFP16())
834 setOperationAction(Op, MVT::f16, Legal);
835 }
836
837 // Strict conversion to a larger type is legal
838 for (auto VT : {MVT::f32, MVT::f64})
840
842
848
850 if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
853 } else {
856 }
859
860 // Generate outline atomics library calls only if LSE was not specified for
861 // the subtarget
862 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
888#define LCALLNAMES(A, B, N) \
889 setLibcallName(A##N##_RELAX, #B #N "_relax"); \
890 setLibcallName(A##N##_ACQ, #B #N "_acq"); \
891 setLibcallName(A##N##_REL, #B #N "_rel"); \
892 setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
893#define LCALLNAME4(A, B) \
894 LCALLNAMES(A, B, 1) \
895 LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
896#define LCALLNAME5(A, B) \
897 LCALLNAMES(A, B, 1) \
898 LCALLNAMES(A, B, 2) \
899 LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
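// As an illustration, LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp) below
// registers the libcall names __aarch64_swp{1,2,4,8}_{relax,acq,rel,acq_rel},
// i.e. one outline-atomic helper per access size and memory ordering.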
900 LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
901 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
902 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
903 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
904 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
905 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
906#undef LCALLNAMES
907#undef LCALLNAME4
908#undef LCALLNAME5
909 }
910
911 if (Subtarget->hasLSE128()) {
912 // Custom lowering because i128 is not legal. Must be replaced by 2x64
913 // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
917 }
918
919 // 128-bit loads and stores can be done without expanding
922
923 // Aligned 128-bit loads and stores are single-copy atomic according to the
924 // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
925 if (Subtarget->hasLSE2()) {
928 }
929
930 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
931 // custom lowering, as there are no un-paired non-temporal stores and
932 // legalization will break up 256 bit inputs.
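// For example, a non-temporal store of an MVT::v8i32 value can be emitted as a
// single STNP of two Q registers instead of being split by type legalization.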
934 setOperationAction(ISD::STORE, MVT::v16i16, Custom);
935 setOperationAction(ISD::STORE, MVT::v16f16, Custom);
936 setOperationAction(ISD::STORE, MVT::v16bf16, Custom);
941
942 // 256 bit non-temporal loads can be lowered to LDNP. This is done using
943 // custom lowering, as there are no un-paired non-temporal loads and
944 // legalization will break up 256 bit inputs.
945 setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
946 setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
947 setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
948 setOperationAction(ISD::LOAD, MVT::v16bf16, Custom);
949 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
950 setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
951 setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
952 setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
953
954 // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
956
957 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
958 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
959 // Issue __sincos_stret if available.
962 } else {
965 }
966
967 if (Subtarget->getTargetTriple().isOSMSVCRT()) {
968 // MSVCRT doesn't have powi; fall back to pow
969 setLibcallName(RTLIB::POWI_F32, nullptr);
970 setLibcallName(RTLIB::POWI_F64, nullptr);
971 }
972
973 // Make floating-point constants legal for the large code model, so they don't
974 // become loads from the constant pool.
975 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
978 }
979
980 // AArch64 does not have floating-point extending loads, i1 sign-extending
981 // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
982 for (MVT VT : MVT::fp_valuetypes()) {
983 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
984 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
985 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
986 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
987 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
988 }
989 for (MVT VT : MVT::integer_valuetypes())
991
992 for (MVT WideVT : MVT::fp_valuetypes()) {
993 for (MVT NarrowVT : MVT::fp_valuetypes()) {
994 if (WideVT.getScalarSizeInBits() > NarrowVT.getScalarSizeInBits()) {
995 setTruncStoreAction(WideVT, NarrowVT, Expand);
996 }
997 }
998 }
999
1000 if (Subtarget->hasFPARMv8()) {
1004 }
1005
1006 // Indexed loads and stores are supported.
1007 for (unsigned im = (unsigned)ISD::PRE_INC;
1009 setIndexedLoadAction(im, MVT::i8, Legal);
1010 setIndexedLoadAction(im, MVT::i16, Legal);
1011 setIndexedLoadAction(im, MVT::i32, Legal);
1012 setIndexedLoadAction(im, MVT::i64, Legal);
1013 setIndexedLoadAction(im, MVT::f64, Legal);
1014 setIndexedLoadAction(im, MVT::f32, Legal);
1015 setIndexedLoadAction(im, MVT::f16, Legal);
1016 setIndexedLoadAction(im, MVT::bf16, Legal);
1017 setIndexedStoreAction(im, MVT::i8, Legal);
1018 setIndexedStoreAction(im, MVT::i16, Legal);
1019 setIndexedStoreAction(im, MVT::i32, Legal);
1020 setIndexedStoreAction(im, MVT::i64, Legal);
1021 setIndexedStoreAction(im, MVT::f64, Legal);
1022 setIndexedStoreAction(im, MVT::f32, Legal);
1023 setIndexedStoreAction(im, MVT::f16, Legal);
1024 setIndexedStoreAction(im, MVT::bf16, Legal);
1025 }
1026
1027 // Trap.
1028 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1031
1032 // We combine OR nodes for bitfield operations.
1034 // Try to create BICs for vector ANDs.
1036
1037 // Vector add and sub nodes may conceal a high-half opportunity.
1038 // Also, try to fold ADD into CSINC/CSINV.
1041
1044
1045 // Try and combine setcc with csel
1047
1049
1056
1058
1060
1062
1066
1068
1070
1072
1074
1078
1080
1081 // In case of strict alignment, avoid an excessive number of byte wide stores.
1084 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
1085
1089 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
1090
1093
1096 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
1097
1099
1101
1102 EnableExtLdPromotion = true;
1103
1104 // Set required alignment.
1106 // Set preferred alignments.
1107
1108 // Don't align loops on Windows. The SEH unwind info generation needs to
1109 // know the exact length of functions before the alignments have been
1110 // expanded.
1111 if (!Subtarget->isTargetWindows())
1115
1116 // Only change the limit for entries in a jump table if specified by
1117 // the subtarget, but not at the command line.
1118 unsigned MaxJT = STI.getMaximumJumpTableSize();
1119 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1121
1123
1125
1127
1128 if (Subtarget->hasNEON()) {
1129 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1130 // silliness like this:
1131 for (auto Op :
1149 setOperationAction(Op, MVT::v1f64, Expand);
1150
1151 for (auto Op :
1156 setOperationAction(Op, MVT::v1i64, Expand);
1157
1158 // AArch64 doesn't have direct vector ->f32 conversion instructions for
1159 // elements smaller than i32, so promote the input to i32 first.
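 // For instance, a v4i8 -> v4f32 conversion is performed as v4i8 -> v4i32 -> v4f32.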
1160 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
1161 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
1162
1163 // Similarly, there is no direct i32 -> f64 vector conversion instruction.
1164 // Or a direct i32 -> f16 vector conversion. Set it to Custom, so the
1165 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
1168 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1170
1171 if (Subtarget->hasFullFP16()) {
1174
1183 } else {
1184 // when AArch64 doesn't have fullfp16 support, promote the input
1185 // to i32 first.
1186 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1187 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1188 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1189 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1190 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1191 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1192 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1193 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
1194 }
1195
1196 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
1197 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
1204 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1209 }
1210
1211 // Custom handling for some quad-vector types to detect MULL.
1212 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1213 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1214 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1215 setOperationAction(ISD::MUL, MVT::v4i16, Custom);
1216 setOperationAction(ISD::MUL, MVT::v2i32, Custom);
1217 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1218
1219 // Saturates
1220 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1221 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1226 }
1227
1228 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1229 MVT::v4i32}) {
1236 }
1237
1238 // Vector reductions
1239 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1240 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1241 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1246
1248 }
1249 }
1250 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1251 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1260 }
1265
1267 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1268 // Likewise, narrowing and extending vector loads/stores aren't handled
1269 // directly.
1272
1273 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1276 } else {
1279 }
1282
1285
1286 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1287 setTruncStoreAction(VT, InnerVT, Expand);
1288 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1289 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1290 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1291 }
1292 }
1293
1294 // AArch64 has implementations of a lot of rounding-like FP operations.
1295 for (auto Op :
1300 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1302 if (Subtarget->hasFullFP16())
1303 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1305 }
1306
1307 // LRINT and LLRINT.
1308 for (auto Op : {ISD::LRINT, ISD::LLRINT}) {
1309 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1311 if (Subtarget->hasFullFP16())
1312 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1314 }
1315
1316 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1317
1322
1326
1327 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1328 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1329 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1330 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1331 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1332 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1333
1334 // ADDP custom lowering
1335 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1337 // FADDP custom lowering
1338 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1340 }
1341
1342 if (Subtarget->hasSME()) {
1344 }
1345
1346 // FIXME: Move lowering for more nodes here if those are common between
1347 // SVE and SME.
1348 if (Subtarget->hasSVEorSME()) {
1349 for (auto VT :
1350 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1355 }
1356 }
1357
1358 if (Subtarget->hasSVEorSME()) {
1359 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1402
1408
1417
1422
1423 if (!Subtarget->isLittleEndian())
1425
1426 if (Subtarget->hasSVE2orSME())
1427 // For SLI/SRI.
1429 }
1430
1431 // Illegal unpacked integer vector types.
1432 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1435 }
1436
1437 // Legalize unpacked bitcasts to REINTERPRET_CAST.
1438 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32, MVT::nxv2bf16,
1439 MVT::nxv4bf16, MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32})
1441
1442 for (auto VT :
1443 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1444 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1446
1447 for (auto VT :
1448 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1456
1460
1461 // There are no legal MVT::nxv16f## based types.
1462 if (VT != MVT::nxv16i1) {
1465 }
1466 }
1467
1468 // NEON doesn't support masked loads/stores/gathers/scatters, but SVE does
1469 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1470 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1471 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1476 }
1477
1478 // Firstly, exclude all scalable vector extending loads/truncating stores,
1479 // including both integer and floating-point scalable vectors.
1481 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1482 setTruncStoreAction(VT, InnerVT, Expand);
1483 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1484 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1485 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1486 }
1487 }
1488
1489 // Then, selectively enable those which we directly support.
1490 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1491 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1492 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1493 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1494 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1495 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1496 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1497 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1498 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1499 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1500 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1501 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1502 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1503 }
1504
1505 // SVE supports truncating stores of 64 and 128-bit vectors
1506 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1507 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1508 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1509 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1510 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1511
1512 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1513 MVT::nxv4f32, MVT::nxv2f64}) {
1551 if (Subtarget->isSVEAvailable())
1556
1570
1582
1583 if (!Subtarget->isLittleEndian())
1585 }
1586
1587 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1595
1596 if (!Subtarget->isLittleEndian())
1598 }
1599
1602
1603 // NEON doesn't support integer divides, but SVE does
1604 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1605 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1608 }
1609
1610 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1611 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1612 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1613
1614 if (Subtarget->isSVEAvailable()) {
1615 // NEON doesn't support across-vector reductions, but SVE does.
1616 for (auto VT :
1617 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64})
1619 }
1620
1621 // NOTE: Currently this has to happen after computeRegisterProperties rather
1622 // than the preferred option of combining it with the addRegisterClass call.
1623 if (Subtarget->useSVEForFixedLengthVectors()) {
1626 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1627 addTypeForFixedLengthSVE(VT);
1628 }
1631 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1632 addTypeForFixedLengthSVE(VT);
1633 }
1634
1635 // 64-bit results can come from an input wider than NEON supports.
1636 for (auto VT : {MVT::v8i8, MVT::v4i16})
1639
1640 // 128-bit results imply an input wider than NEON supports.
1641 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1643 for (auto VT : {MVT::v8f16, MVT::v4f32})
1645
1646 // These operations are not supported on NEON but SVE can do them.
1648 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1649 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1650 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1651 setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
1652 setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
1653 setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
1654 setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
1655 setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1656 setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1657 setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1658 setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1659 setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1660 setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1661 setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1662 setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1667
1668 // Int operations with no NEON support.
1669 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1670 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1678 }
1679
1680 // Use SVE for vectors with more than 2 elements.
1681 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1683 }
1684
1685 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
1686 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
1687 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
1688 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
1689
1691
1692 for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1})
1694 }
1695
1696 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1697 // Only required for llvm.aarch64.mops.memset.tag
1699 }
1700
1702
1703 if (Subtarget->hasSVE()) {
1708 }
1709
1710 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
1711
1712 IsStrictFPEnabled = true;
1714
1715 if (Subtarget->isWindowsArm64EC()) {
1716 // FIXME: are there intrinsics we need to exclude from this?
1717 for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) {
1718 auto code = static_cast<RTLIB::Libcall>(i);
1719 auto libcallName = getLibcallName(code);
1720 if ((libcallName != nullptr) && (libcallName[0] != '#')) {
1721 setLibcallName(code, Saver.save(Twine("#") + libcallName).data());
1722 }
1723 }
1724 }
1725}
1726
1727void AArch64TargetLowering::addTypeForNEON(MVT VT) {
1728 assert(VT.isVector() && "VT should be a vector type");
1729
1730 if (VT.isFloatingPoint()) {
1732 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
1733 setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
1734 }
1735
1736 // Mark vector float intrinsics as expand.
1737 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1747 }
1748
1749 // But we do support custom-lowering for FCOPYSIGN.
1750 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
1751 ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 ||
1752 VT == MVT::v8f16) &&
1753 Subtarget->hasFullFP16()))
1755
1768
1772 for (MVT InnerVT : MVT::all_valuetypes())
1773 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1774
1775 // CNT supports only B element sizes, then use UADDLP to widen.
1776 if (VT != MVT::v8i8 && VT != MVT::v16i8)
1778
1784
1785 for (unsigned Opcode :
1788 setOperationAction(Opcode, VT, Custom);
1789
1790 if (!VT.isFloatingPoint())
1792
1793 // [SU][MIN|MAX] are available for all NEON types apart from i64.
1794 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1795 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
1796 setOperationAction(Opcode, VT, Legal);
1797
1798 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
1799 // NEON types.
1800 if (VT.isFloatingPoint() &&
1801 VT.getVectorElementType() != MVT::bf16 &&
1802 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1803 for (unsigned Opcode :
1809 setOperationAction(Opcode, VT, Legal);
1810
1811 // Strict fp extend and trunc are legal
1812 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
1814 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
1816
1817 // FIXME: We could potentially make use of the vector comparison instructions
1818 // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
1819 // complications:
1820 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
1821 // so we would need to expand when the condition code doesn't match the
1822 // kind of comparison.
1823 // * Some kinds of comparison require more than one FCMXY instruction so
1824 // would need to be expanded instead.
1825 // * The lowering of the non-strict versions involves target-specific ISD
1826 // nodes so we would likely need to add strict versions of all of them and
1827 // handle them appropriately.
1830
1831 if (Subtarget->isLittleEndian()) {
1832 for (unsigned im = (unsigned)ISD::PRE_INC;
1836 }
1837 }
1838
1839 if (Subtarget->hasD128()) {
1842 }
1843}
1844
1846 EVT OpVT) const {
1847 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
1848 if (!Subtarget->hasSVE())
1849 return true;
1850
1851 // We can only support legal predicate result types. We can use the SVE
1852 // whilelo instruction for generating fixed-width predicates too.
1853 if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
1854 ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 &&
1855 ResVT != MVT::v8i1 && ResVT != MVT::v16i1)
1856 return true;
1857
1858 // The whilelo instruction only works with i32 or i64 scalar inputs.
1859 if (OpVT != MVT::i32 && OpVT != MVT::i64)
1860 return true;
1861
1862 return false;
1863}
1864
1866 return !Subtarget->hasSVEorSME() || VT != MVT::nxv16i1;
1867}
1868
1869void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
1870 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
1871
1872 // By default everything must be expanded.
1873 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1875
1876 if (VT.isFloatingPoint()) {
1886 }
1887
1889 VT == MVT::v1f64 ? Expand : Custom;
1890
1891 // Mark integer truncating stores/extending loads as having custom lowering
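 // (Illustrative trace: for VT == MVT::v4i32 the loop below visits InnerVT ==
 // MVT::v4i8 and then MVT::v4i16.)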
1892 if (VT.isInteger()) {
1893 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
1894 while (InnerVT != VT) {
1895 setTruncStoreAction(VT, InnerVT, Default);
1896 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Default);
1897 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Default);
1898 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
1899 InnerVT = InnerVT.changeVectorElementType(
1900 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
1901 }
1902 }
1903
1904 // Mark floating-point truncating stores/extending loads as having custom
1905 // lowering
1906 if (VT.isFloatingPoint()) {
1907 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
1908 while (InnerVT != VT) {
1909 setTruncStoreAction(VT, InnerVT, Custom);
1910 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
1911 InnerVT = InnerVT.changeVectorElementType(
1913 }
1914 }
1915
1916 bool PreferNEON = VT.is64BitVector() || VT.is128BitVector();
1917 bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable();
1918
1919 // Lower fixed length vector operations to scalable equivalents.
1924 setOperationAction(ISD::BITCAST, VT, PreferNEON ? Legal : Default);
1961 setOperationAction(ISD::LOAD, VT, PreferNEON ? Legal : Default);
1962 setOperationAction(ISD::MGATHER, VT, PreferSVE ? Default : Expand);
1964 setOperationAction(ISD::MSCATTER, VT, PreferSVE ? Default : Expand);
1983 setOperationAction(ISD::STORE, VT, PreferNEON ? Legal : Default);
2009}
2010
2011void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
2012 addRegisterClass(VT, &AArch64::FPR64RegClass);
2013 addTypeForNEON(VT);
2014}
2015
2016void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
2017 addRegisterClass(VT, &AArch64::FPR128RegClass);
2018 addTypeForNEON(VT);
2019}
2020
2022 LLVMContext &C, EVT VT) const {
2023 if (!VT.isVector())
2024 return MVT::i32;
2025 if (VT.isScalableVector())
2026 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
2028}
2029
2030// isIntImmediate - This method tests to see if the node is a constant
2031// operand. If so Imm will receive the value.
2032static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
2033 if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
2034 Imm = C->getZExtValue();
2035 return true;
2036 }
2037 return false;
2038}
2039
2040// isOpcWithIntImmediate - This method tests to see if the node is a specific
2041// opcode and that it has an immediate integer right operand.
2042// If so Imm will receive the value.
2043static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
2044 uint64_t &Imm) {
2045 return N->getOpcode() == Opc &&
2046 isIntImmediate(N->getOperand(1).getNode(), Imm);
2047}
2048
2049static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
2050 const APInt &Demanded,
2052 unsigned NewOpc) {
2053 uint64_t OldImm = Imm, NewImm, Enc;
2054 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
2055
2056 // Return if the immediate is already all zeros, all ones, a bimm32 or a
2057 // bimm64.
2058 if (Imm == 0 || Imm == Mask ||
2060 return false;
2061
2062 unsigned EltSize = Size;
2063 uint64_t DemandedBits = Demanded.getZExtValue();
2064
2065 // Clear bits that are not demanded.
2066 Imm &= DemandedBits;
2067
2068 while (true) {
2069 // The goal here is to set the non-demanded bits in a way that minimizes
2070 // the number of switching between 0 and 1. In order to achieve this goal,
2071 // we set the non-demanded bits to the value of the preceding demanded bits.
2072 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2073 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2074 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2075 // The final result is 0b11000011.
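 // Illustrative trace of the example above (Size == 8): DemandedBits == 0b01100101
 // and Imm == 0b01000001 give NonDemandedBits == 0b10011010 and Ones == 0b10000010,
 // so NewImm == (Imm | Ones) == 0b11000011.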
2076 uint64_t NonDemandedBits = ~DemandedBits;
2077 uint64_t InvertedImm = ~Imm & DemandedBits;
2078 uint64_t RotatedImm =
2079 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2080 NonDemandedBits;
2081 uint64_t Sum = RotatedImm + NonDemandedBits;
2082 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2083 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2084 NewImm = (Imm | Ones) & Mask;
2085
2086 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2087 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2088 // we halve the element size and continue the search.
2089 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
2090 break;
2091
2092 // We cannot shrink the element size any further if it is 2-bits.
2093 if (EltSize == 2)
2094 return false;
2095
2096 EltSize /= 2;
2097 Mask >>= EltSize;
2098 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2099
2100 // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
2101 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2102 return false;
2103
2104 // Merge the upper and lower halves of Imm and DemandedBits.
2105 Imm |= Hi;
2106 DemandedBits |= DemandedBitsHi;
2107 }
2108
2109 ++NumOptimizedImms;
2110
2111 // Replicate the element across the register width.
2112 while (EltSize < Size) {
2113 NewImm |= NewImm << EltSize;
2114 EltSize *= 2;
2115 }
2116
2117 (void)OldImm;
2118 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2119 "demanded bits should never be altered");
2120 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2121
2122 // Create the new constant immediate node.
2123 EVT VT = Op.getValueType();
2124 SDLoc DL(Op);
2125 SDValue New;
2126
2127 // If the new constant immediate is all-zeros or all-ones, let the target
2128 // independent DAG combine optimize this node.
2129 if (NewImm == 0 || NewImm == OrigMask) {
2130 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
2131 TLO.DAG.getConstant(NewImm, DL, VT));
2132 // Otherwise, create a machine node so that target independent DAG combine
2133 // doesn't undo this optimization.
2134 } else {
2136 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
2137 New = SDValue(
2138 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
2139 }
2140
2141 return TLO.CombineTo(Op, New);
2142}
2143
2145 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2146 TargetLoweringOpt &TLO) const {
2147 // Delay this optimization to as late as possible.
2148 if (!TLO.LegalOps)
2149 return false;
2150
2152 return false;
2153
2154 EVT VT = Op.getValueType();
2155 if (VT.isVector())
2156 return false;
2157
2158 unsigned Size = VT.getSizeInBits();
2159 assert((Size == 32 || Size == 64) &&
2160 "i32 or i64 is expected after legalization.");
2161
2162 // Exit early if we demand all bits.
2163 if (DemandedBits.popcount() == Size)
2164 return false;
2165
2166 unsigned NewOpc;
2167 switch (Op.getOpcode()) {
2168 default:
2169 return false;
2170 case ISD::AND:
2171 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2172 break;
2173 case ISD::OR:
2174 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2175 break;
2176 case ISD::XOR:
2177 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2178 break;
2179 }
2180 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2181 if (!C)
2182 return false;
2183 uint64_t Imm = C->getZExtValue();
2184 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
2185}
2186
2187/// computeKnownBitsForTargetNode - Determine which of the bits specified in
2188/// Mask are known to be either zero or one and return them in Known.
2190 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2191 const SelectionDAG &DAG, unsigned Depth) const {
2192 switch (Op.getOpcode()) {
2193 default:
2194 break;
2195 case AArch64ISD::DUP: {
2196 SDValue SrcOp = Op.getOperand(0);
2197 Known = DAG.computeKnownBits(SrcOp, Depth + 1);
2198 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2199 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2200 "Expected DUP implicit truncation");
2201 Known = Known.trunc(Op.getScalarValueSizeInBits());
2202 }
2203 break;
2204 }
2205 case AArch64ISD::CSEL: {
2206 KnownBits Known2;
2207 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2208 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2209 Known = Known.intersectWith(Known2);
2210 break;
2211 }
2212 case AArch64ISD::BICi: {
2213 // Compute the bit cleared value.
2214 uint64_t Mask =
2215 ~(Op->getConstantOperandVal(1) << Op->getConstantOperandVal(2));
2216 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2217 Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), Mask));
2218 break;
2219 }
2220 case AArch64ISD::VLSHR: {
2221 KnownBits Known2;
2222 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2223 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2224 Known = KnownBits::lshr(Known, Known2);
2225 break;
2226 }
2227 case AArch64ISD::VASHR: {
2228 KnownBits Known2;
2229 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2230 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2231 Known = KnownBits::ashr(Known, Known2);
2232 break;
2233 }
2234 case AArch64ISD::VSHL: {
2235 KnownBits Known2;
2236 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2237 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2238 Known = KnownBits::shl(Known, Known2);
2239 break;
2240 }
2241 case AArch64ISD::MOVI: {
2243 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)));
2244 break;
2245 }
2247 case AArch64ISD::ADDlow: {
2248 if (!Subtarget->isTargetILP32())
2249 break;
2250 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
2251 Known.Zero = APInt::getHighBitsSet(64, 32);
2252 break;
2253 }
2255 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2256 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2257 break;
2258 }
2260 Intrinsic::ID IntID =
2261 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
2262 switch (IntID) {
2263 default: return;
2264 case Intrinsic::aarch64_ldaxr:
2265 case Intrinsic::aarch64_ldxr: {
2266 unsigned BitWidth = Known.getBitWidth();
2267 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
2268 unsigned MemBits = VT.getScalarSizeInBits();
2269 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
2270 return;
2271 }
2272 }
2273 break;
2274 }
2276 case ISD::INTRINSIC_VOID: {
2277 unsigned IntNo = Op.getConstantOperandVal(0);
2278 switch (IntNo) {
2279 default:
2280 break;
2281 case Intrinsic::aarch64_neon_uaddlv: {
2282 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2283 unsigned BitWidth = Known.getBitWidth();
2284 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2285 unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
2286 assert(BitWidth >= Bound && "Unexpected width!");
2288 Known.Zero |= Mask;
2289 }
2290 break;
2291 }
2292 case Intrinsic::aarch64_neon_umaxv:
2293 case Intrinsic::aarch64_neon_uminv: {
2294 // Figure out the datatype of the vector operand. The UMINV instruction
2295 // will zero extend the result, so we can mark as known zero all the
2296 // bits larger than the element datatype. 32-bit or larger doesn't need
2297 // this as those are legal types and will be handled by isel directly.
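 // Illustrative example: for a v8i8 or v16i8 input the result fits in 8 bits, so
 // bits [BitWidth-1:8] of the widened result are known to be zero.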
2298 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2299 unsigned BitWidth = Known.getBitWidth();
2300 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2301 assert(BitWidth >= 8 && "Unexpected width!");
2303 Known.Zero |= Mask;
2304 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2305 assert(BitWidth >= 16 && "Unexpected width!");
2307 Known.Zero |= Mask;
2308 }
2309 break;
2310 } break;
2311 }
2312 }
2313 }
2314}
2315
2317 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2318 unsigned Depth) const {
2319 EVT VT = Op.getValueType();
2320 unsigned VTBits = VT.getScalarSizeInBits();
2321 unsigned Opcode = Op.getOpcode();
2322 switch (Opcode) {
2323 case AArch64ISD::CMEQ:
2324 case AArch64ISD::CMGE:
2325 case AArch64ISD::CMGT:
2326 case AArch64ISD::CMHI:
2327 case AArch64ISD::CMHS:
2328 case AArch64ISD::FCMEQ:
2329 case AArch64ISD::FCMGE:
2330 case AArch64ISD::FCMGT:
2331 case AArch64ISD::CMEQz:
2332 case AArch64ISD::CMGEz:
2333 case AArch64ISD::CMGTz:
2334 case AArch64ISD::CMLEz:
2335 case AArch64ISD::CMLTz:
2336 case AArch64ISD::FCMEQz:
2337 case AArch64ISD::FCMGEz:
2338 case AArch64ISD::FCMGTz:
2339 case AArch64ISD::FCMLEz:
2340 case AArch64ISD::FCMLTz:
2341 // Compares return either 0 or all-ones
2342 return VTBits;
2343 }
2344
2345 return 1;
2346}
2347
2349 EVT) const {
2350 return MVT::i64;
2351}
2352
2354 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2355 unsigned *Fast) const {
2356 if (Subtarget->requiresStrictAlign())
2357 return false;
2358
2359 if (Fast) {
2360 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2361 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2362 // See comments in performSTORECombine() for more details about
2363 // these conditions.
2364
2365 // Code that uses clang vector extensions can mark that it
2366 // wants unaligned accesses to be treated as fast by
2367 // underspecifying alignment to be 1 or 2.
2368 Alignment <= 2 ||
2369
2370 // Disregard v2i64. Memcpy lowering produces those and splitting
2371 // them regresses performance on micro-benchmarks and olden/bh.
2372 VT == MVT::v2i64;
2373 }
2374 return true;
2375}
2376
2377// Same as above but handling LLTs instead.
2379 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2380 unsigned *Fast) const {
2381 if (Subtarget->requiresStrictAlign())
2382 return false;
2383
2384 if (Fast) {
2385 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2386 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2387 Ty.getSizeInBytes() != 16 ||
2388 // See comments in performSTORECombine() for more details about
2389 // these conditions.
2390
2391 // Code that uses clang vector extensions can mark that it
2392 // wants unaligned accesses to be treated as fast by
2393 // underspecifying alignment to be 1 or 2.
2394 Alignment <= 2 ||
2395
2396 // Disregard v2i64. Memcpy lowering produces those and splitting
2397 // them regresses performance on micro-benchmarks and olden/bh.
2398 Ty == LLT::fixed_vector(2, 64);
2399 }
2400 return true;
2401}
2402
2403FastISel *
2405 const TargetLibraryInfo *libInfo) const {
2406 return AArch64::createFastISel(funcInfo, libInfo);
2407}
2408
2409const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
2410#define MAKE_CASE(V) \
2411 case V: \
2412 return #V;
2413 switch ((AArch64ISD::NodeType)Opcode) {
2415 break;
2732 }
2733#undef MAKE_CASE
2734 return nullptr;
2735}
2736
2739 MachineBasicBlock *MBB) const {
2740 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2741 // phi node:
2742
2743 // OrigBB:
2744 // [... previous instrs leading to comparison ...]
2745 // b.ne TrueBB
2746 // b EndBB
2747 // TrueBB:
2748 // ; Fallthrough
2749 // EndBB:
2750 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2751
2752 MachineFunction *MF = MBB->getParent();
2753 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2754 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2755 DebugLoc DL = MI.getDebugLoc();
2757
2758 Register DestReg = MI.getOperand(0).getReg();
2759 Register IfTrueReg = MI.getOperand(1).getReg();
2760 Register IfFalseReg = MI.getOperand(2).getReg();
2761 unsigned CondCode = MI.getOperand(3).getImm();
2762 bool NZCVKilled = MI.getOperand(4).isKill();
2763
2764 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2765 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2766 MF->insert(It, TrueBB);
2767 MF->insert(It, EndBB);
2768
2769 // Transfer the rest of the current basic block to EndBB.
2770 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2771 MBB->end());
2773
2774 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2775 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2776 MBB->addSuccessor(TrueBB);
2777 MBB->addSuccessor(EndBB);
2778
2779 // TrueBB falls through to the end.
2780 TrueBB->addSuccessor(EndBB);
2781
2782 if (!NZCVKilled) {
2783 TrueBB->addLiveIn(AArch64::NZCV);
2784 EndBB->addLiveIn(AArch64::NZCV);
2785 }
2786
2787 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2788 .addReg(IfTrueReg)
2789 .addMBB(TrueBB)
2790 .addReg(IfFalseReg)
2791 .addMBB(MBB);
2792
2793 MI.eraseFromParent();
2794 return EndBB;
2795}
2796
2798 MachineInstr &MI, MachineBasicBlock *BB) const {
2800 BB->getParent()->getFunction().getPersonalityFn())) &&
2801 "SEH does not use catchret!");
2802 return BB;
2803}
2804
2807 MachineBasicBlock *MBB) const {
2808 MachineFunction &MF = *MBB->getParent();
2809 MachineBasicBlock::iterator MBBI = MI.getIterator();
2811 const AArch64InstrInfo &TII =
2812 *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
2813 Register TargetReg = MI.getOperand(0).getReg();
2815 TII.probedStackAlloc(MBBI, TargetReg, false);
2816
2817 MI.eraseFromParent();
2818 return NextInst->getParent();
2819}
2820
2822AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
2824 MachineBasicBlock *BB) const {
2825 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2826 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2827
2828 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2829 MIB.add(MI.getOperand(1)); // slice index register
2830 MIB.add(MI.getOperand(2)); // slice index offset
2831 MIB.add(MI.getOperand(3)); // pg
2832 MIB.add(MI.getOperand(4)); // base
2833 MIB.add(MI.getOperand(5)); // offset
2834
2835 MI.eraseFromParent(); // The pseudo is gone now.
2836 return BB;
2837}
2838
2841 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2843 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
2844
2845 MIB.addReg(AArch64::ZA, RegState::Define);
2846 MIB.add(MI.getOperand(0)); // Vector select register
2847 MIB.add(MI.getOperand(1)); // Vector select offset
2848 MIB.add(MI.getOperand(2)); // Base
2849 MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
2850
2851 MI.eraseFromParent(); // The pseudo is gone now.
2852 return BB;
2853}
2854
2857 unsigned Opcode,
2858 bool Op0IsDef) const {
2859 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2861
2862 MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode))
2863 .addReg(MI.getOperand(0).getReg(), Op0IsDef ? RegState::Define : 0);
2864 for (unsigned I = 1; I < MI.getNumOperands(); ++I)
2865 MIB.add(MI.getOperand(I));
2866
2867 MI.eraseFromParent(); // The pseudo is gone now.
2868 return BB;
2869}
2870
2872AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
2874 MachineBasicBlock *BB, bool HasTile) const {
2875 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2876 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2877 unsigned StartIdx = 0;
2878
2879 if (HasTile) {
2880 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2881 MIB.addReg(BaseReg + MI.getOperand(0).getImm());
2882 StartIdx = 1;
2883 } else
2884 MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
2885
2886 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
2887 MIB.add(MI.getOperand(I));
2888
2889 MI.eraseFromParent(); // The pseudo is gone now.
2890 return BB;
2891}
2892
2895 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2897 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
2898 MIB.add(MI.getOperand(0)); // Mask
2899
2900 unsigned Mask = MI.getOperand(0).getImm();
2901 for (unsigned I = 0; I < 8; I++) {
2902 if (Mask & (1 << I))
2903 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
2904 }
2905
2906 MI.eraseFromParent(); // The pseudo is gone now.
2907 return BB;
2908}
2909
2910MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
2911 MachineInstr &MI, MachineBasicBlock *BB) const {
2912
2913 int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode());
2914 if (SMEOrigInstr != -1) {
2915 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2916 uint64_t SMEMatrixType =
2917 TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
2918 switch (SMEMatrixType) {
2919 case (AArch64::SMEMatrixArray):
2920 return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB, /*HasTile*/ false);
2921 case (AArch64::SMEMatrixTileB):
2922 return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB, /*HasTile*/ true);
2923 case (AArch64::SMEMatrixTileH):
2924 return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB, /*HasTile*/ true);
2925 case (AArch64::SMEMatrixTileS):
2926 return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB, /*HasTile*/ true);
2927 case (AArch64::SMEMatrixTileD):
2928 return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB, /*HasTile*/ true);
2929 case (AArch64::SMEMatrixTileQ):
2930 return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB, /*HasTile*/ true);
2931 }
2932 }
2933
2934 switch (MI.getOpcode()) {
2935 default:
2936#ifndef NDEBUG
2937 MI.dump();
2938#endif
2939 llvm_unreachable("Unexpected instruction for custom inserter!");
2940
2941 case AArch64::F128CSEL:
2942 return EmitF128CSEL(MI, BB);
2943 case TargetOpcode::STATEPOINT:
2944 // STATEPOINT is a pseudo instruction which has no implicit defs/uses
2945 // while bl call instruction (where statepoint will be lowered at the end)
2946 // has implicit def. This def is early-clobber as it will be set at
2947 // the moment of the call and earlier than any use is read.
2948 // Add this implicit dead def here as a workaround.
2949 MI.addOperand(*MI.getMF(),
2950 MachineOperand::CreateReg(
2951 AArch64::LR, /*isDef*/ true,
2952 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
2953 /*isUndef*/ false, /*isEarlyClobber*/ true));
2954 [[fallthrough]];
2955 case TargetOpcode::STACKMAP:
2956 case TargetOpcode::PATCHPOINT:
2957 return emitPatchPoint(MI, BB);
2958
2959 case TargetOpcode::PATCHABLE_EVENT_CALL:
2960 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
2961 return BB;
2962
2963 case AArch64::CATCHRET:
2964 return EmitLoweredCatchRet(MI, BB);
2965
2966 case AArch64::PROBED_STACKALLOC_DYN:
2967 return EmitDynamicProbedAlloc(MI, BB);
2968
2969 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
2970 return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
2971 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
2972 return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
2973 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
2974 return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
2975 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
2976 return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
2977 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
2978 return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
2979 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
2980 return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
2981 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
2982 return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
2983 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
2984 return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
2985 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
2986 return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
2987 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
2988 return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
2989 case AArch64::LDR_ZA_PSEUDO:
2990 return EmitFill(MI, BB);
2991 case AArch64::LDR_TX_PSEUDO:
2992 return EmitZTInstr(MI, BB, AArch64::LDR_TX, /*Op0IsDef=*/true);
2993 case AArch64::STR_TX_PSEUDO:
2994 return EmitZTInstr(MI, BB, AArch64::STR_TX, /*Op0IsDef=*/false);
2995 case AArch64::ZERO_M_PSEUDO:
2996 return EmitZero(MI, BB);
2997 case AArch64::ZERO_T_PSEUDO:
2998 return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
2999 }
3000}
3001
3002//===----------------------------------------------------------------------===//
3003// AArch64 Lowering private implementation.
3004//===----------------------------------------------------------------------===//
3005
3006//===----------------------------------------------------------------------===//
3007// Lowering Code
3008//===----------------------------------------------------------------------===//
3009
3010// Forward declarations of SVE fixed length lowering helpers
3015 SelectionDAG &DAG);
3018 EVT VT);
3019
3020/// isZerosVector - Check whether SDNode N is a zero-filled vector.
3021static bool isZerosVector(const SDNode *N) {
3022 // Look through a bit convert.
3023 while (N->getOpcode() == ISD::BITCAST)
3024 N = N->getOperand(0).getNode();
3025
3026 if (ISD::isConstantSplatVectorAllZeros(N))
3027 return true;
3028
3029 if (N->getOpcode() != AArch64ISD::DUP)
3030 return false;
3031
3032 auto Opnd0 = N->getOperand(0);
3033 return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
3034}
3035
3036/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
3037/// CC
3038static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
3039 switch (CC) {
3040 default:
3041 llvm_unreachable("Unknown condition code!");
3042 case ISD::SETNE:
3043 return AArch64CC::NE;
3044 case ISD::SETEQ:
3045 return AArch64CC::EQ;
3046 case ISD::SETGT:
3047 return AArch64CC::GT;
3048 case ISD::SETGE:
3049 return AArch64CC::GE;
3050 case ISD::SETLT:
3051 return AArch64CC::LT;
3052 case ISD::SETLE:
3053 return AArch64CC::LE;
3054 case ISD::SETUGT:
3055 return AArch64CC::HI;
3056 case ISD::SETUGE:
3057 return AArch64CC::HS;
3058 case ISD::SETULT:
3059 return AArch64CC::LO;
3060 case ISD::SETULE:
3061 return AArch64CC::LS;
3062 }
3063}
3064
3065/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
3066static void changeFPCCToAArch64CC(ISD::CondCode CC,
3067 AArch64CC::CondCode &CondCode,
3068 AArch64CC::CondCode &CondCode2) {
3069 CondCode2 = AArch64CC::AL;
3070 switch (CC) {
3071 default:
3072 llvm_unreachable("Unknown FP condition!");
3073 case ISD::SETEQ:
3074 case ISD::SETOEQ:
3075 CondCode = AArch64CC::EQ;
3076 break;
3077 case ISD::SETGT:
3078 case ISD::SETOGT:
3079 CondCode = AArch64CC::GT;
3080 break;
3081 case ISD::SETGE:
3082 case ISD::SETOGE:
3083 CondCode = AArch64CC::GE;
3084 break;
3085 case ISD::SETOLT:
3086 CondCode = AArch64CC::MI;
3087 break;
3088 case ISD::SETOLE:
3089 CondCode = AArch64CC::LS;
3090 break;
3091 case ISD::SETONE:
3092 CondCode = AArch64CC::MI;
3093 CondCode2 = AArch64CC::GT;
3094 break;
3095 case ISD::SETO:
3096 CondCode = AArch64CC::VC;
3097 break;
3098 case ISD::SETUO:
3099 CondCode = AArch64CC::VS;
3100 break;
3101 case ISD::SETUEQ:
3102 CondCode = AArch64CC::EQ;
3103 CondCode2 = AArch64CC::VS;
3104 break;
3105 case ISD::SETUGT:
3106 CondCode = AArch64CC::HI;
3107 break;
3108 case ISD::SETUGE:
3109 CondCode = AArch64CC::PL;
3110 break;
3111 case ISD::SETLT:
3112 case ISD::SETULT:
3113 CondCode = AArch64CC::LT;
3114 break;
3115 case ISD::SETLE:
3116 case ISD::SETULE:
3117 CondCode = AArch64CC::LE;
3118 break;
3119 case ISD::SETNE:
3120 case ISD::SETUNE:
3121 CondCode = AArch64CC::NE;
3122 break;
3123 }
3124}
3125
3126/// Convert a DAG fp condition code to an AArch64 CC.
3127/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
3128/// should be AND'ed instead of OR'ed.
3129static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
3130 AArch64CC::CondCode &CondCode,
3131 AArch64CC::CondCode &CondCode2) {
3132 CondCode2 = AArch64CC::AL;
3133 switch (CC) {
3134 default:
3135 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3136 assert(CondCode2 == AArch64CC::AL);
3137 break;
3138 case ISD::SETONE:
3139 // (a one b)
3140 // == ((a olt b) || (a ogt b))
3141 // == ((a ord b) && (a une b))
3142 CondCode = AArch64CC::VC;
3143 CondCode2 = AArch64CC::NE;
3144 break;
3145 case ISD::SETUEQ:
3146 // (a ueq b)
3147 // == ((a uno b) || (a oeq b))
3148 // == ((a ule b) && (a uge b))
3149 CondCode = AArch64CC::PL;
3150 CondCode2 = AArch64CC::LE;
3151 break;
3152 }
3153}
3154
3155/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
3156/// CC usable with the vector instructions. Fewer operations are available
3157/// without a real NZCV register, so we have to use less efficient combinations
3158/// to get the same effect.
3159static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
3160 AArch64CC::CondCode &CondCode,
3161 AArch64CC::CondCode &CondCode2,
3162 bool &Invert) {
3163 Invert = false;
3164 switch (CC) {
3165 default:
3166 // Mostly the scalar mappings work fine.
3167 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3168 break;
3169 case ISD::SETUO:
3170 Invert = true;
3171 [[fallthrough]];
3172 case ISD::SETO:
3173 CondCode = AArch64CC::MI;
3174 CondCode2 = AArch64CC::GE;
3175 break;
3176 case ISD::SETUEQ:
3177 case ISD::SETULT:
3178 case ISD::SETULE:
3179 case ISD::SETUGT:
3180 case ISD::SETUGE:
3181 // All of the compare-mask comparisons are ordered, but we can switch
3182 // between the two by a double inversion. E.g. ULE == !OGT.
3183 Invert = true;
3184 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
3185 CondCode, CondCode2);
3186 break;
3187 }
3188}
3189
3190static bool isLegalArithImmed(uint64_t C) {
3191 // Matches AArch64DAGToDAGISel::SelectArithImmed().
3192 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
3193 LLVM_DEBUG(dbgs() << "Is imm " << C
3194 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3195 return IsLegal;
3196}
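
// For illustration only (not used by the lowering itself): the legal
// arithmetic immediates are 12-bit values, optionally shifted left by 12,
// exactly as the predicate above checks. Some sample values:
//
//   isLegalArithImmed(0xFFF);     // true:  fits in the low 12 bits
//   isLegalArithImmed(0x1000);    // true:  0x1 shifted left by 12
//   isLegalArithImmed(0x1001);    // false: would need both halves
//   isLegalArithImmed(0xFFF000);  // true:  0xFFF shifted left by 12
//   isLegalArithImmed(0x1000000); // false: exceeds the shifted 12-bit range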
3197
3198// Can a (CMP op1, (sub 0, op2) be turned into a CMN instruction on
3199// the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
3200// can be set differently by this operation. It comes down to whether
3201// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
3202// everything is fine. If not then the optimization is wrong. Thus general
3203// comparisons are only valid if op2 != 0.
3204//
3205// So, finally, the only LLVM-native comparisons that don't mention C and V
3206// are SETEQ and SETNE. They're the only ones we can safely use CMN for in
3207// the absence of information about op2.
3208static bool isCMN(SDValue Op, ISD::CondCode CC) {
3209 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
3210 (CC == ISD::SETEQ || CC == ISD::SETNE);
3211}
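
// A concrete case of the restriction described above (illustrative): with
// op2 == 0, "cmp x0, #0" computes x0 - 0 and always yields C == 1 (no
// borrow), whereas the would-be "cmn x0, #0" computes x0 + 0 and always
// yields C == 0 (no carry), so any unsigned condition (HI/HS/LO/LS) would
// read different flags. EQ and NE only look at Z, which both forms compute
// identically, which is why the transform is restricted to SETEQ/SETNE.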
3212
3213static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
3214 SelectionDAG &DAG, SDValue Chain,
3215 bool IsSignaling) {
3216 EVT VT = LHS.getValueType();
3217 assert(VT != MVT::f128);
3218
3219 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3220
3221 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3222 LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3223 {Chain, LHS});
3224 RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3225 {LHS.getValue(1), RHS});
3226 Chain = RHS.getValue(1);
3227 VT = MVT::f32;
3228 }
3229 unsigned Opcode =
3230 IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
3231 return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
3232}
3233
3234static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3235 const SDLoc &dl, SelectionDAG &DAG) {
3236 EVT VT = LHS.getValueType();
3237 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3238
3239 if (VT.isFloatingPoint()) {
3240 assert(VT != MVT::f128);
3241 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3242 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
3243 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
3244 VT = MVT::f32;
3245 }
3246 return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
3247 }
3248
3249 // The CMP instruction is just an alias for SUBS, and representing it as
3250 // SUBS means that it's possible to get CSE with subtract operations.
3251 // A later phase can perform the optimization of setting the destination
3252 // register to WZR/XZR if it ends up being unused.
3253 unsigned Opcode = AArch64ISD::SUBS;
3254
3255 if (isCMN(RHS, CC)) {
3256 // Can we combine a (CMP op1, (sub 0, op2) into a CMN instruction ?
3257 Opcode = AArch64ISD::ADDS;
3258 RHS = RHS.getOperand(1);
3259 } else if (isCMN(LHS, CC)) {
3260 // As we are looking for EQ/NE compares, the operands can be commuted; can
3261 // we combine a (CMP (sub 0, op1), op2) into a CMN instruction ?
3262 Opcode = AArch64ISD::ADDS;
3263 LHS = LHS.getOperand(1);
3264 } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
3265 if (LHS.getOpcode() == ISD::AND) {
3266 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
3267 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
3268 // of the signed comparisons.
3269 const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
3270 DAG.getVTList(VT, MVT_CC),
3271 LHS.getOperand(0),
3272 LHS.getOperand(1));
3273 // Replace all users of (and X, Y) with newly generated (ands X, Y)
3274 DAG.ReplaceAllUsesWith(LHS, ANDSNode);
3275 return ANDSNode.getValue(1);
3276 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
3277 // Use result of ANDS
3278 return LHS.getValue(1);
3279 }
3280 }
3281
3282 return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
3283 .getValue(1);
3284}
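
// For example (schematic only):
//   (setcc x0, (sub 0, x1), eq)  ->  cmn x0, x1   ; ADDS with an unused result
//   (setcc (and x0, x1), 0, eq)  ->  tst x0, x1   ; ANDS with an unused result
// In both cases only the NZCV output (value #1 of the SUBS/ADDS/ANDS node) is
// consumed by the caller.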
3285
3286/// \defgroup AArch64CCMP CMP;CCMP matching
3287///
3288/// These functions deal with the formation of CMP;CCMP;... sequences.
3289/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
3290/// a comparison. They set the NZCV flags to a predefined value if their
3291/// predicate is false. This allows to express arbitrary conjunctions, for
3292/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
3293/// expressed as:
3294/// cmp A
3295/// ccmp B, inv(CB), CA
3296/// check for CB flags
3297///
3298/// This naturally lets us implement chains of AND operations with SETCC
3299/// operands. And we can even implement some other situations by transforming
3300/// them:
3301/// - We can implement (NEG SETCC) i.e. negating a single comparison by
3302/// negating the flags used in a CCMP/FCCMP operations.
3303/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
3304/// by negating the flags we test for afterwards. i.e.
3305/// NEG (CMP CCMP CCCMP ...) can be implemented.
3306/// - Note that we can only ever negate all previously processed results.
3307/// What we can not implement by flipping the flags to test is a negation
3308/// of two sub-trees (because the negation affects all sub-trees emitted so
3309/// far, so the 2nd sub-tree we emit would also affect the first).
3310/// With those tools we can implement some OR operations:
3311/// - (OR (SETCC A) (SETCC B)) can be implemented via:
3312/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3313/// - After transforming OR to NEG/AND combinations we may be able to use NEG
3314/// elimination rules from earlier to implement the whole thing as a
3315/// CCMP/FCCMP chain.
3316///
3317/// As complete example:
3318/// or (or (setCA (cmp A)) (setCB (cmp B)))
3319/// (and (setCC (cmp C)) (setCD (cmp D)))"
3320/// can be reassociated to:
3321/// or (and (setCC (cmp C)) (setCD (cmp D)))
3322///    (or (setCA (cmp A)) (setCB (cmp B)))
3323/// can be transformed to:
3324/// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
3325/// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
3326/// which can be implemented as:
3327/// cmp C
3328/// ccmp D, inv(CD), CC
3329/// ccmp A, CA, inv(CD)
3330/// ccmp B, CB, inv(CA)
3331/// check for CB flags
3332///
3333/// A counterexample is "or (and A B) (and C D)" which translates to
3334/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))); we
3335/// can only implement one of the inner (not) operations, but not both!
3336/// @{
3337
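// As a concrete, illustrative example of the simplest case, a two-term
// conjunction of integer compares in a (hypothetical) function such as
//
//   bool f(int a, int b) { return a == 5 && b > 7; }
//
// is typically emitted as a single CMP/CCMP/CSET sequence along the lines of:
//
//   cmp  w0, #5
//   ccmp w1, #7, #4, eq  // if a == 5, compare b against 7; else force Z=1
//   cset w0, gt          //   so that the final "gt" test fails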
3338/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
3339static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
3340 ISD::CondCode CC, SDValue CCOp,
3341 AArch64CC::CondCode Predicate,
3342 AArch64CC::CondCode OutCC,
3343 const SDLoc &DL, SelectionDAG &DAG) {
3344 unsigned Opcode = 0;
3345 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3346
3347 if (LHS.getValueType().isFloatingPoint()) {
3348 assert(LHS.getValueType() != MVT::f128);
3349 if ((LHS.getValueType() == MVT::f16 && !FullFP16) ||
3350 LHS.getValueType() == MVT::bf16) {
3351 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3352 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3353 }
3354 Opcode = AArch64ISD::FCCMP;
3355 } else if (RHS.getOpcode() == ISD::SUB) {
3356 SDValue SubOp0 = RHS.getOperand(0);
3357 if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
3358 // See emitComparison() on why we can only do this for SETEQ and SETNE.
3359 Opcode = AArch64ISD::CCMN;
3360 RHS = RHS.getOperand(1);
3361 }
3362 }
3363 if (Opcode == 0)
3364 Opcode = AArch64ISD::CCMP;
3365
3366 SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
3367 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
3368 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
3369 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
3370 return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
3371}
3372
3373/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3374/// expressed as a conjunction. See \ref AArch64CCMP.
3375/// \param CanNegate Set to true if we can negate the whole sub-tree just by
3376/// changing the conditions on the SETCC tests.
3377/// (this means we can call emitConjunctionRec() with
3378/// Negate==true on this sub-tree)
3379/// \param MustBeFirst Set to true if this subtree needs to be negated and we
3380/// cannot do the negation naturally. We are required to
3381/// emit the subtree first in this case.
3382/// \param WillNegate Is true if we are called when the result of this
3383/// subexpression must be negated. This happens when the
3384/// outer expression is an OR. We can use this fact to know
3385/// that we have a double negation (or (or ...) ...) that
3386/// can be implemented for free.
3387static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
3388 bool &MustBeFirst, bool WillNegate,
3389 unsigned Depth = 0) {
3390 if (!Val.hasOneUse())
3391 return false;
3392 unsigned Opcode = Val->getOpcode();
3393 if (Opcode == ISD::SETCC) {
3394 if (Val->getOperand(0).getValueType() == MVT::f128)
3395 return false;
3396 CanNegate = true;
3397 MustBeFirst = false;
3398 return true;
3399 }
3400 // Protect against exponential runtime and stack overflow.
3401 if (Depth > 6)
3402 return false;
3403 if (Opcode == ISD::AND || Opcode == ISD::OR) {
3404 bool IsOR = Opcode == ISD::OR;
3405 SDValue O0 = Val->getOperand(0);
3406 SDValue O1 = Val->getOperand(1);
3407 bool CanNegateL;
3408 bool MustBeFirstL;
3409 if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
3410 return false;
3411 bool CanNegateR;
3412 bool MustBeFirstR;
3413 if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
3414 return false;
3415
3416 if (MustBeFirstL && MustBeFirstR)
3417 return false;
3418
3419 if (IsOR) {
3420 // For an OR expression we need to be able to naturally negate at least
3421 // one side or we cannot do the transformation at all.
3422 if (!CanNegateL && !CanNegateR)
3423 return false;
3424 // If the result of the OR will be negated and we can naturally negate
3425 // the leaves, then this sub-tree as a whole negates naturally.
3426 CanNegate = WillNegate && CanNegateL && CanNegateR;
3427 // If we cannot naturally negate the whole sub-tree, then this must be
3428 // emitted first.
3429 MustBeFirst = !CanNegate;
3430 } else {
3431 assert(Opcode == ISD::AND && "Must be OR or AND");
3432 // We cannot naturally negate an AND operation.
3433 CanNegate = false;
3434 MustBeFirst = MustBeFirstL || MustBeFirstR;
3435 }
3436 return true;
3437 }
3438 return false;
3439}
3440
3441/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
3442/// of CCMP/CFCMP ops. See @ref AArch64CCMP.
3443/// Tries to transform the given i1 producing node @p Val to a series of compare
3444/// and conditional compare operations. @returns an NZCV flags producing node
3445/// and sets @p OutCC to the flags that should be tested, or returns SDValue() if
3446/// the transformation was not possible.
3447/// \p Negate is true if we want this sub-tree being negated just by changing
3448/// SETCC conditions.
3449static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
3450 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
3451 AArch64CC::CondCode Predicate) {
3452 // We're at a tree leaf, produce a conditional comparison operation.
3453 unsigned Opcode = Val->getOpcode();
3454 if (Opcode == ISD::SETCC) {
3455 SDValue LHS = Val->getOperand(0);
3456 SDValue RHS = Val->getOperand(1);
3457 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
3458 bool isInteger = LHS.getValueType().isInteger();
3459 if (Negate)
3460 CC = getSetCCInverse(CC, LHS.getValueType());
3461 SDLoc DL(Val);
3462 // Determine OutCC and handle FP special case.
3463 if (isInteger) {
3464 OutCC = changeIntCCToAArch64CC(CC);
3465 } else {
3466 assert(LHS.getValueType().isFloatingPoint());
3467 AArch64CC::CondCode ExtraCC;
3468 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
3469 // Some floating point conditions can't be tested with a single condition
3470 // code. Construct an additional comparison in this case.
3471 if (ExtraCC != AArch64CC::AL) {
3472 SDValue ExtraCmp;
3473 if (!CCOp.getNode())
3474 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
3475 else
3476 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
3477 ExtraCC, DL, DAG);
3478 CCOp = ExtraCmp;
3479 Predicate = ExtraCC;
3480 }
3481 }
3482
3483 // Produce a normal comparison if we are first in the chain
3484 if (!CCOp)
3485 return emitComparison(LHS, RHS, CC, DL, DAG);
3486 // Otherwise produce a ccmp.
3487 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
3488 DAG);
3489 }
3490 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
3491
3492 bool IsOR = Opcode == ISD::OR;
3493
3494 SDValue LHS = Val->getOperand(0);
3495 bool CanNegateL;
3496 bool MustBeFirstL;
3497 bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
3498 assert(ValidL && "Valid conjunction/disjunction tree");
3499 (void)ValidL;
3500
3501 SDValue RHS = Val->getOperand(1);
3502 bool CanNegateR;
3503 bool MustBeFirstR;
3504 bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
3505 assert(ValidR && "Valid conjunction/disjunction tree");
3506 (void)ValidR;
3507
3508 // Swap sub-tree that must come first to the right side.
3509 if (MustBeFirstL) {
3510 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
3511 std::swap(LHS, RHS);
3512 std::swap(CanNegateL, CanNegateR);
3513 std::swap(MustBeFirstL, MustBeFirstR);
3514 }
3515
3516 bool NegateR;
3517 bool NegateAfterR;
3518 bool NegateL;
3519 bool NegateAfterAll;
3520 if (Opcode == ISD::OR) {
3521 // Swap the sub-tree that we can negate naturally to the left.
3522 if (!CanNegateL) {
3523 assert(CanNegateR && "at least one side must be negatable");
3524 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
3525 assert(!Negate);
3526 std::swap(LHS, RHS);
3527 NegateR = false;
3528 NegateAfterR = true;
3529 } else {
3530 // Negate the left sub-tree if possible, otherwise negate the result.
3531 NegateR = CanNegateR;
3532 NegateAfterR = !CanNegateR;
3533 }
3534 NegateL = true;
3535 NegateAfterAll = !Negate;
3536 } else {
3537 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
3538 assert(!Negate && "Valid conjunction/disjunction tree");
3539
3540 NegateL = false;
3541 NegateR = false;
3542 NegateAfterR = false;
3543 NegateAfterAll = false;
3544 }
3545
3546 // Emit sub-trees.
3547 AArch64CC::CondCode RHSCC;
3548 SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
3549 if (NegateAfterR)
3550 RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
3551 SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
3552 if (NegateAfterAll)
3553 OutCC = AArch64CC::getInvertedCondCode(OutCC);
3554 return CmpL;
3555}
3556
3557/// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
3558/// In some cases this is even possible with OR operations in the expression.
3559/// See \ref AArch64CCMP.
3560/// \see emitConjunctionRec().
3561static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
3562 AArch64CC::CondCode &OutCC) {
3563 bool DummyCanNegate;
3564 bool DummyMustBeFirst;
3565 if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
3566 return SDValue();
3567
3568 return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
3569}
3570
3571/// @}
3572
3573/// Returns how profitable it is to fold a comparison's operand's shift and/or
3574/// extension operations.
3575static unsigned getCmpOperandFoldingProfit(SDValue Op) {
3576 auto isSupportedExtend = [&](SDValue V) {
3577 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
3578 return true;
3579
3580 if (V.getOpcode() == ISD::AND)
3581 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
3582 uint64_t Mask = MaskCst->getZExtValue();
3583 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
3584 }
3585
3586 return false;
3587 };
3588
3589 if (!Op.hasOneUse())
3590 return 0;
3591
3592 if (isSupportedExtend(Op))
3593 return 1;
3594
3595 unsigned Opc = Op.getOpcode();
3596 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
3597 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
3598 uint64_t Shift = ShiftCst->getZExtValue();
3599 if (isSupportedExtend(Op.getOperand(0)))
3600 return (Shift <= 4) ? 2 : 1;
3601 EVT VT = Op.getValueType();
3602 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
3603 return 1;
3604 }
3605
3606 return 0;
3607}
3608
3609static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3610 SDValue &AArch64cc, SelectionDAG &DAG,
3611 const SDLoc &dl) {
3612 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
3613 EVT VT = RHS.getValueType();
3614 uint64_t C = RHSC->getZExtValue();
3615 if (!isLegalArithImmed(C)) {
3616 // Constant does not fit, try adjusting it by one?
3617 switch (CC) {
3618 default:
3619 break;
3620 case ISD::SETLT:
3621 case ISD::SETGE:
3622 if ((VT == MVT::i32 && C != 0x80000000 &&
3623 isLegalArithImmed((uint32_t)(C - 1))) ||
3624 (VT == MVT::i64 && C != 0x80000000ULL &&
3625 isLegalArithImmed(C - 1ULL))) {
3626 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
3627 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3628 RHS = DAG.getConstant(C, dl, VT);
3629 }
3630 break;
3631 case ISD::SETULT:
3632 case ISD::SETUGE:
3633 if ((VT == MVT::i32 && C != 0 &&
3634 isLegalArithImmed((uint32_t)(C - 1))) ||
3635 (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
3636 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
3637 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3638 RHS = DAG.getConstant(C, dl, VT);
3639 }
3640 break;
3641 case ISD::SETLE:
3642 case ISD::SETGT:
3643 if ((VT == MVT::i32 && C != INT32_MAX &&
3644 isLegalArithImmed((uint32_t)(C + 1))) ||
3645 (VT == MVT::i64 && C != INT64_MAX &&
3646 isLegalArithImmed(C + 1ULL))) {
3647 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
3648 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3649 RHS = DAG.getConstant(C, dl, VT);
3650 }
3651 break;
3652 case ISD::SETULE:
3653 case ISD::SETUGT:
3654 if ((VT == MVT::i32 && C != UINT32_MAX &&
3655 isLegalArithImmed((uint32_t)(C + 1))) ||
3656 (VT == MVT::i64 && C != UINT64_MAX &&
3657 isLegalArithImmed(C + 1ULL))) {
3658 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
3659 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3660 RHS = DAG.getConstant(C, dl, VT);
3661 }
3662 break;
3663 }
3664 }
3665 }
3666
3667 // Comparisons are canonicalized so that the RHS operand is simpler than the
3668 // LHS one, the extreme case being when RHS is an immediate. However, AArch64
3669 // can fold some shift+extend operations on the RHS operand, so swap the
3670 // operands if that can be done.
3671 //
3672 // For example:
3673 // lsl w13, w11, #1
3674 // cmp w13, w12
3675 // can be turned into:
3676 // cmp w12, w11, lsl #1
3677 if (!isa<ConstantSDNode>(RHS) || !isLegalArithImmed(RHS->getAsZExtVal())) {
3678 SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
3679
3680 if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
3681 std::swap(LHS, RHS);
3682 CC = ISD::getSetCCSwappedOperands(CC);
3683 }
3684 }
3685
3686 SDValue Cmp;
3687 AArch64CC::CondCode AArch64CC;
3688 if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
3689 const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
3690
3691 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
3692 // For the i8 operand, the largest immediate is 255, so this can be easily
3693 // encoded in the compare instruction. For the i16 operand, however, the
3694 // largest immediate cannot be encoded in the compare.
3695 // Therefore, use a sign extending load and cmn to avoid materializing the
3696 // -1 constant. For example,
3697 // movz w1, #65535
3698 // ldrh w0, [x0, #0]
3699 // cmp w0, w1
3700 // >
3701 // ldrsh w0, [x0, #0]
3702 // cmn w0, #1
3703 // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
3704 // if and only if (sext LHS) == (sext RHS). The checks are in place to
3705 // ensure both the LHS and RHS are truly zero extended and to make sure the
3706 // transformation is profitable.
3707 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
3708 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
3709 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
3710 LHS.getNode()->hasNUsesOfValue(1, 0)) {
3711 int16_t ValueofRHS = RHS->getAsZExtVal();
3712 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
3713 SDValue SExt =
3714 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
3715 DAG.getValueType(MVT::i16));
3716 Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
3717 RHS.getValueType()),
3718 CC, dl, DAG);
3719 AArch64CC = changeIntCCToAArch64CC(CC);
3720 }
3721 }
3722
3723 if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
3724 if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
3725 if ((CC == ISD::SETNE) ^ RHSC->isZero())
3726 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
3727 }
3728 }
3729 }
3730
3731 if (!Cmp) {
3732 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
3733 AArch64CC = changeIntCCToAArch64CC(CC);
3734 }
3735 AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
3736 return Cmp;
3737}
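
// As an illustration of the constant adjustment above: 4097 is not a legal
// arithmetic immediate, but "x < 4097" is equivalent to "x <= 4096", and 4096
// (0x1 shifted left by 12) is encodable, so a compare that would need
//
//   cmp w0, #4097 ; b.lt ...   // immediate cannot be encoded directly
//
// is instead emitted along the lines of
//
//   cmp w0, #4096 ; b.le ...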
3738
3739static std::pair<SDValue, SDValue>
3740getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
3741 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
3742 "Unsupported value type");
3743 SDValue Value, Overflow;
3744 SDLoc DL(Op);
3745 SDValue LHS = Op.getOperand(0);
3746 SDValue RHS = Op.getOperand(1);
3747 unsigned Opc = 0;
3748 switch (Op.getOpcode()) {
3749 default:
3750 llvm_unreachable("Unknown overflow instruction!");
3751 case ISD::SADDO:
3752 Opc = AArch64ISD::ADDS;
3753 CC = AArch64CC::VS;
3754 break;
3755 case ISD::UADDO:
3756 Opc = AArch64ISD::ADDS;
3757 CC = AArch64CC::HS;
3758 break;
3759 case ISD::SSUBO:
3760 Opc = AArch64ISD::SUBS;
3761 CC = AArch64CC::VS;
3762 break;
3763 case ISD::USUBO:
3764 Opc = AArch64ISD::SUBS;
3765 CC = AArch64CC::LO;
3766 break;
3767 // Multiply needs a little bit of extra work.
3768 case ISD::SMULO:
3769 case ISD::UMULO: {
3770 CC = AArch64CC::NE;
3771 bool IsSigned = Op.getOpcode() == ISD::SMULO;
3772 if (Op.getValueType() == MVT::i32) {
3773 // Extend to 64-bits, then perform a 64-bit multiply.
3774 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3775 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
3776 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
3777 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3778 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
3779
3780 // Check that the result fits into a 32-bit integer.
3781 SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC);
3782 if (IsSigned) {
3783 // cmp xreg, wreg, sxtw
3784 SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
3785 Overflow =
3786 DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
3787 } else {
3788 // tst xreg, #0xffffffff00000000
3789 SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
3790 Overflow =
3791 DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
3792 }
3793 break;
3794 }
3795 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
3796 // For the 64 bit multiply
3797 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3798 if (IsSigned) {
3799 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
3800 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
3801 DAG.getConstant(63, DL, MVT::i64));
3802 // It is important that LowerBits is last, otherwise the arithmetic
3803 // shift will not be folded into the compare (SUBS).
3804 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3805 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
3806 .getValue(1);
3807 } else {
3808 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
3809 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3810 Overflow =
3811 DAG.getNode(AArch64ISD::SUBS, DL, VTs,
3812 DAG.getConstant(0, DL, MVT::i64),
3813 UpperBits).getValue(1);
3814 }
3815 break;
3816 }
3817 } // switch (...)
3818
3819 if (Opc) {
3820 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
3821
3822 // Emit the AArch64 operation with overflow check.
3823 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
3824 Overflow = Value.getValue(1);
3825 }
3826 return std::make_pair(Value, Overflow);
3827}
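
// For example (schematic), a 32-bit signed add-with-overflow written with a
// hypothetical wrapper such as
//
//   bool f(int a, int b, int *r) { return __builtin_sadd_overflow(a, b, r); }
//
// maps onto the ADDS + V-flag pattern produced here, roughly:
//
//   adds w8, w0, w1
//   str  w8, [x2]
//   cset w0, vs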
3828
3829SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
3830 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
3831 !Subtarget->isNeonAvailable()))
3832 return LowerToScalableOp(Op, DAG);
3833
3834 SDValue Sel = Op.getOperand(0);
3835 SDValue Other = Op.getOperand(1);
3836 SDLoc dl(Sel);
3837
3838 // If the operand is an overflow checking operation, invert the condition
3839 // code and kill the Not operation. I.e., transform:
3840 // (xor (overflow_op_bool, 1))
3841 // -->
3842 // (csel 1, 0, invert(cc), overflow_op_bool)
3843 // ... which later gets transformed to just a cset instruction with an
3844 // inverted condition code, rather than a cset + eor sequence.
3845 if (isOneConstant(Other) && ISD::isOverflowIntrOpRes(Sel)) {
3846 // Only lower legal XALUO ops.
3847 if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
3848 return SDValue();
3849
3850 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3851 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3852 AArch64CC::CondCode CC;
3853 SDValue Value, Overflow;
3854 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
3855 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3856 return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
3857 CCVal, Overflow);
3858 }
3859 // If neither operand is a SELECT_CC, give up.
3860 if (Sel.getOpcode() != ISD::SELECT_CC)
3861 std::swap(Sel, Other);
3862 if (Sel.getOpcode() != ISD::SELECT_CC)
3863 return Op;
3864
3865 // The folding we want to perform is:
3866 // (xor x, (select_cc a, b, cc, 0, -1) )
3867 // -->
3868 // (csel x, (xor x, -1), cc ...)
3869 //
3870 // The latter will get matched to a CSINV instruction.
3871
3872 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
3873 SDValue LHS = Sel.getOperand(0);
3874 SDValue RHS = Sel.getOperand(1);
3875 SDValue TVal = Sel.getOperand(2);
3876 SDValue FVal = Sel.getOperand(3);
3877
3878 // FIXME: This could be generalized to non-integer comparisons.
3879 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
3880 return Op;
3881
3882 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
3883 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
3884
3885 // The values aren't constants, this isn't the pattern we're looking for.
3886 if (!CFVal || !CTVal)
3887 return Op;
3888
3889 // We can commute the SELECT_CC by inverting the condition. This
3890 // might be needed to make this fit into a CSINV pattern.
3891 if (CTVal->isAllOnes() && CFVal->isZero()) {
3892 std::swap(TVal, FVal);
3893 std::swap(CTVal, CFVal);
3894 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
3895 }
3896
3897 // If the constants line up, perform the transform!
3898 if (CTVal->isZero() && CFVal->isAllOnes()) {
3899 SDValue CCVal;
3900 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
3901
3902 FVal = Other;
3903 TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
3904 DAG.getConstant(-1ULL, dl, Other.getValueType()));
3905
3906 return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
3907 CCVal, Cmp);
3908 }
3909
3910 return Op;
3911}
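
// As an illustration of the SELECT_CC fold above, source like the following
// (hypothetical) function
//
//   unsigned f(unsigned x, int a, int b) { return x ^ (a < b ? 0u : ~0u); }
//
// becomes a compare plus a single conditional invert rather than csel + eor:
//
//   cmp   w1, w2
//   csinv w0, w0, w0, lt   // a < b ? x : ~x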
3912
3913// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
3914// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
3915// sets 'C' bit to 0.
3916static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) {
3917 SDLoc DL(Value);
3918 EVT VT = Value.getValueType();
3919 SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
3920 SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
3921 SDValue Cmp =
3922 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::Glue), Op0, Op1);
3923 return Cmp.getValue(1);
3924}
3925
3926// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
3927// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
3928static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG,
3929 bool Invert) {
3930 assert(Glue.getResNo() == 1);
3931 SDLoc DL(Glue);
3932 SDValue Zero = DAG.getConstant(0, DL, VT);
3933 SDValue One = DAG.getConstant(1, DL, VT);
3934 unsigned Cond = Invert ? AArch64CC::LO : AArch64CC::HS;
3935 SDValue CC = DAG.getConstant(Cond, DL, MVT::i32);
3936 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
3937}
3938
3939// Value is 1 if 'V' bit of NZCV is 1, else 0
3940static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG) {
3941 assert(Glue.getResNo() == 1);
3942 SDLoc DL(Glue);
3943 SDValue Zero = DAG.getConstant(0, DL, VT);
3944 SDValue One = DAG.getConstant(1, DL, VT);
3945 SDValue CC = DAG.getConstant(AArch64CC::VS, DL, MVT::i32);
3946 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
3947}
3948
3949// This lowering is inefficient, but it will get cleaned up by
3950// `foldOverflowCheck`
3951static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG,
3952 unsigned Opcode, bool IsSigned) {
3953 EVT VT0 = Op.getValue(0).getValueType();
3954 EVT VT1 = Op.getValue(1).getValueType();
3955
3956 if (VT0 != MVT::i32 && VT0 != MVT::i64)
3957 return SDValue();
3958
3959 bool InvertCarry = Opcode == AArch64ISD::SBCS;
3960 SDValue OpLHS = Op.getOperand(0);
3961 SDValue OpRHS = Op.getOperand(1);
3962 SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
3963
3964 SDLoc DL(Op);
3965 SDVTList VTs = DAG.getVTList(VT0, VT1);
3966
3967 SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::Glue), OpLHS,
3968 OpRHS, OpCarryIn);
3969
3970 SDValue OutFlag =
3971 IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
3972 : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
3973
3974 return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Sum, OutFlag);
3975}
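
// In practice this feeds multi-word arithmetic; e.g. a 128-bit addition built
// from ISD::UADDO / ISD::UADDO_CARRY typically ends up as the familiar
//
//   adds x0, x0, x2    // low halves, sets C
//   adcs x1, x1, x3    // high halves, consumes and re-produces C
//
// with any final carry-out recoverable via "cset xN, hs" if it is used.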
3976
3977static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
3978 // Let legalize expand this if it isn't a legal type yet.
3979 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
3980 return SDValue();
3981
3982 SDLoc dl(Op);
3983 AArch64CC::CondCode CC;
3984 // The actual operation that sets the overflow or carry flag.
3985 SDValue Value, Overflow;
3986 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
3987
3988 // We use 0 and 1 as false and true values.
3989 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3990 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3991
3992 // We use an inverted condition, because the conditional select is inverted
3993 // too. This will allow it to be selected to a single instruction:
3994 // CSINC Wd, WZR, WZR, invert(cond).
3995 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3996 Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
3997 CCVal, Overflow);
3998
3999 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
4000 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4001}
4002
4003// Prefetch operands are:
4004// 1: Address to prefetch
4005// 2: bool isWrite
4006// 3: int locality (0 = no locality ... 3 = extreme locality)
4007// 4: bool isDataCache
4008static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
4009 SDLoc DL(Op);
4010 unsigned IsWrite = Op.getConstantOperandVal(2);
4011 unsigned Locality = Op.getConstantOperandVal(3);
4012 unsigned IsData = Op.getConstantOperandVal(4);
4013
4014 bool IsStream = !Locality;
4015 // When the locality number is set
4016 if (Locality) {
4017 // The front-end should have filtered out the out-of-range values
4018 assert(Locality <= 3 && "Prefetch locality out-of-range");
4019 // The locality degree is the opposite of the cache speed.
4020 // Put the number the other way around.
4021 // The encoding starts at 0 for level 1
4022 Locality = 3 - Locality;
4023 }
4024
4025 // Build the mask value encoding the expected behavior.
4026 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
4027 (!IsData << 3) | // IsDataCache bit
4028 (Locality << 1) | // Cache level bits
4029 (unsigned)IsStream; // Stream bit
4030 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
4031 DAG.getTargetConstant(PrfOp, DL, MVT::i32),
4032 Op.getOperand(1));
4033}
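
// Working the encoding above through two common cases (illustrative):
//   __builtin_prefetch(p, /*rw=*/0, /*locality=*/3)
//     -> IsWrite=0, IsData=1, Locality=3-3=0, IsStream=0 -> PrfOp=0b00000
//     -> prfm pldl1keep, [x0]
//   __builtin_prefetch(p, /*rw=*/1, /*locality=*/0)
//     -> IsWrite=1, IsData=1, Locality=0, IsStream=1     -> PrfOp=0b10001
//     -> prfm pstl1strm, [x0]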
4034
4035SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
4036 SelectionDAG &DAG) const {
4037 EVT VT = Op.getValueType();
4038 if (VT.isScalableVector())
4039 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
4040
4041 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
4042 return LowerFixedLengthFPExtendToSVE(Op, DAG);
4043
4044 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
4045 return SDValue();
4046}
4047
4048SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
4049 SelectionDAG &DAG) const {
4050 EVT VT = Op.getValueType();
4051 if (VT.isScalableVector())
4052 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4053
4054 bool IsStrict = Op->isStrictFPOpcode();
4055 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4056 EVT SrcVT = SrcVal.getValueType();
4057 bool Trunc = Op.getConstantOperandVal(IsStrict ? 2 : 1) == 1;
4058
4059 if (useSVEForFixedLengthVectorVT(SrcVT, !Subtarget->isNeonAvailable()))
4060 return LowerFixedLengthFPRoundToSVE(Op, DAG);
4061
4062 // Expand cases where the result type is BF16 but we don't have hardware
4063 // instructions to lower it.
4064 if (VT.getScalarType() == MVT::bf16 &&
4065 !((Subtarget->hasNEON() || Subtarget->hasSME()) &&
4066 Subtarget->hasBF16())) {
4067 SDLoc dl(Op);
4068 SDValue Narrow = SrcVal;
4069 SDValue NaN;
4070 EVT I32 = SrcVT.changeElementType(MVT::i32);
4071 EVT F32 = SrcVT.changeElementType(MVT::f32);
4072 if (SrcVT.getScalarType() == MVT::f32) {
4073 bool NeverSNaN = DAG.isKnownNeverSNaN(Narrow);
4074 Narrow = DAG.getNode(ISD::BITCAST, dl, I32, Narrow);
4075 if (!NeverSNaN) {
4076 // Set the quiet bit.
4077 NaN = DAG.getNode(ISD::OR, dl, I32, Narrow,
4078 DAG.getConstant(0x400000, dl, I32));
4079 }
4080 } else if (SrcVT.getScalarType() == MVT::f64) {
4081 Narrow = DAG.getNode(AArch64ISD::FCVTXN, dl, F32, Narrow);
4082 Narrow = DAG.getNode(ISD::BITCAST, dl, I32, Narrow);
4083 } else {
4084 return SDValue();
4085 }
4086 if (!Trunc) {
4087 SDValue One = DAG.getConstant(1, dl, I32);
4088 SDValue Lsb = DAG.getNode(ISD::SRL, dl, I32, Narrow,
4089 DAG.getShiftAmountConstant(16, I32, dl));
4090 Lsb = DAG.getNode(ISD::AND, dl, I32, Lsb, One);
4091 SDValue RoundingBias =
4092 DAG.getNode(ISD::ADD, dl, I32, DAG.getConstant(0x7fff, dl, I32), Lsb);
4093 Narrow = DAG.getNode(ISD::ADD, dl, I32, Narrow, RoundingBias);
4094 }
4095
4096 // Don't round if we had a NaN, we don't want to turn 0x7fffffff into
4097 // 0x80000000.
4098 if (NaN) {
4099 SDValue IsNaN = DAG.getSetCC(
4100 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT),
4101 SrcVal, SrcVal, ISD::SETUO);
4102 Narrow = DAG.getSelect(dl, I32, IsNaN, NaN, Narrow);
4103 }
4104
4105 // Now that we have rounded, shift the bits into position.
4106 Narrow = DAG.getNode(ISD::SRL, dl, I32, Narrow,
4107 DAG.getShiftAmountConstant(16, I32, dl));
4108 if (VT.isVector()) {
4109 EVT I16 = I32.changeVectorElementType(MVT::i16);
4110 Narrow = DAG.getNode(ISD::TRUNCATE, dl, I16, Narrow);
4111 return DAG.getNode(ISD::BITCAST, dl, VT, Narrow);
4112 }
4113 Narrow = DAG.getNode(ISD::BITCAST, dl, F32, Narrow);
4114 SDValue Result = DAG.getTargetExtractSubreg(AArch64::hsub, dl, VT, Narrow);
4115 return IsStrict ? DAG.getMergeValues({Result, Op.getOperand(0)}, dl)
4116 : Result;
4117 }
4118
4119 if (SrcVT != MVT::f128) {
4120 // Expand cases where the input is a vector bigger than NEON.
4122 return SDValue();
4123
4124 // It's legal except when f128 is involved
4125 return Op;
4126 }
4127
4128 return SDValue();
4129}
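
// The f32 -> bf16 path above is the usual round-to-nearest-even plus
// NaN-quieting trick. As a scalar sketch (assuming a bit_cast helper; this is
// not code used by the lowering):
//
//   uint32_t Bits = bit_cast<uint32_t>(F);          // raw f32 bits
//   uint32_t Quiet = Bits | 0x400000;               // used only if F is NaN
//   Bits += 0x7fff + ((Bits >> 16) & 1);            // round to nearest even
//   uint16_t BF = (isnan(F) ? Quiet : Bits) >> 16;  // keep the high half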
4130
4131SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
4132 SelectionDAG &DAG) const {
4133 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4134 // Any additional optimization in this function should be recorded
4135 // in the cost tables.
4136 bool IsStrict = Op->isStrictFPOpcode();
4137 EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType();
4138 EVT VT = Op.getValueType();
4139
4140 if (VT.isScalableVector()) {
4141 unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
4142 ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
4143 : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
4144 return LowerToPredicatedOp(Op, DAG, Opcode);
4145 }
4146
4147 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4148 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4149 return LowerFixedLengthFPToIntToSVE(Op, DAG);
4150
4151 unsigned NumElts = InVT.getVectorNumElements();
4152
4153 // f16 conversions are promoted to f32 when full fp16 is not supported.
4154 if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4155 InVT.getVectorElementType() == MVT::bf16) {
4156 MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
4157 SDLoc dl(Op);
4158 if (IsStrict) {
4159 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NewVT, MVT::Other},
4160 {Op.getOperand(0), Op.getOperand(1)});
4161 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
4162 {Ext.getValue(1), Ext.getValue(0)});
4163 }
4164 return DAG.getNode(
4165 Op.getOpcode(), dl, Op.getValueType(),
4166 DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
4167 }
4168
4169 uint64_t VTSize = VT.getFixedSizeInBits();
4170 uint64_t InVTSize = InVT.getFixedSizeInBits();
4171 if (VTSize < InVTSize) {
4172 SDLoc dl(Op);
4173 if (IsStrict) {
4175 SDValue Cv = DAG.getNode(Op.getOpcode(), dl, {InVT, MVT::Other},
4176 {Op.getOperand(0), Op.getOperand(1)});
4177 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
4178 return DAG.getMergeValues({Trunc, Cv.getValue(1)}, dl);
4179 }
4180 SDValue Cv =
4181 DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
4182 Op.getOperand(0));
4183 return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
4184 }
4185
4186 if (VTSize > InVTSize) {
4187 SDLoc dl(Op);
4188 MVT ExtVT =
4189 MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
4190 VT.getVectorNumElements());
4191 if (IsStrict) {
4192 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {ExtVT, MVT::Other},
4193 {Op.getOperand(0), Op.getOperand(1)});
4194 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
4195 {Ext.getValue(1), Ext.getValue(0)});
4196 }
4197 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
4198 return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
4199 }
4200
4201 // Use a scalar operation for conversions between single-element vectors of
4202 // the same size.
4203 if (NumElts == 1) {
4204 SDLoc dl(Op);
4205 SDValue Extract = DAG.getNode(
4206 ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
4207 Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, dl, MVT::i64));
4208 EVT ScalarVT = VT.getScalarType();
4209 if (IsStrict)
4210 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
4211 {Op.getOperand(0), Extract});
4212 return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
4213 }
4214
4215 // Type changing conversions are illegal.
4216 return Op;
4217}
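
// Concretely (illustrative): a narrowing conversion such as v4f32 -> v4i16 is
// emitted as a full-width convert followed by a truncate,
//
//   fcvtzs v0.4s, v0.4s
//   xtn    v0.4h, v0.4s
//
// while a widening one such as v4f16 -> v4i32 extends first:
//
//   fcvtl  v0.4s, v0.4h
//   fcvtzs v0.4s, v0.4s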
4218
4219SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
4220 SelectionDAG &DAG) const {
4221 bool IsStrict = Op->isStrictFPOpcode();
4222 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4223
4224 if (SrcVal.getValueType().isVector())
4225 return LowerVectorFP_TO_INT(Op, DAG);
4226
4227 // f16 conversions are promoted to f32 when full fp16 is not supported.
4228 if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4229 SrcVal.getValueType() == MVT::bf16) {
4230 SDLoc dl(Op);
4231 if (IsStrict) {
4232 SDValue Ext =
4233 DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
4234 {Op.getOperand(0), SrcVal});
4235 return DAG.getNode(Op.getOpcode(), dl, {Op.getValueType(), MVT::Other},
4236 {Ext.getValue(1), Ext.getValue(0)});
4237 }
4238 return DAG.getNode(
4239 Op.getOpcode(), dl, Op.getValueType(),
4240 DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
4241 }
4242
4243 if (SrcVal.getValueType() != MVT::f128) {
4244 // It's legal except when f128 is involved
4245 return Op;
4246 }
4247
4248 return SDValue();
4249}
4250
4251SDValue
4252AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
4253 SelectionDAG &DAG) const {
4254 // AArch64 FP-to-int conversions saturate to the destination element size, so
4255 // we can lower common saturating conversions to simple instructions.
4256 SDValue SrcVal = Op.getOperand(0);
4257 EVT SrcVT = SrcVal.getValueType();
4258 EVT DstVT = Op.getValueType();
4259 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4260
4261 uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
4262 uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
4263 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4264 assert(SatWidth <= DstElementWidth &&
4265 "Saturation width cannot exceed result width");
4266
4267 // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
4268 // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
4269 // types, so this is hard to reach.
4270 if (DstVT.isScalableVector())
4271 return SDValue();
4272
4273 EVT SrcElementVT = SrcVT.getVectorElementType();
4274
4275 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4276 if ((SrcElementVT == MVT::f16 &&
4277 (!Subtarget->hasFullFP16() || DstElementWidth > 16)) ||
4278 SrcElementVT == MVT::bf16) {
4279 MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
4280 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), F32VT, SrcVal);
4281 SrcVT = F32VT;
4282 SrcElementVT = MVT::f32;
4283 SrcElementWidth = 32;
4284 } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
4285 SrcElementVT != MVT::f16 && SrcElementVT != MVT::bf16)
4286 return SDValue();
4287
4288 SDLoc DL(Op);
4289 // Cases that we can emit directly.
4290 if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth)
4291 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4292 DAG.getValueType(DstVT.getScalarType()));
4293
4294 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4295 // result. This is only valid if the legal cvt is larger than the saturate
4296 // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize
4297 // (at least until sqxtn is selected).
4298 if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
4299 return SDValue();
4300
4301 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
4302 SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
4303 DAG.getValueType(IntVT.getScalarType()));
4304 SDValue Sat;
4305 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4306 SDValue MinC = DAG.getConstant(
4307 APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4308 SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
4309 SDValue MaxC = DAG.getConstant(
4310 APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4311 Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
4312 } else {
4313 SDValue MinC = DAG.getConstant(
4314 APInt::getAllOnes(SatWidth).zext(SrcElementWidth), DL, IntVT);
4315 Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
4316 }
4317
4318 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4319}
4320
4321SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
4322 SelectionDAG &DAG) const {
4323 // AArch64 FP-to-int conversions saturate to the destination register size, so
4324 // we can lower common saturating conversions to simple instructions.
4325 SDValue SrcVal = Op.getOperand(0);
4326 EVT SrcVT = SrcVal.getValueType();
4327
4328 if (SrcVT.isVector())
4329 return LowerVectorFP_TO_INT_SAT(Op, DAG);
4330
4331 EVT DstVT = Op.getValueType();
4332 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4333 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4334 uint64_t DstWidth = DstVT.getScalarSizeInBits();
4335 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
4336
4337 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4338 if ((SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) || SrcVT == MVT::bf16) {
4339 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
4340 SrcVT = MVT::f32;
4341 } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16 &&
4342 SrcVT != MVT::bf16)
4343 return SDValue();
4344
4345 SDLoc DL(Op);
4346 // Cases that we can emit directly.
4347 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
4348 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
4349 DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
4350 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4351 DAG.getValueType(DstVT));
4352
4353 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4354 // result. This is only valid if the legal cvt is larger than the saturate
4355 // width.
4356 if (DstWidth < SatWidth)
4357 return SDValue();
4358
4359 SDValue NativeCvt =
4360 DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
4361 SDValue Sat;
4362 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4363 SDValue MinC = DAG.getConstant(
4364 APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT);
4365 SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
4366 SDValue MaxC = DAG.getConstant(
4367 APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
4368 Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
4369 } else {
4370 SDValue MinC = DAG.getConstant(
4371 APInt::getAllOnes(SatWidth).zext(DstWidth), DL, DstVT);
4372 Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
4373 }
4374
4375 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4376}
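
// For example (schematic): llvm.fptosi.sat.i32.f32 needs no extra code, since
// fcvtzs already saturates to the 32-bit destination range:
//
//   fcvtzs w0, s0
//
// A narrower request such as llvm.fptosi.sat.i16.f32 instead converts at the
// legal 32-bit width and then clamps the result to [-32768, 32767] with the
// SMIN/SMAX sequence built above before it is used.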
4377
4378SDValue AArch64TargetLowering::LowerVectorXRINT(SDValue Op,
4379 SelectionDAG &DAG) const {
4380 EVT VT = Op.getValueType();
4381 SDValue Src = Op.getOperand(0);
4382 SDLoc DL(Op);
4383
4384 assert(VT.isVector() && "Expected vector type");
4385
4386 EVT CastVT =
4387 VT.changeVectorElementType(Src.getValueType().getVectorElementType());
4388
4389 // Round the floating-point value into a floating-point register with the
4390 // current rounding mode.
4391 SDValue FOp = DAG.getNode(ISD::FRINT, DL, CastVT, Src);
4392
4393 // Truncate the rounded floating point to an integer.
4394 return DAG.getNode(ISD::FP_TO_SINT_SAT, DL, VT, FOp,
4395 DAG.getValueType(VT.getVectorElementType()));
4396}
4397
4398SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
4399 SelectionDAG &DAG) const {
4400 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4401 // Any additional optimization in this function should be recorded
4402 // in the cost tables.
4403 bool IsStrict = Op->isStrictFPOpcode();
4404 EVT VT = Op.getValueType();
4405 SDLoc dl(Op);
4406 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
4407 EVT InVT = In.getValueType();
4408 unsigned Opc = Op.getOpcode();
4409 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
4410
4411 if (VT.isScalableVector()) {
4412 if (InVT.getVectorElementType() == MVT::i1) {
4413 // We can't directly extend an SVE predicate; extend it first.
4414 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4415 EVT CastVT = getPromotedVTForPredicate(InVT);
4416 In = DAG.getNode(CastOpc, dl, CastVT, In);
4417 return DAG.getNode(Opc, dl, VT, In);
4418 }
4419
4420 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
4421 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
4422 return LowerToPredicatedOp(Op, DAG, Opcode);
4423 }
4424
4425 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4426 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4427 return LowerFixedLengthIntToFPToSVE(Op, DAG);
4428
4429 // Promote bf16 conversions to f32.
4430 if (VT.getVectorElementType() == MVT::bf16) {
4431 EVT F32 = VT.changeElementType(MVT::f32);
4432 if (IsStrict) {
4433 SDValue Val = DAG.getNode(Op.getOpcode(), dl, {F32, MVT::Other},
4434 {Op.getOperand(0), In});
4435 return DAG.getNode(
4436 ISD::STRICT_FP_ROUND, dl, {Op.getValueType(), MVT::Other},
4437 {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)});
4438 }
4439 return DAG.getNode(ISD::FP_ROUND, dl, Op.getValueType(),
4440 DAG.getNode(Op.getOpcode(), dl, F32, In),
4441 DAG.getIntPtrConstant(0, dl));
4442 }
4443
4444 uint64_t VTSize = VT.getFixedSizeInBits();
4445 uint64_t InVTSize = InVT.getFixedSizeInBits();
4446 if (VTSize < InVTSize) {
4447 MVT CastVT =
4448 MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
4449 InVT.getVectorNumElements());
4450 if (IsStrict) {
4451 In = DAG.getNode(Opc, dl, {CastVT, MVT::Other},
4452 {Op.getOperand(0), In});
4453 return DAG.getNode(
4454 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
4455 {In.getValue(1), In.getValue(0), DAG.getIntPtrConstant(0, dl)});
4456 }
4457 In = DAG.getNode(Opc, dl, CastVT, In);
4458 return DAG.getNode(ISD::FP_ROUND, dl, VT, In,
4459 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
4460 }
4461
4462 if (VTSize > InVTSize) {
4463 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4464 EVT CastVT = VT.changeVectorElementTypeToInteger();
4465 In = DAG.getNode(CastOpc, dl, CastVT, In);
4466 if (IsStrict)
4467 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op.getOperand(0), In});
4468 return DAG.getNode(Opc, dl, VT, In);
4469 }
4470
4471 // Use a scalar operation for conversions between single-element vectors of
4472 // the same size.
4473 if (VT.getVectorNumElements() == 1) {
4474 SDValue Extract = DAG.getNode(
4475 ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
4476 In, DAG.getConstant(0, dl, MVT::i64));
4477 EVT ScalarVT = VT.getScalarType();
4478 if (IsStrict)
4479 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
4480 {Op.getOperand(0), Extract});
4481 return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
4482 }
4483
4484 return Op;
4485}
4486
4487SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
4488 SelectionDAG &DAG) const {
4489 if (Op.getValueType().isVector())
4490 return LowerVectorINT_TO_FP(Op, DAG);
4491
4492 bool IsStrict = Op->isStrictFPOpcode();
4493 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4494
4495 bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP ||
4496 Op->getOpcode() == ISD::SINT_TO_FP;
4497
4498 auto IntToFpViaPromotion = [&](EVT PromoteVT) {
4499 SDLoc dl(Op);
4500 if (IsStrict) {
4501 SDValue Val = DAG.getNode(Op.getOpcode(), dl, {PromoteVT, MVT::Other},
4502 {Op.getOperand(0), SrcVal});
4503 return DAG.getNode(
4504 ISD::STRICT_FP_ROUND, dl, {Op.getValueType(), MVT::Other},
4505 {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)});
4506 }
4507 return DAG.getNode(ISD::FP_ROUND, dl, Op.getValueType(),
4508 DAG.getNode(Op.getOpcode(), dl, PromoteVT, SrcVal),
4509 DAG.getIntPtrConstant(0, dl));
4510 };
4511
4512 if (Op.getValueType() == MVT::bf16) {
4513 unsigned MaxWidth = IsSigned
4514 ? DAG.ComputeMaxSignificantBits(SrcVal)
4515 : DAG.computeKnownBits(SrcVal).countMaxActiveBits();
4516 // bf16 conversions are promoted to f32 when converting from i16.
4517 if (MaxWidth <= 24) {
4518 return IntToFpViaPromotion(MVT::f32);
4519 }
4520
4521 // bf16 conversions are promoted to f64 when converting from i32.
4522 if (MaxWidth <= 53) {
4523 return IntToFpViaPromotion(MVT::f64);
4524 }
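// A worked check of the thresholds above, assuming IEEE-754 binary32/binary64:
// f32 has a 24-bit significand, so e.g. 65535 (the largest u16) < 2^24 and every
// i16 converts to f32 exactly; f64 has a 53-bit significand, so 4294967295 (the
// largest u32) < 2^53 and every i32 converts to f64 exactly. In both cases the
// only inexact step is the final round to bf16.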
4525
4526 // We need to be careful about i64 -> bf16.
4527 // Consider, as a smaller-width analogy, the i32 value 22216703.
4528 // This number cannot be represented exactly as an f32, so an itofp will
4529 // turn it into 22216704.0; an fptrunc to bf16 then turns this into 22282240.0.
4530 // However, the correct bf16 was supposed to be 22151168.0
4531 // We need to use sticky rounding to get this correct.
4532 if (SrcVal.getValueType() == MVT::i64) {
4533 SDLoc DL(Op);
4534 // This algorithm is equivalent to the following:
4535 // uint64_t SrcHi = SrcVal & ~0xfffull;
4536 // uint64_t SrcLo = SrcVal & 0xfffull;
4537 // uint64_t Highest = SrcVal >> 53;
4538 // bool HasHighest = Highest != 0;
4539 // uint64_t ToRound = HasHighest ? SrcHi : SrcVal;
4540 // double Rounded = static_cast<double>(ToRound);
4541 // uint64_t RoundedBits = std::bit_cast<uint64_t>(Rounded);
4542 // uint64_t HasLo = SrcLo != 0;
4543 // bool NeedsAdjustment = HasHighest & HasLo;
4544 // uint64_t AdjustedBits = RoundedBits | uint64_t{NeedsAdjustment};
4545 // double Adjusted = std::bit_cast<double>(AdjustedBits);
4546 // return static_cast<__bf16>(Adjusted);
4547 //
4548 // Essentially, what happens is that SrcVal either fits perfectly in a
4549 // double-precision value or it is too big. If it is sufficiently small,
4550 // we should just go u64 -> double -> bf16 in a naive way. Otherwise, we
4551 // ensure that u64 -> double has no rounding error by only using the 52
4552 // MSB of the input. The low order bits will get merged into a sticky bit
4553 // which will avoid issues incurred by double rounding.
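// A worked example of the scheme above, assuming round-to-nearest-even at each
// step: let X = 2^60 + 2^53 + 2^52 - 1. bf16 values in this binade are spaced
// 2^53 apart, so the correctly rounded result is 2^60 + 2^53 (X sits one below
// the midpoint 2^60 + 2^53 + 2^52). A naive u64 -> f64 round (f64 spacing here
// is 2^8) lands exactly on that midpoint, and the subsequent f64 -> bf16 tie
// resolves to even, giving 2^60 + 2^54: one bf16 ulp too high. With the sticky
// scheme, ToRound = X & ~0xfff converts to f64 exactly, the discarded low bits
// force the adjustment, and the adjusted double stays below the midpoint, so
// the final round correctly produces 2^60 + 2^53.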
4554
4555 // Signed conversion is more or less like so:
4556 // copysign((__bf16)abs(SrcVal), SrcVal)
4557 SDValue SignBit;
4558 if (IsSigned) {
4559 SignBit = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
4560 DAG.getConstant(1ull << 63, DL, MVT::i64));
4561 SrcVal = DAG.getNode(ISD::ABS, DL, MVT::i64, SrcVal);
4562 }
4563 SDValue SrcHi = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
4564 DAG.getConstant(~0xfffull, DL, MVT::i64));
4565 SDValue SrcLo = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
4566 DAG.getConstant(0xfffull, DL, MVT::i64));
4567 SDValue Highest =
4568 DAG.getNode(ISD::SRL, DL, MVT::i64, SrcVal,
4569 DAG.getShiftAmountConstant(53, MVT::i64, DL));
4570 SDValue Zero64 = DAG.getConstant(0, DL, MVT::i64);
4571 SDValue ToRound =
4572 DAG.getSelectCC(DL, Highest, Zero64, SrcHi, SrcVal, ISD::SETNE);
4573 SDValue Rounded =
4574 IsStrict ? DAG.getNode(Op.getOpcode(), DL, {MVT::f64, MVT::Other},
4575 {Op.getOperand(0), ToRound})
4576 : DAG.getNode(Op.getOpcode(), DL, MVT::f64, ToRound);
4577
4578 SDValue RoundedBits = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Rounded);
4579 if (SignBit) {
4580 RoundedBits = DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, SignBit);
4581 }
4582
4583 SDValue HasHighest = DAG.getSetCC(
4584 DL,
4585 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
4586 Highest, Zero64, ISD::SETNE);
4587
4588 SDValue HasLo = DAG.getSetCC(
4589 DL,
4590 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
4591 SrcLo, Zero64, ISD::SETNE);
4592
4593 SDValue NeedsAdjustment =
4594 DAG.getNode(ISD::AND, DL, HasLo.getValueType(), HasHighest, HasLo);
4595 NeedsAdjustment = DAG.getZExtOrTrunc(NeedsAdjustment, DL, MVT::i64);
4596
4597 SDValue AdjustedBits =
4598 DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, NeedsAdjustment);
4599 SDValue Adjusted = DAG.getNode(ISD::BITCAST, DL, MVT::f64, AdjustedBits);
4600 return IsStrict
4601 ? DAG.getNode(ISD::STRICT_FP_ROUND, DL,
4602 {Op.getValueType(), MVT::Other},
4603 {Rounded.getValue(1), Adjusted,
4604 DAG.getIntPtrConstant(0, DL)})
4605 : DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(), Adjusted,
4606 DAG.getIntPtrConstant(0, DL, true));
4607 }
4608 }
4609
4610 // f16 conversions are promoted to f32 when full fp16 is not supported.
4611 if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
4612 return IntToFpViaPromotion(MVT::f32);
4613 }
4614
4615 // i128 conversions are libcalls.
4616 if (SrcVal.getValueType() == MVT::i128)
4617 return SDValue();
4618
4619 // Other conversions are legal, unless the result is the completely
4620 // software-based fp128.
4621 if (Op.getValueType() != MVT::f128)
4622 return Op;
4623 return SDValue();
4624}
4625
4626SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
4627 SelectionDAG &DAG) const {
4628 // For iOS, we want to call an alternative entry point: __sincos_stret,
4629 // which returns the values in two S / D registers.
4630 SDLoc dl(Op);
4631 SDValue Arg = Op.getOperand(0);
4632 EVT ArgVT = Arg.getValueType();
4633 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
4634
4635 ArgListTy Args;
4636 ArgListEntry Entry;
4637
4638 Entry.Node = Arg;
4639 Entry.Ty = ArgTy;
4640 Entry.IsSExt = false;
4641 Entry.IsZExt = false;
4642 Args.push_back(Entry);
4643
4644 RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
4645 : RTLIB::SINCOS_STRET_F32;
4646 const char *LibcallName = getLibcallName(LC);
4647 SDValue Callee =
4648 DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
4649
4650 StructType *RetTy = StructType::get(ArgTy, ArgTy);
4651 TargetLowering::CallLoweringInfo CLI(DAG);
4652 CLI.setDebugLoc(dl)
4653 .setChain(DAG.getEntryNode())
4654 .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
4655
4656 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
4657 return CallResult.first;
4658}
4659
4660static MVT getSVEContainerType(EVT ContentTy);
4661
4662SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
4663 SelectionDAG &DAG) const {
4664 EVT OpVT = Op.getValueType();
4665 EVT ArgVT = Op.getOperand(0).getValueType();
4666
4667 if (useSVEForFixedLengthVectorVT(OpVT))
4668 return LowerFixedLengthBitcastToSVE(Op, DAG);
4669
4670 if (OpVT.isScalableVector()) {
4671 // Bitcasting between unpacked vector types of different element counts is
4672 // not a NOP because the live elements are laid out differently.
4673 // 01234567
4674 // e.g. nxv2i32 = XX??XX??
4675 // nxv4f16 = X?X?X?X?
4676 if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
4677 return SDValue();
4678
4679 if (isTypeLegal(OpVT) && !isTypeLegal(ArgVT)) {
4680 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
4681 "Expected int->fp bitcast!");
4682 SDValue ExtResult =
4683 DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT),
4684 Op.getOperand(0));
4685 return getSVESafeBitCast(OpVT, ExtResult, DAG);
4686 }
4687 return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
4688 }
4689
4690 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
4691 return SDValue();
4692
4693 // Bitcasts between f16 and bf16 are legal.
4694 if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
4695 return Op;
4696
4697 assert(ArgVT == MVT::i16);
4698 SDLoc DL(Op);
4699
4700 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
4701 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
4702 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, OpVT, Op);
4703}
4704
4705static EVT getExtensionTo64Bits(const EVT &OrigVT) {
4706 if (OrigVT.getSizeInBits() >= 64)
4707 return OrigVT;
4708
4709 assert(OrigVT.isSimple() && "Expecting a simple value type");
4710
4711 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
4712 switch (OrigSimpleTy) {
4713 default: llvm_unreachable("Unexpected Vector Type");
4714 case MVT::v2i8:
4715 case MVT::v2i16:
4716 return MVT::v2i32;
4717 case MVT::v4i8:
4718 return MVT::v4i16;
4719 }
4720}
4721
4722 static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
4723 const EVT &OrigTy,
4724 const EVT &ExtTy,
4725 unsigned ExtOpcode) {
4726 // The vector originally had a size of OrigTy. It was then extended to ExtTy.
4727 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
4728 // 64-bits we need to insert a new extension so that it will be 64-bits.
4729 assert(ExtTy.is128BitVector() && "Unexpected extension size");
4730 if (OrigTy.getSizeInBits() >= 64)
4731 return N;
4732
4733 // Must extend size to at least 64 bits to be used as an operand for VMULL.
4734 EVT NewVT = getExtensionTo64Bits(OrigTy);
4735
4736 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
4737}
4738
4739// Returns lane if Op extracts from a two-element vector and lane is constant
4740// (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
4741static std::optional<uint64_t>
4742 getConstantLaneNumOfExtractHalfOperand(SDValue &Op) {
4743 SDNode *OpNode = Op.getNode();
4744 if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
4745 return std::nullopt;
4746
4747 EVT VT = OpNode->getOperand(0).getValueType();
4748 ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpNode->getOperand(1));
4749 if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
4750 return std::nullopt;
4751
4752 return C->getZExtValue();
4753}
4754
4755 static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG,
4756 bool isSigned) {
4757 EVT VT = N.getValueType();
4758
4759 if (N.getOpcode() != ISD::BUILD_VECTOR)
4760 return false;
4761
4762 for (const SDValue &Elt : N->op_values()) {
4763 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
4764 unsigned EltSize = VT.getScalarSizeInBits();
4765 unsigned HalfSize = EltSize / 2;
4766 if (isSigned) {
4767 if (!isIntN(HalfSize, C->getSExtValue()))
4768 return false;
4769 } else {
4770 if (!isUIntN(HalfSize, C->getZExtValue()))
4771 return false;
4772 }
4773 continue;
4774 }
4775 return false;
4776 }
4777
4778 return true;
4779}
4780
4781 static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG) {
4782 EVT VT = N.getValueType();
4783 assert(VT.is128BitVector() && "Unexpected vector MULL size");
4784
4785 unsigned NumElts = VT.getVectorNumElements();
4786 unsigned OrigEltSize = VT.getScalarSizeInBits();
4787 unsigned EltSize = OrigEltSize / 2;
4788 MVT TruncVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
4789
4790 APInt HiBits = APInt::getHighBitsSet(OrigEltSize, EltSize);
4791 if (DAG.MaskedValueIsZero(N, HiBits))
4792 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), TruncVT, N);
4793
4794 if (ISD::isExtOpcode(N.getOpcode()))
4795 return addRequiredExtensionForVectorMULL(N.getOperand(0), DAG,
4796 N.getOperand(0).getValueType(), VT,
4797 N.getOpcode());
4798
4799 assert(N.getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
4800 SDLoc dl(N);
4801 SmallVector<SDValue, 8> Ops;
4802 for (unsigned i = 0; i != NumElts; ++i) {
4803 const APInt &CInt = N.getConstantOperandAPInt(i);
4804 // Element types smaller than 32 bits are not legal, so use i32 elements.
4805 // The values are implicitly truncated so sext vs. zext doesn't matter.
4806 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
4807 }
4808 return DAG.getBuildVector(TruncVT, dl, Ops);
4809}
4810
4811 static bool isSignExtended(SDValue N, SelectionDAG &DAG) {
4812 return N.getOpcode() == ISD::SIGN_EXTEND ||
4813 N.getOpcode() == ISD::ANY_EXTEND ||
4814 isExtendedBUILD_VECTOR(N, DAG, true);
4815}
4816
4817 static bool isZeroExtended(SDValue N, SelectionDAG &DAG) {
4818 return N.getOpcode() == ISD::ZERO_EXTEND ||
4819 N.getOpcode() == ISD::ANY_EXTEND ||
4820 isExtendedBUILD_VECTOR(N, DAG, false);
4821}
4822
4823 static bool isAddSubSExt(SDValue N, SelectionDAG &DAG) {
4824 unsigned Opcode = N.getOpcode();
4825 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
4826 SDValue N0 = N.getOperand(0);
4827 SDValue N1 = N.getOperand(1);
4828 return N0->hasOneUse() && N1->hasOneUse() &&
4829 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
4830 }
4831 return false;
4832}
4833
4834 static bool isAddSubZExt(SDValue N, SelectionDAG &DAG) {
4835 unsigned Opcode = N.getOpcode();
4836 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
4837 SDValue N0 = N.getOperand(0);
4838 SDValue N1 = N.getOperand(1);
4839 return N0->hasOneUse() && N1->hasOneUse() &&
4840 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
4841 }
4842 return false;
4843}
4844
4845SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op,
4846 SelectionDAG &DAG) const {
4847 // The rounding mode is in bits 23:22 of the FPCR.
4848 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0.
4849 // The formula we use to implement this is ((FPCR + (1 << 22)) >> 22) & 3
4850 // so that the shift + and get folded into a bitfield extract.
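// A worked check of the formula: with FPCR.RMode == 0b11 (round toward zero),
// ((FPCR + (1 << 22)) >> 22) & 3 == (3 + 1) & 3 == 0, the FLT_ROUNDS encoding
// for round-toward-zero, while RMode == 0b00 (round to nearest) yields 1,
// matching the 0->1, 1->2, 2->3, 3->0 mapping above.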
4851 SDLoc dl(Op);
4852
4853 SDValue Chain = Op.getOperand(0);
4854 SDValue FPCR_64 = DAG.getNode(
4855 ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other},
4856 {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)});
4857 Chain = FPCR_64.getValue(1);
4858 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
4859 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
4860 DAG.getConstant(1U << 22, dl, MVT::i32));
4861 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
4862 DAG.getConstant(22, dl, MVT::i32));
4863 SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
4864 DAG.getConstant(3, dl, MVT::i32));
4865 return DAG.getMergeValues({AND, Chain}, dl);
4866}
4867
4868SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
4869 SelectionDAG &DAG) const {
4870 SDLoc DL(Op);
4871 SDValue Chain = Op->getOperand(0);
4872 SDValue RMValue = Op->getOperand(1);
4873
4874 // The rounding mode is in bits 23:22 of the FPCR.
4875 // The llvm.set.rounding argument value to the rounding mode in FPCR mapping
4876 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
4877 // ((arg - 1) & 3) << 22).
4878 //
4879 // The argument of llvm.set.rounding must be within the segment [0, 3], so
4880 // NearestTiesToAway (4) is not handled here. It is the responsibility of the
4881 // code that generated the llvm.set.rounding call to ensure this condition.
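// A worked check of the formula: an argument of 0 (round toward zero) gives
// ((0 - 1) & 3) << 22 == 3 << 22, i.e. FPCR.RMode == 0b11, while an argument
// of 1 (round to nearest) gives 0 << 22, i.e. RMode == 0b00, inverting the
// mapping used by LowerGET_ROUNDING above.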
4882
4883 // Calculate new value of FPCR[23:22].
4884 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
4885 DAG.getConstant(1, DL, MVT::i32));
4886 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
4887 DAG.getConstant(0x3, DL, MVT::i32));
4888 RMValue =
4889 DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
4890 DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
4891 RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
4892
4893 // Get current value of FPCR.
4894 SDValue Ops[] = {
4895 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
4896 SDValue FPCR =
4897 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
4898 Chain = FPCR.getValue(1);
4899 FPCR = FPCR.getValue(0);
4900
4901 // Put the new rounding mode into FPCR[23:22].
4902 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
4903 FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
4904 DAG.getConstant(RMMask, DL, MVT::i64));
4905 FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
4906 SDValue Ops2[] = {
4907 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
4908 FPCR};
4909 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
4910}
4911
4912SDValue AArch64TargetLowering::LowerGET_FPMODE(SDValue Op,
4913 SelectionDAG &DAG) const {
4914 SDLoc DL(Op);
4915 SDValue Chain = Op->getOperand(0);
4916
4917 // Get current value of FPCR.
4918 SDValue Ops[] = {
4919 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
4920 SDValue FPCR =
4921 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
4922 Chain = FPCR.getValue(1);
4923 FPCR = FPCR.getValue(0);
4924
4925 // Truncate FPCR to 32 bits.
4926 SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR);
4927
4928 return DAG.getMergeValues({Result, Chain}, DL);
4929}
4930
4931SDValue AArch64TargetLowering::LowerSET_FPMODE(SDValue Op,
4932 SelectionDAG &DAG) const {
4933 SDLoc DL(Op);
4934 SDValue Chain = Op->getOperand(0);
4935 SDValue Mode = Op->getOperand(1);
4936
4937 // Extend the specified value to 64 bits.
4938 SDValue FPCR = DAG.getZExtOrTrunc(Mode, DL, MVT::i64);
4939
4940 // Set new value of FPCR.
4941 SDValue Ops2[] = {
4942 Chain, DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), FPCR};
4943 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
4944}
4945
4946SDValue AArch64TargetLowering::LowerRESET_FPMODE(SDValue Op,
4947 SelectionDAG &DAG) const {
4948 SDLoc DL(Op);
4949 SDValue Chain = Op->getOperand(0);
4950
4951 // Get current value of FPCR.
4952 SDValue Ops[] = {
4953 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
4954 SDValue FPCR =
4955 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
4956 Chain = FPCR.getValue(1);
4957 FPCR = FPCR.getValue(0);
4958
4959 // Clear bits that are not reserved.
4960 SDValue FPSCRMasked = DAG.getNode(
4961 ISD::AND, DL, MVT::i64, FPCR,
4962 DAG.getConstant(AArch64::ReservedFPControlBits, DL, MVT::i64));
4963
4964 // Set new value of FPCR.
4965 SDValue Ops2[] = {Chain,
4966 DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
4967 FPSCRMasked};
4968 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
4969}
4970
4971static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
4972 SDLoc DL, bool &IsMLA) {
4973 bool IsN0SExt = isSignExtended(N0, DAG);
4974 bool IsN1SExt = isSignExtended(N1, DAG);
4975 if (IsN0SExt && IsN1SExt)
4976 return AArch64ISD::SMULL;
4977
4978 bool IsN0ZExt = isZeroExtended(N0, DAG);
4979 bool IsN1ZExt = isZeroExtended(N1, DAG);
4980
4981 if (IsN0ZExt && IsN1ZExt)
4982 return AArch64ISD::UMULL;
4983
4984 // Select SMULL if we can replace zext with sext.
4985 if (((IsN0SExt && IsN1ZExt) || (IsN0ZExt && IsN1SExt)) &&
4986 !isExtendedBUILD_VECTOR(N0, DAG, false) &&
4987 !isExtendedBUILD_VECTOR(N1, DAG, false)) {
4988 SDValue ZextOperand;
4989 if (IsN0ZExt)
4990 ZextOperand = N0.getOperand(0);
4991 else
4992 ZextOperand = N1.getOperand(0);
4993 if (DAG.SignBitIsZero(ZextOperand)) {
4994 SDValue NewSext =
4995 DAG.getSExtOrTrunc(ZextOperand, DL, N0.getValueType());
4996 if (IsN0ZExt)
4997 N0 = NewSext;
4998 else
4999 N1 = NewSext;
5000 return AArch64ISD::SMULL;
5001 }
5002 }
5003
5004 // Select UMULL if we can replace the other operand with an extend.
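// For instance, if N0 is (zext v8i8 X to v8i16) and N1 is
// (and v8i16 Y, splat(0x00FF)), MaskedValueIsZero proves N1's top half is
// known zero, so N1 behaves like a zero-extended value and UMULL still applies.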
5005 if (IsN0ZExt || IsN1ZExt) {
5006 EVT VT = N0.getValueType();
5007 APInt Mask = APInt::getHighBitsSet(VT.getScalarSizeInBits(),
5008 VT.getScalarSizeInBits() / 2);
5009 if (DAG.MaskedValueIsZero(IsN0ZExt ? N1 : N0, Mask))
5010 return AArch64ISD::UMULL;
5011 }
5012
5013 if (!IsN1SExt && !IsN1ZExt)
5014 return 0;
5015
5016 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
5017 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
5018 if (IsN1SExt && isAddSubSExt(N0, DAG)) {
5019 IsMLA = true;
5020 return AArch64ISD::SMULL;
5021 }
5022 if (IsN1ZExt && isAddSubZExt(N0, DAG)) {
5023 IsMLA = true;
5024 return AArch64ISD::UMULL;
5025 }
5026 if (IsN0ZExt && isAddSubZExt(N1, DAG)) {
5027 std::swap(N0, N1);
5028 IsMLA = true;
5029 return AArch64ISD::UMULL;
5030 }
5031 return 0;
5032}
5033
5034SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
5035 EVT VT = Op.getValueType();
5036
5037 bool OverrideNEON = !Subtarget->isNeonAvailable();
5038 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
5039 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5040
5041 // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so
5042 // that VMULL can be detected. Otherwise v2i64 multiplications are not legal.
5043 assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() &&
5044 "unexpected type for custom-lowering ISD::MUL");
5045 SDValue N0 = Op.getOperand(0);
5046 SDValue N1 = Op.getOperand(1);
5047 bool isMLA = false;
5048 EVT OVT = VT;
5049 if (VT.is64BitVector()) {
5050 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5051 isNullConstant(N0.getOperand(1)) &&
5052 N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5053 isNullConstant(N1.getOperand(1))) {
5054 N0 = N0.getOperand(0);
5055 N1 = N1.getOperand(0);
5056 VT = N0.getValueType();
5057 } else {
5058 if (VT == MVT::v1i64) {
5059 if (Subtarget->hasSVE())
5060 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5061 // Fall through to expand this. It is not legal.
5062 return SDValue();
5063 } else
5064 // Other vector multiplications are legal.
5065 return Op;
5066 }
5067 }
5068
5069 SDLoc DL(Op);
5070 unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, isMLA);
5071
5072 if (!NewOpc) {
5073 if (VT.getVectorElementType() == MVT::i64) {
5074 // If SVE is available then i64 vector multiplications can also be made
5075 // legal.
5076 if (Subtarget->hasSVE())
5077 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5078 // Fall through to expand this. It is not legal.
5079 return SDValue();
5080 } else
5081 // Other vector multiplications are legal.
5082 return Op;
5083 }
5084
5085 // Legalize to a S/UMULL instruction
5086 SDValue Op0;
5087 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
5088 if (!isMLA) {
5089 Op0 = skipExtensionForVectorMULL(N0, DAG);
5090 assert(Op0.getValueType().is64BitVector() &&
5091 Op1.getValueType().is64BitVector() &&
5092 "unexpected types for extended operands to VMULL");
5093 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OVT,
5094 DAG.getNode(NewOpc, DL, VT, Op0, Op1),
5095 DAG.getConstant(0, DL, MVT::i64));
5096 }
5097 // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
5098 // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
5099 // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
5100 SDValue N00 = skipExtensionForVectorMULL(N0.getOperand(0), DAG);
5101 SDValue N01 = skipExtensionForVectorMULL(N0.getOperand(1), DAG);
5102 EVT Op1VT = Op1.getValueType();
5103 return DAG.getNode(
5104 ISD::EXTRACT_SUBVECTOR, DL, OVT,
5105 DAG.getNode(N0.getOpcode(), DL, VT,
5106 DAG.getNode(NewOpc, DL, VT,
5107 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
5108 DAG.getNode(NewOpc, DL, VT,
5109 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)),
5110 DAG.getConstant(0, DL, MVT::i64));
5111}
5112
5113static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
5114 int Pattern) {
5115 if (VT == MVT::nxv1i1 && Pattern == AArch64SVEPredPattern::all)
5116 return DAG.getConstant(1, DL, MVT::nxv1i1);
5117 return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
5118 DAG.getTargetConstant(Pattern, DL, MVT::i32));
5119}
5120
5121 static SDValue optimizeIncrementingWhile(SDValue Op, SelectionDAG &DAG,
5122 bool IsSigned, bool IsEqual) {
5123 if (!isa<ConstantSDNode>(Op.getOperand(1)) ||
5124 !isa<ConstantSDNode>(Op.getOperand(2)))
5125 return SDValue();
5126
5127 SDLoc dl(Op);
5128 APInt X = Op.getConstantOperandAPInt(1);
5129 APInt Y = Op.getConstantOperandAPInt(2);
5130 bool Overflow;
5131 APInt NumActiveElems =
5132 IsSigned ? Y.ssub_ov(X, Overflow) : Y.usub_ov(X, Overflow);
5133
5134 if (Overflow)
5135 return SDValue();
5136
5137 if (IsEqual) {
5138 APInt One(NumActiveElems.getBitWidth(), 1, IsSigned);
5139 NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(One, Overflow)
5140 : NumActiveElems.uadd_ov(One, Overflow);
5141 if (Overflow)
5142 return SDValue();
5143 }
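// For example, whilelo with constant bounds X = 3 and Y = 7 produces Y - X = 4
// active elements, and the inclusive whilels/whilele forms add one more for a
// total of 5; that count is then matched against a fixed SVE predicate pattern
// (e.g. VL4 or VL5) below.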
5144
5145 std::optional<unsigned> PredPattern =
5146 getSVEPredPatternFromNumElements(NumActiveElems.getZExtValue());
5147 unsigned MinSVEVectorSize = std::max(
5148 DAG.getSubtarget<AArch64Subtarget>().getMinSVEVectorSizeInBits(), 128u);
5149 unsigned ElementSize = 128 / Op.getValueType().getVectorMinNumElements();
5150 if (PredPattern != std::nullopt &&
5151 NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize))
5152 return getPTrue(DAG, dl, Op.getValueType(), *PredPattern);
5153
5154 return SDValue();
5155}
5156
5157// Returns a safe bitcast between two scalable vector predicates, where
5158// any newly created lanes from a widening bitcast are defined as zero.
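// For example, reinterpreting an nxv2i1 as nxv16i1 places each original
// predicate lane at every 8th lane of the wider type; the seven lanes in
// between are newly created and are cleared by AND-ing with an all-true
// nxv2i1 mask reinterpreted in the same way.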
5159 static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
5160 SDLoc DL(Op);
5161 EVT InVT = Op.getValueType();
5162
5163 assert(InVT.getVectorElementType() == MVT::i1 &&
5164 VT.getVectorElementType() == MVT::i1 &&
5165 "Expected a predicate-to-predicate bitcast");
5166 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5167 InVT.isScalableVector() &&
5168 DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
5169 "Only expect to cast between legal scalable predicate types!");
5170
5171 // Return the operand if the cast isn't changing type,
5172 // e.g. <n x 16 x i1> -> <n x 16 x i1>
5173 if (InVT == VT)
5174 return Op;
5175
5176 SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
5177
5178 // We only have to zero the lanes if new lanes are being defined, e.g. when
5179 // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
5180 // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
5181 // we can return here.
5182 if (InVT.bitsGT(VT))
5183 return Reinterpret;
5184
5185 // Check if the other lanes are already known to be zeroed by
5186 // construction.
5187 if (isZeroingInactiveLanes(Op))
5188 return Reinterpret;
5189
5190 // Zero the newly introduced lanes.
5191 SDValue Mask = DAG.getConstant(1, DL, InVT);
5192 Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask);
5193 return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask);
5194}
5195
5196SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
5197 SDValue Chain, SDLoc DL,
5198 EVT VT) const {
5199 SDValue Callee = DAG.getExternalSymbol("__arm_sme_state",
5200 getPointerTy(DAG.getDataLayout()));
5201 Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
5202 Type *RetTy = StructType::get(Int64Ty, Int64Ty);
5203 TargetLowering::CallLoweringInfo CLI(DAG);
5204 ArgListTy Args;
5205 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
5206 CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2,
5207 RetTy, Callee, std::move(Args));
5208 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5209 SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
5210 return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
5211 Mask);
5212}
5213
5214// Lower an SME LDR/STR ZA intrinsic
5215// Case 1: If the vector number (vecnum) is an immediate in range, it gets
5216// folded into the instruction
5217// ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
5218// Case 2: If the vecnum is not an immediate, then it is used to modify the base
5219// and tile slice registers
5220// ldr(%tileslice, %ptr, %vecnum)
5221// ->
5222// %svl = rdsvl
5223// %ptr2 = %ptr + %svl * %vecnum
5224// %tileslice2 = %tileslice + %vecnum
5225// ldr [%tileslice2, 0], [%ptr2, 0]
5226// Case 3: If the vecnum is an immediate out of range, then the same is done as
5227// case 2, but the base and slice registers are modified by the greatest
5228// multiple of 15 lower than the vecnum and the remainder is folded into the
5229// instruction. This means that successive loads and stores that are offset from
5230// each other can share the same base and slice register updates.
5231// ldr(%tileslice, %ptr, 22)
5232// ldr(%tileslice, %ptr, 23)
5233// ->
5234// %svl = rdsvl
5235// %ptr2 = %ptr + %svl * 15
5236// %tileslice2 = %tileslice + 15
5237// ldr [%tileslice2, 7], [%ptr2, 7]
5238// ldr [%tileslice2, 8], [%ptr2, 8]
5239// Case 4: If the vecnum is an add of an immediate, then the non-immediate
5240// operand and the immediate can be folded into the instruction, like case 2.
5241// ldr(%tileslice, %ptr, %vecnum + 7)
5242// ldr(%tileslice, %ptr, %vecnum + 8)
5243// ->
5244// %svl = rdsvl
5245// %ptr2 = %ptr + %svl * %vecnum
5246// %tileslice2 = %tileslice + %vecnum
5247// ldr [%tileslice2, 7], [%ptr2, 7]
5248// ldr [%tileslice2, 8], [%ptr2, 8]
5249// Case 5: The vecnum being an add of an immediate out of range is also handled,
5250// in which case the same remainder logic as case 3 is used.
5251 static SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) {
5252 SDLoc DL(N);
5253
5254 SDValue TileSlice = N->getOperand(2);
5255 SDValue Base = N->getOperand(3);
5256 SDValue VecNum = N->getOperand(4);
5257 int32_t ConstAddend = 0;
5258 SDValue VarAddend = VecNum;
5259
5260 // If the vnum is an add of an immediate, we can fold it into the instruction
5261 if (VecNum.getOpcode() == ISD::ADD &&
5262 isa<ConstantSDNode>(VecNum.getOperand(1))) {
5263 ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue();
5264 VarAddend = VecNum.getOperand(0);
5265 } else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) {
5266 ConstAddend = ImmNode->getSExtValue();
5267 VarAddend = SDValue();
5268 }
5269
5270 int32_t ImmAddend = ConstAddend % 16;
5271 if (int32_t C = (ConstAddend - ImmAddend)) {
5272 SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32);
5273 VarAddend = VarAddend
5274 ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal})
5275 : CVal;
5276 }
5277
5278 if (VarAddend) {
5279 // Get the vector length that will be multiplied by vnum
5280 auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
5281 DAG.getConstant(1, DL, MVT::i32));
5282
5283 // Multiply SVL and vnum then add it to the base
5284 SDValue Mul = DAG.getNode(
5285 ISD::MUL, DL, MVT::i64,
5286 {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)});
5287 Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul});
5288 // Just add vnum to the tileslice
5289 TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend});
5290 }
5291
5292 return DAG.getNode(IsLoad ? AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR,
5293 DL, MVT::Other,
5294 {/*Chain=*/N.getOperand(0), TileSlice, Base,
5295 DAG.getTargetConstant(ImmAddend, DL, MVT::i32)});
5296}
5297
5298SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
5299 SelectionDAG &DAG) const {
5300 unsigned IntNo = Op.getConstantOperandVal(1);
5301 SDLoc DL(Op);
5302 switch (IntNo) {
5303 default:
5304 return SDValue(); // Don't custom lower most intrinsics.
5305 case Intrinsic::aarch64_prefetch: {
5306 SDValue Chain = Op.getOperand(0);
5307 SDValue Addr = Op.getOperand(2);
5308
5309 unsigned IsWrite = Op.getConstantOperandVal(3);
5310 unsigned Locality = Op.getConstantOperandVal(4);
5311 unsigned IsStream = Op.getConstantOperandVal(5);
5312 unsigned IsData = Op.getConstantOperandVal(6);
5313 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
5314 (!IsData << 3) | // IsDataCache bit
5315 (Locality << 1) | // Cache level bits
5316 (unsigned)IsStream; // Stream bit
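// For example, a non-streaming data-cache write prefetch targeting L1
// (IsWrite = 1, IsData = 1, Locality = 0, IsStream = 0) encodes as
// (1 << 4) | (0 << 3) | (0 << 1) | 0 == 0b10000, i.e. the PSTL1KEEP prfop.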
5317
5318 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain,
5319 DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr);
5320 }
5321 case Intrinsic::aarch64_sme_str:
5322 case Intrinsic::aarch64_sme_ldr: {
5323 return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr);
5324 }
5325 case Intrinsic::aarch64_sme_za_enable:
5326 return DAG.getNode(
5327 AArch64ISD::SMSTART, DL, MVT::Other,
5328 Op->getOperand(0), // Chain
5329 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
5330 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
5331 case Intrinsic::aarch64_sme_za_disable:
5332 return DAG.getNode(
5333 AArch64ISD::SMSTOP, DL, MVT::Other,
5334 Op->getOperand(0), // Chain
5335 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
5336 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
5337 }
5338}
5339
5340SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
5341 SelectionDAG &DAG) const {
5342 unsigned IntNo = Op.getConstantOperandVal(1);
5343 SDLoc DL(Op);
5344 switch (IntNo) {
5345 default:
5346 return SDValue(); // Don't custom lower most intrinsics.
5347 case Intrinsic::aarch64_mops_memset_tag: {
5348 auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
5349 SDValue Chain = Node->getChain();
5350 SDValue Dst = Op.getOperand(2);
5351 SDValue Val = Op.getOperand(3);
5352 Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
5353 SDValue Size = Op.getOperand(4);
5354 auto Alignment = Node->getMemOperand()->getAlign();
5355 bool IsVol = Node->isVolatile();
5356 auto DstPtrInfo = Node->getPointerInfo();
5357
5358 const auto &SDI =
5359 static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
5360 SDValue MS =
5361 SDI.EmitMOPS(AArch64ISD::MOPS_MEMSET_TAGGING, DAG, DL, Chain, Dst, Val,
5362 Size, Alignment, IsVol, DstPtrInfo, MachinePointerInfo{});
5363
5364 // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
5365 // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
5366 // LowerOperationWrapper will complain that the number of results has
5367 // changed.
5368 return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
5369 }
5370 }
5371}
5372
5373SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
5374 SelectionDAG &DAG) const {
5375 unsigned IntNo = Op.getConstantOperandVal(0);
5376 SDLoc dl(Op);
5377 switch (IntNo) {
5378 default: return SDValue(); // Don't custom lower most intrinsics.
5379 case Intrinsic::thread_pointer: {
5380 EVT PtrVT = getPointerTy(DAG.getDataLayout());
5381 return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
5382 }
5383 case Intrinsic::aarch64_neon_abs: {
5384 EVT Ty = Op.getValueType();
5385 if (Ty == MVT::i64) {
5386 SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
5387 Op.getOperand(1));
5388 Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
5389 return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
5390 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
5391 return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
5392 } else {
5393 report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
5394 }
5395 }
5396 case Intrinsic::aarch64_neon_pmull64: {
5397 SDValue LHS = Op.getOperand(1);
5398 SDValue RHS = Op.getOperand(2);
5399
5400 std::optional<uint64_t> LHSLane =
5401 getConstantLaneNumOfExtractHalfOperand(LHS);
5402 std::optional<uint64_t> RHSLane =
5403 getConstantLaneNumOfExtractHalfOperand(RHS);
5404
5405 assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
5406 assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");
5407
5408 // 'aarch64_neon_pmull64' takes i64 parameters; while pmull/pmull2
5409 // instructions execute on SIMD registers. So canonicalize i64 to v1i64,
5410 // which ISel recognizes better. For example, generate a ldr into d*
5411 // registers as opposed to a GPR load followed by a fmov.
5412 auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane,
5413 std::optional<uint64_t> OtherLane,
5414 const SDLoc &dl,
5415 SelectionDAG &DAG) -> SDValue {
5416 // If the operand is a higher half itself, rewrite it to
5417 // extract_high_v2i64; this way aarch64_neon_pmull64 could
5418 // re-use the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
5419 if (NLane && *NLane == 1)
5420 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
5421 N.getOperand(0), DAG.getConstant(1, dl, MVT::i64));
5422
5423 // Operand N is not a higher half but the other operand is.
5424 if (OtherLane && *OtherLane == 1) {
5425 // If this operand is a lower half, rewrite it to
5426 // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
5427 // align lanes of two operands. A roundtrip sequence (to move from lane
5428 // 1 to lane 0) is like this:
5429 // mov x8, v0.d[1]
5430 // fmov d0, x8
5431 if (NLane && *NLane == 0)
5432 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
5433 DAG.getNode(AArch64ISD::DUPLANE64, dl, MVT::v2i64,
5434 N.getOperand(0),
5435 DAG.getConstant(0, dl, MVT::i64)),
5436 DAG.getConstant(1, dl, MVT::i64));
5437
5438 // Otherwise just dup from main to all lanes.
5439 return DAG.getNode(AArch64ISD::DUP, dl, MVT::v1i64, N);
5440 }
5441
5442 // Neither operand is an extract of higher half, so codegen may just use
5443 // the non-high version of PMULL instruction. Use v1i64 to represent i64.
5444 assert(N.getValueType() == MVT::i64 &&
5445 "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
5446 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, N);
5447 };
5448
5449 LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, dl, DAG);
5450 RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, dl, DAG);
5451
5452 return DAG.getNode(AArch64ISD::PMULL, dl, Op.getValueType(), LHS, RHS);
5453 }
5454 case Intrinsic::aarch64_neon_smax:
5455 return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
5456 Op.getOperand(1), Op.getOperand(2));
5457 case Intrinsic::aarch64_neon_umax:
5458 return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
5459 Op.getOperand(1), Op.getOperand(2));
5460 case Intrinsic::aarch64_neon_smin:
5461 return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
5462 Op.getOperand(1), Op.getOperand(2));
5463 case Intrinsic::aarch64_neon_umin:
5464 return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
5465 Op.getOperand(1), Op.getOperand(2));
5466 case Intrinsic::aarch64_neon_scalar_sqxtn:
5467 case Intrinsic::aarch64_neon_scalar_sqxtun:
5468 case Intrinsic::aarch64_neon_scalar_uqxtn: {
5469 assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
5470 if (Op.getValueType() == MVT::i32)
5471 return DAG.getNode(ISD::BITCAST, dl, MVT::i32,
5472 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::f32,
5473 Op.getOperand(0),
5474 DAG.getNode(ISD::BITCAST, dl, MVT::f64,
5475 Op.getOperand(1))));
5476 return SDValue();
5477 }
5478 case Intrinsic::aarch64_sve_whilelo:
5479 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false,
5480 /*IsEqual=*/false);
5481 case Intrinsic::aarch64_sve_whilelt:
5482 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/true,
5483 /*IsEqual=*/false);
5484 case Intrinsic::aarch64_sve_whilels:
5485 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false,
5486 /*IsEqual=*/true);
5487 case Intrinsic::aarch64_sve_whilele:
5488 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/true,
5489 /*IsEqual=*/true);
5490 case Intrinsic::aarch64_sve_sunpkhi:
5491 return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(),
5492 Op.getOperand(1));
5493 case Intrinsic::aarch64_sve_sunpklo:
5494 return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(),
5495 Op.getOperand(1));
5496 case Intrinsic::aarch64_sve_uunpkhi:
5497 return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(),
5498 Op.getOperand(1));
5499 case Intrinsic::aarch64_sve_uunpklo:
5500 return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(),
5501 Op.getOperand(1));
5502 case Intrinsic::aarch64_sve_clasta_n:
5503 return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(),
5504 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5505 case Intrinsic::aarch64_sve_clastb_n:
5506 return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(),
5507 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5508 case Intrinsic::aarch64_sve_lasta:
5509 return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(),
5510 Op.getOperand(1), Op.getOperand(2));
5511 case Intrinsic::aarch64_sve_lastb:
5512 return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(),
5513 Op.getOperand(1), Op.getOperand(2));
5514 case Intrinsic::aarch64_sve_rev:
5515 return DAG.getNode(ISD::VECTOR_REVERSE, dl, Op.getValueType(),
5516 Op.getOperand(1));
5517 case Intrinsic::aarch64_sve_tbl:
5518 return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(),
5519 Op.getOperand(1), Op.getOperand(2));
5520 case Intrinsic::aarch64_sve_trn1:
5521 return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(),
5522 Op.getOperand(1), Op.getOperand(2));
5523 case Intrinsic::aarch64_sve_trn2:
5524 return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(),
5525 Op.getOperand(1), Op.getOperand(2));
5526 case Intrinsic::aarch64_sve_uzp1:
5527 return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(),
5528 Op.getOperand(1), Op.getOperand(2));
5529 case Intrinsic::aarch64_sve_uzp2:
5530 return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(),
5531 Op.getOperand(1), Op.getOperand(2));
5532 case Intrinsic::aarch64_sve_zip1:
5533 return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(),
5534 Op.getOperand(1), Op.getOperand(2));
5535 case Intrinsic::aarch64_sve_zip2:
5536 return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(),
5537 Op.getOperand(1), Op.getOperand(2));
5538 case Intrinsic::aarch64_sve_splice:
5539 return DAG.getNode(AArch64ISD::SPLICE, dl, Op.getValueType(),
5540 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5541 case Intrinsic::aarch64_sve_ptrue:
5542 return getPTrue(DAG, dl, Op.getValueType(), Op.getConstantOperandVal(1));
5543 case Intrinsic::aarch64_sve_clz:
5544 return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(),
5545 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5546 case Intrinsic::aarch64_sme_cntsb:
5547 return DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5548 DAG.getConstant(1, dl, MVT::i32));
5549 case Intrinsic::aarch64_sme_cntsh: {
5550 SDValue One = DAG.getConstant(1, dl, MVT::i32);
5551 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), One);
5552 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, One);
5553 }
5554 case Intrinsic::aarch64_sme_cntsw: {
5555 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5556 DAG.getConstant(1, dl, MVT::i32));
5557 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
5558 DAG.getConstant(2, dl, MVT::i32));
5559 }
5560 case Intrinsic::aarch64_sme_cntsd: {
5561 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5562 DAG.getConstant(1, dl, MVT::i32));
5563 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
5564 DAG.getConstant(3, dl, MVT::i32));
5565 }
5566 case Intrinsic::aarch64_sve_cnt: {
5567 SDValue Data = Op.getOperand(3);
5568 // CTPOP only supports integer operands.
5569 if (Data.getValueType().isFloatingPoint())
5570 Data = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Data);
5571 return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, dl, Op.getValueType(),
5572 Op.getOperand(2), Data, Op.getOperand(1));
5573 }
5574 case Intrinsic::aarch64_sve_dupq_lane:
5575 return LowerDUPQLane(Op, DAG);
5576 case Intrinsic::aarch64_sve_convert_from_svbool:
5577 if (Op.getValueType() == MVT::aarch64svcount)
5578 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Op.getOperand(1));
5579 return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG);
5580 case Intrinsic::aarch64_sve_convert_to_svbool:
5581 if (Op.getOperand(1).getValueType() == MVT::aarch64svcount)
5582 return DAG.getNode(ISD::BITCAST, dl, MVT::nxv16i1, Op.getOperand(1));
5583 return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
5584 case Intrinsic::aarch64_sve_fneg:
5585 return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(),
5586 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5587 case Intrinsic::aarch64_sve_frintp:
5588 return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, dl, Op.getValueType(),
5589 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5590 case Intrinsic::aarch64_sve_frintm:
5591 return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, dl, Op.getValueType(),
5592 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5593 case Intrinsic::aarch64_sve_frinti:
5594 return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl, Op.getValueType(),
5595 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5596 case Intrinsic::aarch64_sve_frintx:
5597 return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, dl, Op.getValueType(),
5598 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5599 case Intrinsic::aarch64_sve_frinta:
5600 return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, dl, Op.getValueType(),
5601 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5602 case Intrinsic::aarch64_sve_frintn:
5603 return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, dl, Op.getValueType(),
5604 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5605 case Intrinsic::aarch64_sve_frintz:
5606 return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, dl, Op.getValueType(),
5607 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5608 case Intrinsic::aarch64_sve_ucvtf:
5609 return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, dl,
5610 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5611 Op.getOperand(1));
5612 case Intrinsic::aarch64_sve_scvtf:
5613 return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, dl,
5614 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5615 Op.getOperand(1));
5616 case Intrinsic::aarch64_sve_fcvtzu:
5617 return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, dl,
5618 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5619 Op.getOperand(1));
5620 case Intrinsic::aarch64_sve_fcvtzs:
5621 return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, dl,
5622 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5623 Op.getOperand(1));
5624 case Intrinsic::aarch64_sve_fsqrt:
5625 return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, dl, Op.getValueType(),
5626 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5627 case Intrinsic::aarch64_sve_frecpx:
5628 return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, dl, Op.getValueType(),
5629 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5630 case Intrinsic::aarch64_sve_frecpe_x:
5631 return DAG.getNode(AArch64ISD::FRECPE, dl, Op.getValueType(),
5632 Op.getOperand(1));
5633 case Intrinsic::aarch64_sve_frecps_x:
5634 return DAG.getNode(AArch64ISD::FRECPS, dl, Op.getValueType(),
5635 Op.getOperand(1), Op.getOperand(2));
5636 case Intrinsic::aarch64_sve_frsqrte_x:
5637 return DAG.getNode(AArch64ISD::FRSQRTE, dl, Op.getValueType(),
5638 Op.getOperand(1));
5639 case Intrinsic::aarch64_sve_frsqrts_x:
5640 return DAG.getNode(AArch64ISD::FRSQRTS, dl, Op.getValueType(),
5641 Op.getOperand(1), Op.getOperand(2));
5642 case Intrinsic::aarch64_sve_fabs:
5643 return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, dl, Op.getValueType(),
5644 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5645 case Intrinsic::aarch64_sve_abs:
5646 return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, dl, Op.getValueType(),
5647 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5648 case Intrinsic::aarch64_sve_neg:
5649 return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, dl, Op.getValueType(),
5650 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5651 case Intrinsic::aarch64_sve_insr: {
5652 SDValue Scalar = Op.getOperand(2);
5653 EVT ScalarTy = Scalar.getValueType();
5654 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
5655 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
5656
5657 return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(),
5658 Op.getOperand(1), Scalar);
5659 }
5660 case Intrinsic::aarch64_sve_rbit:
5661 return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, dl,
5662 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5663 Op.getOperand(1));
5664 case Intrinsic::aarch64_sve_revb:
5665 return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(),
5666 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5667 case Intrinsic::aarch64_sve_revh:
5668 return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, dl, Op.getValueType(),
5669 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5670 case Intrinsic::aarch64_sve_revw:
5671 return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, dl, Op.getValueType(),
5672 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5673 case Intrinsic::aarch64_sve_revd:
5674 return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, dl, Op.getValueType(),
5675 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5676 case Intrinsic::aarch64_sve_sxtb:
5677 return DAG.getNode(
5678 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5679 Op.getOperand(2), Op.getOperand(3),
5680 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
5681 Op.getOperand(1));
5682 case Intrinsic::aarch64_sve_sxth:
5683 return DAG.getNode(
5684 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5685 Op.getOperand(2), Op.getOperand(3),
5686 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
5687 Op.getOperand(1));
5688 case Intrinsic::aarch64_sve_sxtw:
5689 return DAG.getNode(
5690 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5691 Op.getOperand(2), Op.getOperand(3),
5692 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
5693 Op.getOperand(1));
5694 case Intrinsic::aarch64_sve_uxtb:
5695 return DAG.getNode(
5696 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5697 Op.getOperand(2), Op.getOperand(3),
5698 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
5699 Op.getOperand(1));
5700 case Intrinsic::aarch64_sve_uxth:
5701 return DAG.getNode(
5702 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5703 Op.getOperand(2), Op.getOperand(3),
5704 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
5705 Op.getOperand(1));
5706 case Intrinsic::aarch64_sve_uxtw:
5707 return DAG.getNode(
5708 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
5709 Op.getOperand(2), Op.getOperand(3),
5710 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
5711 Op.getOperand(1));
5712 case Intrinsic::localaddress: {
5713 const auto &MF = DAG.getMachineFunction();
5714 const auto *RegInfo = Subtarget->getRegisterInfo();
5715 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
5716 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
5717 Op.getSimpleValueType());
5718 }
5719
5720 case Intrinsic::eh_recoverfp: {
5721 // FIXME: This needs to be implemented to correctly handle highly aligned
5722 // stack objects. For now we simply return the incoming FP. Refer D53541
5723 // for more details.
5724 SDValue FnOp = Op.getOperand(1);
5725 SDValue IncomingFPOp = Op.getOperand(2);
5726 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
5727 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
5728 if (!Fn)
5729 report_fatal_error(
5730 "llvm.eh.recoverfp must take a function as the first argument");
5731 return IncomingFPOp;
5732 }
5733
5734 case Intrinsic::aarch64_neon_vsri:
5735 case Intrinsic::aarch64_neon_vsli:
5736 case Intrinsic::aarch64_sve_sri:
5737 case Intrinsic::aarch64_sve_sli: {
5738 EVT Ty = Op.getValueType();
5739
5740 if (!Ty.isVector())
5741 report_fatal_error("Unexpected type for aarch64_neon_vsli");
5742
5743 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
5744
5745 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri ||
5746 IntNo == Intrinsic::aarch64_sve_sri;
5747 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
5748 return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
5749 Op.getOperand(3));
5750 }
5751
5752 case Intrinsic::aarch64_neon_srhadd:
5753 case Intrinsic::aarch64_neon_urhadd:
5754 case Intrinsic::aarch64_neon_shadd:
5755 case Intrinsic::aarch64_neon_uhadd: {
5756 bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
5757 IntNo == Intrinsic::aarch64_neon_shadd);
5758 bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
5759 IntNo == Intrinsic::aarch64_neon_urhadd);
5760 unsigned Opcode = IsSignedAdd
5761 ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
5762 : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
5763 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
5764 Op.getOperand(2));
5765 }
5766 case Intrinsic::aarch64_neon_saddlp:
5767 case Intrinsic::aarch64_neon_uaddlp: {
5768 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
5769 ? AArch64ISD::UADDLP
5770 : AArch64ISD::SADDLP;
5771 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1));
5772 }
5773 case Intrinsic::aarch64_neon_sdot:
5774 case Intrinsic::aarch64_neon_udot:
5775 case Intrinsic::aarch64_sve_sdot:
5776 case Intrinsic::aarch64_sve_udot: {
5777 unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
5778 IntNo == Intrinsic::aarch64_sve_udot)
5779 ? AArch64ISD::UDOT
5780 : AArch64ISD::SDOT;
5781 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
5782 Op.getOperand(2), Op.getOperand(3));
5783 }
5784 case Intrinsic::get_active_lane_mask: {
5785 SDValue ID =
5786 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64);
5787
5788 EVT VT = Op.getValueType();
5789 if (VT.isScalableVector())
5790 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID, Op.getOperand(1),
5791 Op.getOperand(2));
5792
5793 // We can use the SVE whilelo instruction to lower this intrinsic by
5794 // creating the appropriate sequence of scalable vector operations and
5795 // then extracting a fixed-width subvector from the scalable vector.
5796
5797 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
5798 EVT WhileVT = ContainerVT.changeElementType(MVT::i1);
5799
5800 SDValue Mask = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, WhileVT, ID,
5801 Op.getOperand(1), Op.getOperand(2));
5802 SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, dl, ContainerVT, Mask);
5803 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, MaskAsInt,
5804 DAG.getVectorIdxConstant(0, dl));
5805 }
5806 case Intrinsic::aarch64_neon_uaddlv: {
5807 EVT OpVT = Op.getOperand(1).getValueType();
5808 EVT ResVT = Op.getValueType();
5809 if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
5810 OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) {
5811 // Use v4i32 rather than v2i32 in order to avoid an insert_subvector.
5812 SDValue UADDLV =
5813 DAG.getNode(AArch64ISD::UADDLV, dl, MVT::v4i32, Op.getOperand(1));
5814 SDValue EXTRACT_VEC_ELT =
5815 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, UADDLV,
5816 DAG.getConstant(0, dl, MVT::i64));
5817 return EXTRACT_VEC_ELT;
5818 }
5819 return SDValue();
5820 }
5821 case Intrinsic::experimental_cttz_elts: {
5822 SDValue NewCttzElts =
5823 DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, Op.getOperand(1));
5824
5825 return DAG.getZExtOrTrunc(NewCttzElts, dl, Op.getValueType());
5826 }
5827 }
5828}
5829
5830bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
5831 if (VT.getVectorElementType() == MVT::i8 ||
5832 VT.getVectorElementType() == MVT::i16) {
5833 EltTy = MVT::i32;
5834 return true;
5835 }
5836 return false;
5837}
5838
5839bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
5840 EVT DataVT) const {
5841 const EVT IndexVT = Extend.getOperand(0).getValueType();
5842 // SVE only supports implicit extension of 32-bit indices.
5843 if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
5844 return false;
5845
5846 // Indices cannot be smaller than the main data type.
5847 if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
5848 return false;
5849
5850 // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
5851 // element container type, which would violate the previous clause.
5852 return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
5853}
5854
5855bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
5856 EVT ExtVT = ExtVal.getValueType();
5857 if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
5858 return false;
5859
5860 // It may be worth creating extending masked loads if there are multiple
5861 // masked loads using the same predicate. That way we'll end up creating
5862 // extending masked loads that may then get split by the legaliser. This
5863 // results in just one set of predicate unpacks at the start, instead of
5864 // multiple sets of vector unpacks after each load.
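// For instance, two masked loads extending from nxv8i16 to nxv8i32 that share
// a single nxv8i1 predicate can both become extending masked loads; after
// legalisation the predicate is unpacked once up front rather than each
// loaded vector being unpacked separately after its load.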
5865 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal->getOperand(0))) {
5866 if (!isLoadExtLegalOrCustom(ISD::ZEXTLOAD, ExtVT, Ld->getValueType(0))) {
5867 // Disable extending masked loads for fixed-width for now, since the code
5868 // quality doesn't look great.
5869 if (!ExtVT.isScalableVector())
5870 return false;
5871
5872 unsigned NumExtMaskedLoads = 0;
5873 for (auto *U : Ld->getMask()->uses())
5874 if (isa<MaskedLoadSDNode>(U))
5875 NumExtMaskedLoads++;
5876
5877 if (NumExtMaskedLoads <= 1)
5878 return false;
5879 }
5880 }
5881
5882 return true;
5883}
5884
5885unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
5886 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
5887 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
5888 AArch64ISD::GLD1_MERGE_ZERO},
5889 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
5890 AArch64ISD::GLD1_UXTW_MERGE_ZERO},
5891 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
5892 AArch64ISD::GLD1_MERGE_ZERO},
5893 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
5894 AArch64ISD::GLD1_SXTW_MERGE_ZERO},
5895 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
5896 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
5897 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
5898 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
5899 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
5900 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
5901 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
5902 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
5903 };
5904 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
5905 return AddrModes.find(Key)->second;
5906}
5907
5908unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
5909 switch (Opcode) {
5910 default:
5911 llvm_unreachable("unimplemented opcode");
5912 return Opcode;
5913 case AArch64ISD::GLD1_MERGE_ZERO:
5914 return AArch64ISD::GLD1S_MERGE_ZERO;
5915 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
5916 return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
5917 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
5918 return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
5919 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
5920 return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
5921 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
5922 return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
5923 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
5924 return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
5925 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
5926 return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
5927 }
5928}
5929
5930SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
5931 SelectionDAG &DAG) const {
5932 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
5933
5934 SDLoc DL(Op);
5935 SDValue Chain = MGT->getChain();
5936 SDValue PassThru = MGT->getPassThru();
5937 SDValue Mask = MGT->getMask();
5938 SDValue BasePtr = MGT->getBasePtr();
5939 SDValue Index = MGT->getIndex();
5940 SDValue Scale = MGT->getScale();
5941 EVT VT = Op.getValueType();
5942 EVT MemVT = MGT->getMemoryVT();
5943 ISD::LoadExtType ExtType = MGT->getExtensionType();
5944 ISD::MemIndexType IndexType = MGT->getIndexType();
5945
5946 // SVE supports zero (and so undef) passthrough values only; everything else
5947 // must be handled manually by an explicit select on the load's output.
5948 if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
5949 SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
5950 SDValue Load =
5951 DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
5952 MGT->getMemOperand(), IndexType, ExtType);
5953 SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
5954 return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
5955 }
5956
5957 bool IsScaled = MGT->isIndexScaled();
5958 bool IsSigned = MGT->isIndexSigned();
5959
 5960 // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
 5961 // must be calculated beforehand.
5962 uint64_t ScaleVal = Scale->getAsZExtVal();
5963 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
5964 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
5965 EVT IndexVT = Index.getValueType();
5966 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
5967 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
5968 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
5969
5970 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
5971 return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
5972 MGT->getMemOperand(), IndexType, ExtType);
5973 }
5974
5975 // Lower fixed length gather to a scalable equivalent.
5976 if (VT.isFixedLengthVector()) {
5977 assert(Subtarget->useSVEForFixedLengthVectors() &&
5978 "Cannot lower when not using SVE for fixed vectors!");
5979
5980 // NOTE: Handle floating-point as if integer then bitcast the result.
5982 MemVT = MemVT.changeVectorElementTypeToInteger();
5983
5984 // Find the smallest integer fixed length vector we can use for the gather.
5985 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
5986 if (DataVT.getVectorElementType() == MVT::i64 ||
5987 Index.getValueType().getVectorElementType() == MVT::i64 ||
5988 Mask.getValueType().getVectorElementType() == MVT::i64)
5989 PromotedVT = VT.changeVectorElementType(MVT::i64);
5990
5991 // Promote vector operands except for passthrough, which we know is either
5992 // undef or zero, and thus best constructed directly.
5993 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5994 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
5995 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
5996
5997 // A promoted result type forces the need for an extending load.
5998 if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
5999 ExtType = ISD::EXTLOAD;
6000
6001 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
6002
6003 // Convert fixed length vector operands to scalable.
6004 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
6005 Index = convertToScalableVector(DAG, ContainerVT, Index);
6007 PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT)
6008 : DAG.getConstant(0, DL, ContainerVT);
6009
6010 // Emit equivalent scalable vector gather.
6011 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6012 SDValue Load =
6013 DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
6014 Ops, MGT->getMemOperand(), IndexType, ExtType);
6015
6016 // Extract fixed length data then convert to the required result type.
6017 SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load);
6018 Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result);
6019 if (VT.isFloatingPoint())
6020 Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
6021
6022 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
6023 }
6024
6025 // Everything else is legal.
6026 return Op;
6027}
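// Illustrative sketch (assumed example): for a fixed-length gather producing
// <4 x i16>, the path above promotes the index and mask to at least i32
// elements, wraps the operands in an SVE container type such as nxv4i32,
// emits a scalable masked gather (extending from the i16 memory type), and
// then extracts and truncates the result back to <4 x i16>, bitcasting at the
// end if the original type was floating point.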
6028
6029SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
6030 SelectionDAG &DAG) const {
6031 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
6032
6033 SDLoc DL(Op);
6034 SDValue Chain = MSC->getChain();
6035 SDValue StoreVal = MSC->getValue();
6036 SDValue Mask = MSC->getMask();
6037 SDValue BasePtr = MSC->getBasePtr();
6038 SDValue Index = MSC->getIndex();
6039 SDValue Scale = MSC->getScale();
6040 EVT VT = StoreVal.getValueType();
6041 EVT MemVT = MSC->getMemoryVT();
6042 ISD::MemIndexType IndexType = MSC->getIndexType();
6043 bool Truncating = MSC->isTruncatingStore();
6044
6045 bool IsScaled = MSC->isIndexScaled();
6046 bool IsSigned = MSC->isIndexSigned();
6047
 6048 // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
 6049 // must be calculated beforehand.
6050 uint64_t ScaleVal = Scale->getAsZExtVal();
6051 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6052 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6053 EVT IndexVT = Index.getValueType();
6054 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
6055 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
6056 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
6057
6058 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6059 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6060 MSC->getMemOperand(), IndexType, Truncating);
6061 }
6062
6063 // Lower fixed length scatter to a scalable equivalent.
6064 if (VT.isFixedLengthVector()) {
6065 assert(Subtarget->useSVEForFixedLengthVectors() &&
6066 "Cannot lower when not using SVE for fixed vectors!");
6067
6068 // Once bitcast we treat floating-point scatters as if integer.
6069 if (VT.isFloatingPoint()) {
6071 MemVT = MemVT.changeVectorElementTypeToInteger();
6072 StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal);
6073 }
6074
6075 // Find the smallest integer fixed length vector we can use for the scatter.
6076 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
6077 if (VT.getVectorElementType() == MVT::i64 ||
6078 Index.getValueType().getVectorElementType() == MVT::i64 ||
6079 Mask.getValueType().getVectorElementType() == MVT::i64)
6080 PromotedVT = VT.changeVectorElementType(MVT::i64);
6081
6082 // Promote vector operands.
6083 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6084 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
6085 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
6086 StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal);
6087
6088 // A promoted value type forces the need for a truncating store.
6089 if (PromotedVT != VT)
6090 Truncating = true;
6091
6092 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
6093
6094 // Convert fixed length vector operands to scalable.
6095 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
6096 Index = convertToScalableVector(DAG, ContainerVT, Index);
6098 StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal);
6099
6100 // Emit equivalent scalable vector scatter.
6101 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6102 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6103 MSC->getMemOperand(), IndexType, Truncating);
6104 }
6105
6106 // Everything else is legal.
6107 return Op;
6108}
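// Illustrative sketch (assumed numbers): if a scatter of i32 elements arrives
// with IsScaled set and a scale of 8 (not sizeof(i32)), the code above folds
// the scaling into the index instead:
//
//   Index = Index << log2(8);   // i.e. Index << 3
//   Scale = 1;
//
// and then re-emits the scatter with an unscaled index.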
6109
6110SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
6111 SDLoc DL(Op);
6112 MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
6113 assert(LoadNode && "Expected custom lowering of a masked load node");
6114 EVT VT = Op->getValueType(0);
6115
6116 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
6117 return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
6118
6119 SDValue PassThru = LoadNode->getPassThru();
6120 SDValue Mask = LoadNode->getMask();
6121
6122 if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
6123 return Op;
6124
6126 VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
6127 LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
6128 LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
6129 LoadNode->getExtensionType());
6130
6131 SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
6132
6133 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
6134}
6135
6136// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
6138 EVT VT, EVT MemVT,
6139 SelectionDAG &DAG) {
6140 assert(VT.isVector() && "VT should be a vector type");
6141 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
6142
6143 SDValue Value = ST->getValue();
6144
 6145 // It first extends the promoted v4i16 to v8i16, truncates to v8i8, and extracts
 6146 // the word lane which represents the v4i8 subvector. It optimizes the store
6147 // to:
6148 //
6149 // xtn v0.8b, v0.8h
6150 // str s0, [x0]
6151
6152 SDValue Undef = DAG.getUNDEF(MVT::i16);
6153 SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
6154 {Undef, Undef, Undef, Undef});
6155
6156 SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
6157 Value, UndefVec);
6158 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
6159
6160 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
6161 SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
6162 Trunc, DAG.getConstant(0, DL, MVT::i64));
6163
6164 return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
6165 ST->getBasePtr(), ST->getMemOperand());
6166}
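// Illustrative sketch (assumed IR): the helper above handles stores such as
//
//   %t = trunc <4 x i16> %v to <4 x i8>
//   store <4 x i8> %t, ptr %p
//
// by widening to v8i16, narrowing with xtn to v8i8 and storing the low 32
// bits with a single "str s0, [x0]".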
6167
 6168 // Custom lowering for any store, vector or scalar, either plain or with a
 6169 // truncate operation. Currently we only custom lower the truncating store
 6170 // from vector v4i16 to v4i8 and volatile stores of i128.
6171SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
6172 SelectionDAG &DAG) const {
6173 SDLoc Dl(Op);
6174 StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
6175 assert (StoreNode && "Can only custom lower store nodes");
6176
6177 SDValue Value = StoreNode->getValue();
6178
6179 EVT VT = Value.getValueType();
6180 EVT MemVT = StoreNode->getMemoryVT();
6181
6182 if (VT.isVector()) {
6184 VT,
6185 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
6186 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
6187
6188 unsigned AS = StoreNode->getAddressSpace();
6189 Align Alignment = StoreNode->getAlign();
6190 if (Alignment < MemVT.getStoreSize() &&
6191 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
6192 StoreNode->getMemOperand()->getFlags(),
6193 nullptr)) {
6194 return scalarizeVectorStore(StoreNode, DAG);
6195 }
6196
6197 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
6198 MemVT == MVT::v4i8) {
6199 return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
6200 }
 6201 // 256-bit non-temporal stores can be lowered to STNP. Do this as part of
 6202 // the custom lowering, as there are no un-paired non-temporal stores and
 6203 // legalization will break up 256-bit inputs.
6205 if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
6206 EC.isKnownEven() && DAG.getDataLayout().isLittleEndian() &&
6207 (MemVT.getScalarSizeInBits() == 8u ||
6208 MemVT.getScalarSizeInBits() == 16u ||
6209 MemVT.getScalarSizeInBits() == 32u ||
6210 MemVT.getScalarSizeInBits() == 64u)) {
6211 SDValue Lo =
6214 StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
6215 SDValue Hi =
6218 StoreNode->getValue(),
6219 DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
6221 AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
6222 {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
6223 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
6224 return Result;
6225 }
6226 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
6227 return LowerStore128(Op, DAG);
6228 } else if (MemVT == MVT::i64x8) {
6229 SDValue Value = StoreNode->getValue();
6230 assert(Value->getValueType(0) == MVT::i64x8);
6231 SDValue Chain = StoreNode->getChain();
6232 SDValue Base = StoreNode->getBasePtr();
6233 EVT PtrVT = Base.getValueType();
6234 for (unsigned i = 0; i < 8; i++) {
6235 SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
6236 Value, DAG.getConstant(i, Dl, MVT::i32));
6237 SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
6238 DAG.getConstant(i * 8, Dl, PtrVT));
6239 Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
6240 StoreNode->getOriginalAlign());
6241 }
6242 return Chain;
6243 }
6244
6245 return SDValue();
6246}
6247
 6248 /// Lower atomic or volatile 128-bit stores to a single STP or STILP instruction.
6249SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
6250 SelectionDAG &DAG) const {
6251 MemSDNode *StoreNode = cast<MemSDNode>(Op);
6252 assert(StoreNode->getMemoryVT() == MVT::i128);
6253 assert(StoreNode->isVolatile() || StoreNode->isAtomic());
6254
6255 bool IsStoreRelease =
6257 if (StoreNode->isAtomic())
6258 assert((Subtarget->hasFeature(AArch64::FeatureLSE2) &&
6259 Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) ||
6262
6263 SDValue Value = (StoreNode->getOpcode() == ISD::STORE ||
6264 StoreNode->getOpcode() == ISD::ATOMIC_STORE)
6265 ? StoreNode->getOperand(1)
6266 : StoreNode->getOperand(2);
6267 SDLoc DL(Op);
6268 auto StoreValue = DAG.SplitScalar(Value, DL, MVT::i64, MVT::i64);
6269 unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP;
6270 if (DAG.getDataLayout().isBigEndian())
6271 std::swap(StoreValue.first, StoreValue.second);
6273 Opcode, DL, DAG.getVTList(MVT::Other),
6274 {StoreNode->getChain(), StoreValue.first, StoreValue.second,
6275 StoreNode->getBasePtr()},
6276 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
6277 return Result;
6278}
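// Illustrative sketch (assumed register choices): a volatile i128 store is
// emitted as a single store pair,
//
//   stp x1, x2, [x0]
//
// while an atomic release store, when the rcpc3 feature is available, uses
// the store-release pair form instead:
//
//   stilp x1, x2, [x0]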
6279
6280SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
6281 SelectionDAG &DAG) const {
6282 SDLoc DL(Op);
6283 LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
6284 assert(LoadNode && "Expected custom lowering of a load node");
6285
6286 if (LoadNode->getMemoryVT() == MVT::i64x8) {
6288 SDValue Base = LoadNode->getBasePtr();
6289 SDValue Chain = LoadNode->getChain();
6290 EVT PtrVT = Base.getValueType();
6291 for (unsigned i = 0; i < 8; i++) {
6292 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
6293 DAG.getConstant(i * 8, DL, PtrVT));
6294 SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr,
6295 LoadNode->getPointerInfo(),
6296 LoadNode->getOriginalAlign());
6297 Ops.push_back(Part);
6298 Chain = SDValue(Part.getNode(), 1);
6299 }
6300 SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
6301 return DAG.getMergeValues({Loaded, Chain}, DL);
6302 }
6303
6304 // Custom lowering for extending v4i8 vector loads.
6305 EVT VT = Op->getValueType(0);
6306 assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
6307
6308 if (LoadNode->getMemoryVT() != MVT::v4i8)
6309 return SDValue();
6310
6311 unsigned ExtType;
6312 if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
6313 ExtType = ISD::SIGN_EXTEND;
6314 else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
6315 LoadNode->getExtensionType() == ISD::EXTLOAD)
6316 ExtType = ISD::ZERO_EXTEND;
6317 else
6318 return SDValue();
6319
6320 SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
6321 LoadNode->getBasePtr(), MachinePointerInfo());
6322 SDValue Chain = Load.getValue(1);
6323 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
6324 SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
6325 SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
6326 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
6327 DAG.getConstant(0, DL, MVT::i64));
6328 if (VT == MVT::v4i32)
6329 Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
6330 return DAG.getMergeValues({Ext, Chain}, DL);
6331}
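// Illustrative sketch (assumed example): a zero-extending <4 x i8> load,
//
//   %v = load <4 x i8>, ptr %p
//   %e = zext <4 x i8> %v to <4 x i16>
//
// is lowered by the code above to roughly
//
//   ldr   s0, [x0]
//   ushll v0.8h, v0.8b, #0
//
// with only the low four lanes of the result being meaningful.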
6332
6333// Generate SUBS and CSEL for integer abs.
6334SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
6335 MVT VT = Op.getSimpleValueType();
6336
6337 if (VT.isVector())
6338 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
6339
6340 SDLoc DL(Op);
6341 SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
6342 Op.getOperand(0));
6343 // Generate SUBS & CSEL.
6344 SDValue Cmp =
6345 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
6346 Op.getOperand(0), DAG.getConstant(0, DL, VT));
6347 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
6348 DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
6349 Cmp.getValue(1));
6350}
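// Illustrative sketch (assumed registers): for a scalar i32 abs the nodes
// built above correspond to roughly
//
//   neg  w8, w0              // 0 - x
//   cmp  w0, #0              // the SUBS, with its value result unused
//   csel w0, w0, w8, pl      // x if non-negative, otherwise -x
//
// which later folds may tighten further (e.g. into cmp + cneg).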
6351
6353 SDValue Chain = Op.getOperand(0);
6354 SDValue Cond = Op.getOperand(1);
6355 SDValue Dest = Op.getOperand(2);
6356
6358 if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) {
6359 SDLoc dl(Op);
6360 SDValue CCVal = DAG.getConstant(CC, dl, MVT::i32);
6361 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
6362 Cmp);
6363 }
6364
6365 return SDValue();
6366}
6367
 6368 // Treat FSHR with a constant shift amount as a legal operation; otherwise it is
 6369 // expanded. FSHL is converted to FSHR before deciding what to do with it.
6371 SDValue Shifts = Op.getOperand(2);
 6372 // Check if the shift amount is a constant.
 6373 // If the opcode is FSHL, convert it to FSHR.
6374 if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Shifts)) {
6375 SDLoc DL(Op);
6376 MVT VT = Op.getSimpleValueType();
6377
6378 if (Op.getOpcode() == ISD::FSHL) {
6379 unsigned int NewShiftNo =
6380 VT.getFixedSizeInBits() - ShiftNo->getZExtValue();
6381 return DAG.getNode(
6382 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
6383 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
6384 } else if (Op.getOpcode() == ISD::FSHR) {
6385 return Op;
6386 }
6387 }
6388
6389 return SDValue();
6390}
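// Illustrative sketch (assumed constants): fshl(x, y, 3) on i64 is rewritten
// above as fshr(x, y, 64 - 3) = fshr(x, y, 61); an FSHR with a constant shift
// amount then matches the EXTR instruction (or ROR when both inputs are the
// same value).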
6391
6393 SDValue X = Op.getOperand(0);
6394 EVT XScalarTy = X.getValueType();
6395 SDValue Exp = Op.getOperand(1);
6396
6397 SDLoc DL(Op);
6398 EVT XVT, ExpVT;
6399 switch (Op.getSimpleValueType().SimpleTy) {
6400 default:
6401 return SDValue();
6402 case MVT::bf16:
6403 case MVT::f16:
6404 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
6405 [[fallthrough]];
6406 case MVT::f32:
6407 XVT = MVT::nxv4f32;
6408 ExpVT = MVT::nxv4i32;
6409 break;
6410 case MVT::f64:
6411 XVT = MVT::nxv2f64;
6412 ExpVT = MVT::nxv2i64;
6413 Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Exp);
6414 break;
6415 }
6416
6417 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
6418 SDValue VX =
6419 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero);
6420 SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT,
6421 DAG.getUNDEF(ExpVT), Exp, Zero);
6422 SDValue VPg = getPTrue(DAG, DL, XVT.changeVectorElementType(MVT::i1),
6423 AArch64SVEPredPattern::all);
6424 SDValue FScale =
6426 DAG.getConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64),
6427 VPg, VX, VExp);
6428 SDValue Final =
6429 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), FScale, Zero);
6430 if (X.getValueType() != XScalarTy)
6431 Final = DAG.getNode(ISD::FP_ROUND, DL, XScalarTy, Final,
6432 DAG.getIntPtrConstant(1, SDLoc(Op)));
6433 return Final;
6434}
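// Illustrative sketch (assumed behaviour): for a scalar f64 ldexp the exponent
// is sign-extended to i64, both values are placed into lane 0 of nxv2f64 and
// nxv2i64 vectors, scaled with an all-true predicate through the
// aarch64_sve_fscale intrinsic, and lane 0 of the result is extracted again;
// f16/bf16 inputs take a round trip through f32.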
6435
6437 SelectionDAG &DAG) const {
6438 LLVM_DEBUG(dbgs() << "Custom lowering: ");
6439 LLVM_DEBUG(Op.dump());
6440
6441 switch (Op.getOpcode()) {
6442 default:
6443 llvm_unreachable("unimplemented operand");
6444 return SDValue();
6445 case ISD::BITCAST:
6446 return LowerBITCAST(Op, DAG);
6447 case ISD::GlobalAddress:
6448 return LowerGlobalAddress(Op, DAG);
6450 return LowerGlobalTLSAddress(Op, DAG);
6451 case ISD::SETCC:
6452 case ISD::STRICT_FSETCC:
6454 return LowerSETCC(Op, DAG);
6455 case ISD::SETCCCARRY:
6456 return LowerSETCCCARRY(Op, DAG);
6457 case ISD::BRCOND:
6458 return LowerBRCOND(Op, DAG);
6459 case ISD::BR_CC:
6460 return LowerBR_CC(Op, DAG);
6461 case ISD::SELECT:
6462 return LowerSELECT(Op, DAG);
6463 case ISD::SELECT_CC:
6464 return LowerSELECT_CC(Op, DAG);
6465 case ISD::JumpTable:
6466 return LowerJumpTable(Op, DAG);
6467 case ISD::BR_JT:
6468 return LowerBR_JT(Op, DAG);
6469 case ISD::ConstantPool:
6470 return LowerConstantPool(Op, DAG);
6471 case ISD::BlockAddress:
6472 return LowerBlockAddress(Op, DAG);
6473 case ISD::VASTART:
6474 return LowerVASTART(Op, DAG);
6475 case ISD::VACOPY:
6476 return LowerVACOPY(Op, DAG);
6477 case ISD::VAARG:
6478 return LowerVAARG(Op, DAG);
6479 case ISD::UADDO_CARRY:
6480 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/);
6481 case ISD::USUBO_CARRY:
6482 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/);
6483 case ISD::SADDO_CARRY:
6484 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/);
6485 case ISD::SSUBO_CARRY:
6486 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/);
6487 case ISD::SADDO:
6488 case ISD::UADDO:
6489 case ISD::SSUBO:
6490 case ISD::USUBO:
6491 case ISD::SMULO:
6492 case ISD::UMULO:
6493 return LowerXALUO(Op, DAG);
6494 case ISD::FADD:
6495 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
6496 case ISD::FSUB:
6497 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
6498 case ISD::FMUL:
6499 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
6500 case ISD::FMA:
6501 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
6502 case ISD::FDIV:
6503 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
6504 case ISD::FNEG:
6505 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
6506 case ISD::FCEIL:
6507 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
6508 case ISD::FFLOOR:
6509 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
6510 case ISD::FNEARBYINT:
6511 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
6512 case ISD::FRINT:
6513 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
6514 case ISD::FROUND:
6515 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
6516 case ISD::FROUNDEVEN:
6517 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
6518 case ISD::FTRUNC:
6519 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
6520 case ISD::FSQRT:
6521 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
6522 case ISD::FABS:
6523 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
6524 case ISD::FP_ROUND:
6526 return LowerFP_ROUND(Op, DAG);
6527 case ISD::FP_EXTEND:
6528 return LowerFP_EXTEND(Op, DAG);
6529 case ISD::FRAMEADDR:
6530 return LowerFRAMEADDR(Op, DAG);
6531 case ISD::SPONENTRY:
6532 return LowerSPONENTRY(Op, DAG);
6533 case ISD::RETURNADDR:
6534 return LowerRETURNADDR(Op, DAG);
6536 return LowerADDROFRETURNADDR(Op, DAG);
6538 return LowerCONCAT_VECTORS(Op, DAG);
6540 return LowerINSERT_VECTOR_ELT(Op, DAG);
6542 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
6543 case ISD::BUILD_VECTOR:
6544 return LowerBUILD_VECTOR(Op, DAG);
6546 return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
6548 return LowerVECTOR_SHUFFLE(Op, DAG);
6549 case ISD::SPLAT_VECTOR:
6550 return LowerSPLAT_VECTOR(Op, DAG);
6552 return LowerEXTRACT_SUBVECTOR(Op, DAG);
6554 return LowerINSERT_SUBVECTOR(Op, DAG);
6555 case ISD::SDIV:
6556 case ISD::UDIV:
6557 return LowerDIV(Op, DAG);
6558 case ISD::SMIN:
6559 case ISD::UMIN:
6560 case ISD::SMAX:
6561 case ISD::UMAX:
6562 return LowerMinMax(Op, DAG);
6563 case ISD::SRA:
6564 case ISD::SRL:
6565 case ISD::SHL:
6566 return LowerVectorSRA_SRL_SHL(Op, DAG);
6567 case ISD::SHL_PARTS:
6568 case ISD::SRL_PARTS:
6569 case ISD::SRA_PARTS:
6570 return LowerShiftParts(Op, DAG);
6571 case ISD::CTPOP:
6572 case ISD::PARITY:
6573 return LowerCTPOP_PARITY(Op, DAG);
6574 case ISD::FCOPYSIGN:
6575 return LowerFCOPYSIGN(Op, DAG);
6576 case ISD::OR:
6577 return LowerVectorOR(Op, DAG);
6578 case ISD::XOR:
6579 return LowerXOR(Op, DAG);
6580 case ISD::PREFETCH:
6581 return LowerPREFETCH(Op, DAG);
6582 case ISD::SINT_TO_FP:
6583 case ISD::UINT_TO_FP:
6586 return LowerINT_TO_FP(Op, DAG);
6587 case ISD::FP_TO_SINT:
6588 case ISD::FP_TO_UINT:
6591 return LowerFP_TO_INT(Op, DAG);
6594 return LowerFP_TO_INT_SAT(Op, DAG);
6595 case ISD::FSINCOS:
6596 return LowerFSINCOS(Op, DAG);
6597 case ISD::GET_ROUNDING:
6598 return LowerGET_ROUNDING(Op, DAG);
6599 case ISD::SET_ROUNDING:
6600 return LowerSET_ROUNDING(Op, DAG);
6601 case ISD::GET_FPMODE:
6602 return LowerGET_FPMODE(Op, DAG);
6603 case ISD::SET_FPMODE:
6604 return LowerSET_FPMODE(Op, DAG);
6605 case ISD::RESET_FPMODE:
6606 return LowerRESET_FPMODE(Op, DAG);
6607 case ISD::MUL:
6608 return LowerMUL(Op, DAG);
6609 case ISD::MULHS:
6610 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
6611 case ISD::MULHU:
6612 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
6614 return LowerINTRINSIC_W_CHAIN(Op, DAG);
6616 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6618 return LowerINTRINSIC_VOID(Op, DAG);
6619 case ISD::ATOMIC_STORE:
6620 if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
6621 assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3());
6622 return LowerStore128(Op, DAG);
6623 }
6624 return SDValue();
6625 case ISD::STORE:
6626 return LowerSTORE(Op, DAG);
6627 case ISD::MSTORE:
6628 return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
6629 case ISD::MGATHER:
6630 return LowerMGATHER(Op, DAG);
6631 case ISD::MSCATTER:
6632 return LowerMSCATTER(Op, DAG);
6634 return LowerVECREDUCE_SEQ_FADD(Op, DAG);
6635 case ISD::VECREDUCE_ADD:
6636 case ISD::VECREDUCE_AND:
6637 case ISD::VECREDUCE_OR:
6638 case ISD::VECREDUCE_XOR:
6648 return LowerVECREDUCE(Op, DAG);
6650 return LowerATOMIC_LOAD_AND(Op, DAG);
6652 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6653 case ISD::VSCALE:
6654 return LowerVSCALE(Op, DAG);
6655 case ISD::ANY_EXTEND:
6656 case ISD::SIGN_EXTEND:
6657 case ISD::ZERO_EXTEND:
6658 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
6660 // Only custom lower when ExtraVT has a legal byte based element type.
6661 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
6662 EVT ExtraEltVT = ExtraVT.getVectorElementType();
6663 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
6664 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
6665 return SDValue();
6666
6667 return LowerToPredicatedOp(Op, DAG,
6669 }
6670 case ISD::TRUNCATE:
6671 return LowerTRUNCATE(Op, DAG);
6672 case ISD::MLOAD:
6673 return LowerMLOAD(Op, DAG);
6674 case ISD::LOAD:
6675 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
6676 !Subtarget->isNeonAvailable()))
6677 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
6678 return LowerLOAD(Op, DAG);
6679 case ISD::ADD:
6680 case ISD::AND:
6681 case ISD::SUB:
6682 return LowerToScalableOp(Op, DAG);
6683 case ISD::FMAXIMUM:
6684 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
6685 case ISD::FMAXNUM:
6686 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
6687 case ISD::FMINIMUM:
6688 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
6689 case ISD::FMINNUM:
6690 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
6691 case ISD::VSELECT:
6692 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
6693 case ISD::ABS:
6694 return LowerABS(Op, DAG);
6695 case ISD::ABDS:
6696 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
6697 case ISD::ABDU:
6698 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
6699 case ISD::AVGFLOORS:
6700 return LowerAVG(Op, DAG, AArch64ISD::HADDS_PRED);
6701 case ISD::AVGFLOORU:
6702 return LowerAVG(Op, DAG, AArch64ISD::HADDU_PRED);
6703 case ISD::AVGCEILS:
6704 return LowerAVG(Op, DAG, AArch64ISD::RHADDS_PRED);
6705 case ISD::AVGCEILU:
6706 return LowerAVG(Op, DAG, AArch64ISD::RHADDU_PRED);
6707 case ISD::BITREVERSE:
6708 return LowerBitreverse(Op, DAG);
6709 case ISD::BSWAP:
6710 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
6711 case ISD::CTLZ:
6712 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
6713 case ISD::CTTZ:
6714 return LowerCTTZ(Op, DAG);
6715 case ISD::VECTOR_SPLICE:
6716 return LowerVECTOR_SPLICE(Op, DAG);
6718 return LowerVECTOR_DEINTERLEAVE(Op, DAG);
6720 return LowerVECTOR_INTERLEAVE(Op, DAG);
6721 case ISD::LRINT:
6722 case ISD::LLRINT:
6723 if (Op.getValueType().isVector())
6724 return LowerVectorXRINT(Op, DAG);
6725 [[fallthrough]];
6726 case ISD::LROUND:
6727 case ISD::LLROUND: {
6728 assert((Op.getOperand(0).getValueType() == MVT::f16 ||
6729 Op.getOperand(0).getValueType() == MVT::bf16) &&
6730 "Expected custom lowering of rounding operations only for f16");
6731 SDLoc DL(Op);
6732 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Op.getOperand(0));
6733 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), Ext);
6734 }
6735 case ISD::STRICT_LROUND:
6737 case ISD::STRICT_LRINT:
6738 case ISD::STRICT_LLRINT: {
6739 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
6740 Op.getOperand(1).getValueType() == MVT::bf16) &&
6741 "Expected custom lowering of rounding operations only for f16");
6742 SDLoc DL(Op);
6743 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
6744 {Op.getOperand(0), Op.getOperand(1)});
6745 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
6746 {Ext.getValue(1), Ext.getValue(0)});
6747 }
6748 case ISD::WRITE_REGISTER: {
6749 assert(Op.getOperand(2).getValueType() == MVT::i128 &&
6750 "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
6751 SDLoc DL(Op);
6752
6753 SDValue Chain = Op.getOperand(0);
6754 SDValue SysRegName = Op.getOperand(1);
6755 std::pair<SDValue, SDValue> Pair =
6756 DAG.SplitScalar(Op.getOperand(2), DL, MVT::i64, MVT::i64);
6757
6758 // chain = MSRR(chain, sysregname, lo, hi)
6759 SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain,
6760 SysRegName, Pair.first, Pair.second);
6761
6762 return Result;
6763 }
6764 case ISD::FSHL:
6765 case ISD::FSHR:
6766 return LowerFunnelShift(Op, DAG);
6767 case ISD::FLDEXP:
6768 return LowerFLDEXP(Op, DAG);
6769 }
6770}
6771
6773 return !Subtarget->useSVEForFixedLengthVectors();
6774}
6775
6777 EVT VT, bool OverrideNEON) const {
6778 if (!VT.isFixedLengthVector() || !VT.isSimple())
6779 return false;
6780
6781 // Don't use SVE for vectors we cannot scalarize if required.
6782 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
6783 // Fixed length predicates should be promoted to i8.
6784 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
6785 case MVT::i1:
6786 default:
6787 return false;
6788 case MVT::i8:
6789 case MVT::i16:
6790 case MVT::i32:
6791 case MVT::i64:
6792 case MVT::f16:
6793 case MVT::f32:
6794 case MVT::f64:
6795 break;
6796 }
6797
6798 // NEON-sized vectors can be emulated using SVE instructions.
6799 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
6800 return Subtarget->hasSVEorSME();
6801
6802 // Ensure NEON MVTs only belong to a single register class.
6803 if (VT.getFixedSizeInBits() <= 128)
6804 return false;
6805
6806 // Ensure wider than NEON code generation is enabled.
6807 if (!Subtarget->useSVEForFixedLengthVectors())
6808 return false;
6809
6810 // Don't use SVE for types that don't fit.
6811 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
6812 return false;
6813
6814 // TODO: Perhaps an artificial restriction, but worth having whilst getting
6815 // the base fixed length SVE support in place.
6816 if (!VT.isPow2VectorType())
6817 return false;
6818
6819 return true;
6820}
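// Illustrative sketch (assumed configuration): with
// -aarch64-sve-vector-bits-min=256, a 256-bit v8i32 passes all of the checks
// above and is lowered using SVE, whereas a 128-bit v4i32 keeps using NEON
// unless OverrideNEON is set.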
6821
6822//===----------------------------------------------------------------------===//
6823// Calling Convention Implementation
6824//===----------------------------------------------------------------------===//
6825
6826static unsigned getIntrinsicID(const SDNode *N) {
6827 unsigned Opcode = N->getOpcode();
6828 switch (Opcode) {
6829 default:
6832 unsigned IID = N->getConstantOperandVal(0);
6833 if (IID < Intrinsic::num_intrinsics)
6834 return IID;
6836 }
6837 }
6838}
6839
6841 SDValue N1) const {
6842 if (!N0.hasOneUse())
6843 return false;
6844
6845 unsigned IID = getIntrinsicID(N1.getNode());
6846 // Avoid reassociating expressions that can be lowered to smlal/umlal.
6847 if (IID == Intrinsic::aarch64_neon_umull ||
6848 N1.getOpcode() == AArch64ISD::UMULL ||
6849 IID == Intrinsic::aarch64_neon_smull ||
6851 return N0.getOpcode() != ISD::ADD;
6852
6853 return true;
6854}
6855
6856/// Selects the correct CCAssignFn for a given CallingConvention value.
6858 bool IsVarArg) const {
6859 switch (CC) {
6860 default:
6861 report_fatal_error("Unsupported calling convention.");
6862 case CallingConv::GHC:
6863 return CC_AArch64_GHC;
6864 case CallingConv::C:
6865 case CallingConv::Fast:
6869 case CallingConv::Swift:
6871 case CallingConv::Tail:
6872 case CallingConv::GRAAL:
6873 if (Subtarget->isTargetWindows()) {
6874 if (IsVarArg) {
6875 if (Subtarget->isWindowsArm64EC())
6878 }
6879 return CC_AArch64_Win64PCS;
6880 }
6881 if (!Subtarget->isTargetDarwin())
6882 return CC_AArch64_AAPCS;
6883 if (!IsVarArg)
6884 return CC_AArch64_DarwinPCS;
6887 case CallingConv::Win64:
6888 if (IsVarArg) {
6889 if (Subtarget->isWindowsArm64EC())
6892 }
6893 return CC_AArch64_Win64PCS;
6895 if (Subtarget->isWindowsArm64EC())
6902 return CC_AArch64_AAPCS;
6907 }
6908}
6909
6910CCAssignFn *
6912 switch (CC) {
6913 default:
6914 return RetCC_AArch64_AAPCS;
6918 if (Subtarget->isWindowsArm64EC())
6920 return RetCC_AArch64_AAPCS;
6921 }
6922}
6923
6924
6925unsigned
6926AArch64TargetLowering::allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL,
6927 SelectionDAG &DAG) const {
6929 MachineFrameInfo &MFI = MF.getFrameInfo();
6930
6931 // Allocate a lazy-save buffer object of size SVL.B * SVL.B (worst-case)
6932 SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
6933 DAG.getConstant(1, DL, MVT::i32));
6934 SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N);
6935 SDValue Ops[] = {Chain, NN, DAG.getConstant(1, DL, MVT::i64)};
6936 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other);
6937 SDValue Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL, VTs, Ops);
6938 Chain = Buffer.getValue(1);
6939 MFI.CreateVariableSizedObject(Align(1), nullptr);
6940
6941 // Allocate an additional TPIDR2 object on the stack (16 bytes)
6942 unsigned TPIDR2Obj = MFI.CreateStackObject(16, Align(16), false);
6943
6944 // Store the buffer pointer to the TPIDR2 stack object.
6947 TPIDR2Obj,
6949 Chain = DAG.getStore(Chain, DL, Buffer, Ptr, MPI);
6950
6951 // Set the reserved bytes (10-15) to zero
6952 EVT PtrTy = Ptr.getValueType();
6953 SDValue ReservedPtr =
6954 DAG.getNode(ISD::ADD, DL, PtrTy, Ptr, DAG.getConstant(10, DL, PtrTy));
6955 Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i16), ReservedPtr,
6956 MPI);
6957 ReservedPtr =
6958 DAG.getNode(ISD::ADD, DL, PtrTy, Ptr, DAG.getConstant(12, DL, PtrTy));
6959 Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i32), ReservedPtr,
6960 MPI);
6961
6962 return TPIDR2Obj;
6963}
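// Illustrative sketch (assumed layout): after the code above, the 16-byte
// TPIDR2 block on the stack looks like
//
//   bytes 0-7    pointer to the SVL.B x SVL.B lazy-save buffer
//   bytes 8-9    number of save slices (filled in elsewhere, not written here)
//   bytes 10-15  reserved, zeroed by the i16 and i32 stores above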
6964
6965static bool isPassedInFPR(EVT VT) {
6966 return VT.isFixedLengthVector() ||
6967 (VT.isFloatingPoint() && !VT.isScalableVector());
6968}
6969
6970SDValue AArch64TargetLowering::LowerFormalArguments(
6971 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
6972 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
6973 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
6975 const Function &F = MF.getFunction();
6976 MachineFrameInfo &MFI = MF.getFrameInfo();
6977 bool IsWin64 = Subtarget->isCallingConvWin64(F.getCallingConv());
6978 bool StackViaX4 = CallConv == CallingConv::ARM64EC_Thunk_X64 ||
6979 (isVarArg && Subtarget->isWindowsArm64EC());
6981
6983 GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
6985 if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
6986 FuncInfo->setIsSVECC(true);
6987
6988 // Assign locations to all of the incoming arguments.
6990 DenseMap<unsigned, SDValue> CopiedRegs;
6991 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
6992
6993 // At this point, Ins[].VT may already be promoted to i32. To correctly
6994 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
6995 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
6996 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
6997 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
6998 // LocVT.
6999 unsigned NumArgs = Ins.size();
7000 Function::const_arg_iterator CurOrigArg = F.arg_begin();
7001 unsigned CurArgIdx = 0;
7002 for (unsigned i = 0; i != NumArgs; ++i) {
7003 MVT ValVT = Ins[i].VT;
7004 if (Ins[i].isOrigArg()) {
7005 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
7006 CurArgIdx = Ins[i].getOrigArgIndex();
7007
7008 // Get type of the original argument.
7009 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
7010 /*AllowUnknown*/ true);
7011 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
7012 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
7013 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
7014 ValVT = MVT::i8;
7015 else if (ActualMVT == MVT::i16)
7016 ValVT = MVT::i16;
7017 }
7018 bool UseVarArgCC = false;
7019 if (IsWin64)
7020 UseVarArgCC = isVarArg;
7021 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
7022 bool Res =
7023 AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
7024 assert(!Res && "Call operand has unhandled type");
7025 (void)Res;
7026 }
7027
7029 bool IsLocallyStreaming =
7030 !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
7031 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
7032 SDValue Glue = Chain.getValue(1);
7033
7034 SmallVector<SDValue, 16> ArgValues;
7035 unsigned ExtraArgLocs = 0;
7036 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
7037 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
7038
7039 if (Ins[i].Flags.isByVal()) {
7040 // Byval is used for HFAs in the PCS, but the system should work in a
7041 // non-compliant manner for larger structs.
7042 EVT PtrVT = getPointerTy(DAG.getDataLayout());
7043 int Size = Ins[i].Flags.getByValSize();
7044 unsigned NumRegs = (Size + 7) / 8;
7045
7046 // FIXME: This works on big-endian for composite byvals, which are the common
 7047 // case. It should work for fundamental types too.
7048 unsigned FrameIdx =
7049 MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
7050 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
7051 InVals.push_back(FrameIdxN);
7052
7053 continue;
7054 }
7055
7056 if (Ins[i].Flags.isSwiftAsync())
7058
7059 SDValue ArgValue;
7060 if (VA.isRegLoc()) {
7061 // Arguments stored in registers.
7062 EVT RegVT = VA.getLocVT();
7063 const TargetRegisterClass *RC;
7064
7065 if (RegVT == MVT::i32)
7066 RC = &AArch64::GPR32RegClass;
7067 else if (RegVT == MVT::i64)
7068 RC = &AArch64::GPR64RegClass;
7069 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
7070 RC = &AArch64::FPR16RegClass;
7071 else if (RegVT == MVT::f32)
7072 RC = &AArch64::FPR32RegClass;
7073 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
7074 RC = &AArch64::FPR64RegClass;
7075 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
7076 RC = &AArch64::FPR128RegClass;
7077 else if (RegVT.isScalableVector() &&
7078 RegVT.getVectorElementType() == MVT::i1) {
7079 FuncInfo->setIsSVECC(true);
7080 RC = &AArch64::PPRRegClass;
7081 } else if (RegVT == MVT::aarch64svcount) {
7082 FuncInfo->setIsSVECC(true);
7083 RC = &AArch64::PPRRegClass;
7084 } else if (RegVT.isScalableVector()) {
7085 FuncInfo->setIsSVECC(true);
7086 RC = &AArch64::ZPRRegClass;
7087 } else
7088 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
7089
7090 // Transform the arguments in physical registers into virtual ones.
7091 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
7092
7093 if (IsLocallyStreaming) {
7094 // LocallyStreamingFunctions must insert the SMSTART in the correct
7095 // position, so we use Glue to ensure no instructions can be scheduled
7096 // between the chain of:
7097 // t0: ch,glue = EntryNode
7098 // t1: res,ch,glue = CopyFromReg
7099 // ...
7100 // tn: res,ch,glue = CopyFromReg t(n-1), ..
7101 // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
7102 // ^^^^^^
7103 // This will be the new Chain/Root node.
7104 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
7105 Glue = ArgValue.getValue(2);
7106 if (isPassedInFPR(ArgValue.getValueType())) {
7107 ArgValue =
7109 DAG.getVTList(ArgValue.getValueType(), MVT::Glue),
7110 {ArgValue, Glue});
7111 Glue = ArgValue.getValue(1);
7112 }
7113 } else
7114 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
7115
7116 // If this is an 8, 16 or 32-bit value, it is really passed promoted
7117 // to 64 bits. Insert an assert[sz]ext to capture this, then
7118 // truncate to the right size.
7119 switch (VA.getLocInfo()) {
7120 default:
7121 llvm_unreachable("Unknown loc info!");
7122 case CCValAssign::Full:
7123 break;
7125 assert(
7126 (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
7127 "Indirect arguments should be scalable on most subtargets");
7128 break;
7129 case CCValAssign::BCvt:
7130 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
7131 break;
7132 case CCValAssign::AExt:
7133 case CCValAssign::SExt:
7134 case CCValAssign::ZExt:
7135 break;
7137 ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
7138 DAG.getConstant(32, DL, RegVT));
7139 ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
7140 break;
7141 }
7142 } else { // VA.isRegLoc()
7143 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
7144 unsigned ArgOffset = VA.getLocMemOffset();
7145 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
7146 ? VA.getLocVT().getSizeInBits()
7147 : VA.getValVT().getSizeInBits()) / 8;
7148
7149 uint32_t BEAlign = 0;
7150 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
7151 !Ins[i].Flags.isInConsecutiveRegs())
7152 BEAlign = 8 - ArgSize;
7153
7154 SDValue FIN;
7155 MachinePointerInfo PtrInfo;
7156 if (StackViaX4) {
7157 // In both the ARM64EC varargs convention and the thunk convention,
7158 // arguments on the stack are accessed relative to x4, not sp. In
7159 // the thunk convention, there's an additional offset of 32 bytes
7160 // to account for the shadow store.
7161 unsigned ObjOffset = ArgOffset + BEAlign;
7162 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
7163 ObjOffset += 32;
7164 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
7165 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7166 FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
7167 DAG.getConstant(ObjOffset, DL, MVT::i64));
7169 } else {
7170 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
7171
7172 // Create load nodes to retrieve arguments from the stack.
7173 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
7174 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
7175 }
7176
 7177 // For NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
7179 MVT MemVT = VA.getValVT();
7180
7181 switch (VA.getLocInfo()) {
7182 default:
7183 break;
7184 case CCValAssign::Trunc:
7185 case CCValAssign::BCvt:
7186 MemVT = VA.getLocVT();
7187 break;
7190 Subtarget->isWindowsArm64EC()) &&
7191 "Indirect arguments should be scalable on most subtargets");
7192 MemVT = VA.getLocVT();
7193 break;
7194 case CCValAssign::SExt:
7195 ExtType = ISD::SEXTLOAD;
7196 break;
7197 case CCValAssign::ZExt:
7198 ExtType = ISD::ZEXTLOAD;
7199 break;
7200 case CCValAssign::AExt:
7201 ExtType = ISD::EXTLOAD;
7202 break;
7203 }
7204
7205 ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo,
7206 MemVT);
7207 }
7208
7209 if (VA.getLocInfo() == CCValAssign::Indirect) {
7210 assert((VA.getValVT().isScalableVT() ||
7211 Subtarget->isWindowsArm64EC()) &&
7212 "Indirect arguments should be scalable on most subtargets");
7213
7214 uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
7215 unsigned NumParts = 1;
7216 if (Ins[i].Flags.isInConsecutiveRegs()) {
7217 assert(!Ins[i].Flags.isInConsecutiveRegsLast());
7218 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
7219 ++NumParts;
7220 }
7221
7222 MVT PartLoad = VA.getValVT();
7223 SDValue Ptr = ArgValue;
7224
7225 // Ensure we generate all loads for each tuple part, whilst updating the
7226 // pointer after each load correctly using vscale.
7227 while (NumParts > 0) {
7228 ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
7229 InVals.push_back(ArgValue);
7230 NumParts--;
7231 if (NumParts > 0) {
7232 SDValue BytesIncrement;
7233 if (PartLoad.isScalableVector()) {
7234 BytesIncrement = DAG.getVScale(
7235 DL, Ptr.getValueType(),
7236 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
7237 } else {
7238 BytesIncrement = DAG.getConstant(
7239 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
7240 Ptr.getValueType());
7241 }
7243 Flags.setNoUnsignedWrap(true);
7244 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
7245 BytesIncrement, Flags);
7246 ExtraArgLocs++;
7247 i++;
7248 }
7249 }
7250 } else {
7251 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
7252 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
7253 ArgValue, DAG.getValueType(MVT::i32));
7254
7255 // i1 arguments are zero-extended to i8 by the caller. Emit a
7256 // hint to reflect this.
7257 if (Ins[i].isOrigArg()) {
7258 Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
7259 if (OrigArg->getType()->isIntegerTy(1)) {
7260 if (!Ins[i].Flags.isZExt()) {
7261 ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
7262 ArgValue.getValueType(), ArgValue);
7263 }
7264 }
7265 }
7266
7267 InVals.push_back(ArgValue);
7268 }
7269 }
7270 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
7271
7272 // Insert the SMSTART if this is a locally streaming function and
7273 // make sure it is Glued to the last CopyFromReg value.
7274 if (IsLocallyStreaming) {
7275 SDValue PStateSM;
7276 if (Attrs.hasStreamingCompatibleInterface()) {
7277 PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
7280 FuncInfo->setPStateSMReg(Reg);
7281 Chain = DAG.getCopyToReg(Chain, DL, Reg, PStateSM);
7282 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
7284 } else
7285 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
7287
7288 // Ensure that the SMSTART happens after the CopyWithChain such that its
7289 // chain result is used.
7290 for (unsigned I=0; I<InVals.size(); ++I) {
7292 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
7293 Chain = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
7294 InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg,
7295 InVals[I].getValueType());
7296 }
7297 }
7298
7299 // varargs
7300 if (isVarArg) {
7301 if (!Subtarget->isTargetDarwin() || IsWin64) {
7302 // The AAPCS variadic function ABI is identical to the non-variadic
7303 // one. As a result there may be more arguments in registers and we should
7304 // save them for future reference.
7305 // Win64 variadic functions also pass arguments in registers, but all float
7306 // arguments are passed in integer registers.
7307 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
7308 }
7309
7310 // This will point to the next argument passed via stack.
7311 unsigned VarArgsOffset = CCInfo.getStackSize();
7312 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
7313 VarArgsOffset = alignTo(VarArgsOffset, Subtarget->isTargetILP32() ? 4 : 8);
7314 FuncInfo->setVarArgsStackOffset(VarArgsOffset);
7315 FuncInfo->setVarArgsStackIndex(
7316 MFI.CreateFixedObject(4, VarArgsOffset, true));
7317
7318 if (MFI.hasMustTailInVarArgFunc()) {
7319 SmallVector<MVT, 2> RegParmTypes;
7320 RegParmTypes.push_back(MVT::i64);
7321 RegParmTypes.push_back(MVT::f128);
7322 // Compute the set of forwarded registers. The rest are scratch.
7324 FuncInfo->getForwardedMustTailRegParms();
7325 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
7327
7328 // Conservatively forward X8, since it might be used for aggregate return.
7329 if (!CCInfo.isAllocated(AArch64::X8)) {
7330 Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
7331 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
7332 }
7333 }
7334 }
7335
7336 // On Windows, InReg pointers must be returned, so record the pointer in a
7337 // virtual register at the start of the function so it can be returned in the
7338 // epilogue.
7339 if (IsWin64 || F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64) {
7340 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
7341 if ((F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64 ||
7342 Ins[I].Flags.isInReg()) &&
7343 Ins[I].Flags.isSRet()) {
7344 assert(!FuncInfo->getSRetReturnReg());
7345
7346 MVT PtrTy = getPointerTy(DAG.getDataLayout());
7347 Register Reg =
7349 FuncInfo->setSRetReturnReg(Reg);
7350
7351 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
7352 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
7353 break;
7354 }
7355 }
7356 }
7357
7358 unsigned StackArgSize = CCInfo.getStackSize();
7359 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
7360 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
7361 // This is a non-standard ABI so by fiat I say we're allowed to make full
7362 // use of the stack area to be popped, which must be aligned to 16 bytes in
7363 // any case:
7364 StackArgSize = alignTo(StackArgSize, 16);
7365
7366 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
7367 // a multiple of 16.
7368 FuncInfo->setArgumentStackToRestore(StackArgSize);
7369
7370 // This realignment carries over to the available bytes below. Our own
7371 // callers will guarantee the space is free by giving an aligned value to
7372 // CALLSEQ_START.
7373 }
7374 // Even if we're not expected to free up the space, it's useful to know how
7375 // much is there while considering tail calls (because we can reuse it).
7376 FuncInfo->setBytesInStackArgArea(StackArgSize);
7377
7378 if (Subtarget->hasCustomCallingConv())
7380
7381 // Conservatively assume the function requires the lazy-save mechanism.
7382 if (SMEAttrs(MF.getFunction()).hasZAState()) {
7383 unsigned TPIDR2Obj = allocateLazySaveBuffer(Chain, DL, DAG);
7384 FuncInfo->setLazySaveTPIDR2Obj(TPIDR2Obj);
7385 }
7386
7387 return Chain;
7388}
7389
7390void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
7391 SelectionDAG &DAG,
7392 const SDLoc &DL,
7393 SDValue &Chain) const {
7395 MachineFrameInfo &MFI = MF.getFrameInfo();
7397 auto PtrVT = getPointerTy(DAG.getDataLayout());
7398 bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
7399
7401
7403 unsigned NumGPRArgRegs = GPRArgRegs.size();
7404 if (Subtarget->isWindowsArm64EC()) {
7405 // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
7406 // functions.
7407 NumGPRArgRegs = 4;
7408 }
7409 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
7410
7411 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
7412 int GPRIdx = 0;
7413 if (GPRSaveSize != 0) {
7414 if (IsWin64) {
7415 GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
7416 if (GPRSaveSize & 15)
7417 // The extra size here, if triggered, will always be 8.
7418 MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
7419 } else
7420 GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
7421
7422 SDValue FIN;
7423 if (Subtarget->isWindowsArm64EC()) {
7424 // With the Arm64EC ABI, we reserve the save area as usual, but we
7425 // compute its address relative to x4. For a normal AArch64->AArch64
7426 // call, x4 == sp on entry, but calls from an entry thunk can pass in a
7427 // different address.
7428 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
7429 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7430 FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
7431 DAG.getConstant(GPRSaveSize, DL, MVT::i64));
7432 } else {
7433 FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
7434 }
7435
7436 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
7437 Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
7438 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7439 SDValue Store =
7440 DAG.getStore(Val.getValue(1), DL, Val, FIN,
7442 MF, GPRIdx, (i - FirstVariadicGPR) * 8)
7443 : MachinePointerInfo::getStack(MF, i * 8));
7444 MemOps.push_back(Store);
7445 FIN =
7446 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
7447 }
7448 }
7449 FuncInfo->setVarArgsGPRIndex(GPRIdx);
7450 FuncInfo->setVarArgsGPRSize(GPRSaveSize);
7451
7452 if (Subtarget->hasFPARMv8() && !IsWin64) {
7454 const unsigned NumFPRArgRegs = FPRArgRegs.size();
7455 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
7456
7457 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
7458 int FPRIdx = 0;
7459 if (FPRSaveSize != 0) {
7460 FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
7461
7462 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
7463
7464 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
7465 Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
7466 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
7467
7468 SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
7469 MachinePointerInfo::getStack(MF, i * 16));
7470 MemOps.push_back(Store);
7471 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
7472 DAG.getConstant(16, DL, PtrVT));
7473 }
7474 }
7475 FuncInfo->setVarArgsFPRIndex(FPRIdx);
7476 FuncInfo->setVarArgsFPRSize(FPRSaveSize);
7477 }
7478
7479 if (!MemOps.empty()) {
7480 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
7481 }
7482}
7483
7484/// LowerCallResult - Lower the result values of a call into the
7485/// appropriate copies out of appropriate physical registers.
7486SDValue AArch64TargetLowering::LowerCallResult(
7487 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
7488 const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
7489 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
7490 SDValue ThisVal, bool RequiresSMChange) const {
7491 DenseMap<unsigned, SDValue> CopiedRegs;
7492 // Copy all of the result registers out of their specified physreg.
7493 for (unsigned i = 0; i != RVLocs.size(); ++i) {
7494 CCValAssign VA = RVLocs[i];
7495
7496 // Pass 'this' value directly from the argument to return value, to avoid
7497 // reg unit interference
7498 if (i == 0 && isThisReturn) {
7499 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
7500 "unexpected return calling convention register assignment");
7501 InVals.push_back(ThisVal);
7502 continue;
7503 }
7504
7505 // Avoid copying a physreg twice since RegAllocFast is incompetent and only
7506 // allows one use of a physreg per block.
7507 SDValue Val = CopiedRegs.lookup(VA.getLocReg());
7508 if (!Val) {
7509 Val =
7510 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
7511 Chain = Val.getValue(1);
7512 InGlue = Val.getValue(2);
7513 CopiedRegs[VA.getLocReg()] = Val;
7514 }
7515
7516 switch (VA.getLocInfo()) {
7517 default:
7518 llvm_unreachable("Unknown loc info!");
7519 case CCValAssign::Full:
7520 break;
7521 case CCValAssign::BCvt:
7522 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
7523 break;
7525 Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
7526 DAG.getConstant(32, DL, VA.getLocVT()));
7527 [[fallthrough]];
7528 case CCValAssign::AExt:
7529 [[fallthrough]];
7530 case CCValAssign::ZExt:
7531 Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
7532 break;
7533 }
7534
7535 if (RequiresSMChange && isPassedInFPR(VA.getValVT()))
7537 Val);
7538
7539 InVals.push_back(Val);
7540 }
7541
7542 return Chain;
7543}
7544
7545/// Return true if the calling convention is one that we can guarantee TCO for.
7546static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
7547 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
7549}
7550
7551/// Return true if we might ever do TCO for calls with this calling convention.
7553 switch (CC) {
7554 case CallingConv::C:
7558 case CallingConv::Swift:
7560 case CallingConv::Tail:
7561 case CallingConv::Fast:
7562 return true;
7563 default:
7564 return false;
7565 }
7566}
7567
7569 const AArch64Subtarget *Subtarget,
7571 CCState &CCInfo) {
7572 const SelectionDAG &DAG = CLI.DAG;
7573 CallingConv::ID CalleeCC = CLI.CallConv;
7574 bool IsVarArg = CLI.IsVarArg;
7575 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7576 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
7577
7578 // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack
7579 // for the shadow store.
7580 if (CalleeCC == CallingConv::ARM64EC_Thunk_X64)
7581 CCInfo.AllocateStack(32, Align(16));
7582
7583 unsigned NumArgs = Outs.size();
7584 for (unsigned i = 0; i != NumArgs; ++i) {
7585 MVT ArgVT = Outs[i].VT;
7586 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
7587
7588 bool UseVarArgCC = false;
7589 if (IsVarArg) {
7590 // On Windows, the fixed arguments in a vararg call are passed in GPRs
7591 // too, so use the vararg CC to force them to integer registers.
7592 if (IsCalleeWin64) {
7593 UseVarArgCC = true;
7594 } else {
7595 UseVarArgCC = !Outs[i].IsFixed;
7596 }
7597 }
7598
7599 if (!UseVarArgCC) {
7600 // Get type of the original argument.
7601 EVT ActualVT =
7602 TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty,
7603 /*AllowUnknown*/ true);
7604 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
7605 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
7606 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
7607 ArgVT = MVT::i8;
7608 else if (ActualMVT == MVT::i16)
7609 ArgVT = MVT::i16;
7610 }
7611
7612 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
7613 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
7614 assert(!Res && "Call operand has unhandled type");
7615 (void)Res;
7616 }
7617}
7618
7619bool AArch64TargetLowering::isEligibleForTailCallOptimization(
7620 const CallLoweringInfo &CLI) const {
7621 CallingConv::ID CalleeCC = CLI.CallConv;
7622 if (!mayTailCallThisCC(CalleeCC))
7623 return false;
7624
7625 SDValue Callee = CLI.Callee;
7626 bool IsVarArg = CLI.IsVarArg;
7627 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7628 const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
7629 const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
7630 const SelectionDAG &DAG = CLI.DAG;
7631 MachineFunction &MF = DAG.getMachineFunction();
7632 const Function &CallerF = MF.getFunction();
7633 CallingConv::ID CallerCC = CallerF.getCallingConv();
7634
7635 // SME Streaming functions are not eligible for TCO as they may require
7636 // the streaming mode or ZA to be restored after returning from the call.
7637 SMEAttrs CallerAttrs(MF.getFunction());
7638 auto CalleeAttrs = CLI.CB ? SMEAttrs(*CLI.CB) : SMEAttrs(SMEAttrs::Normal);
7639 if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
7640 CallerAttrs.requiresLazySave(CalleeAttrs) ||
7641 CallerAttrs.hasStreamingBody())
7642 return false;
7643
7644 // Functions using the C or Fast calling convention that have an SVE signature
7645 // preserve more registers and should assume the SVE_VectorCall CC.
7646 // The check for matching callee-saved regs will determine whether it is
7647 // eligible for TCO.
7648 if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
7649 MF.getInfo<AArch64FunctionInfo>()->isSVECC())
7650 CallerCC = CallingConv::AArch64_SVE_VectorCall;
7651
7652 bool CCMatch = CallerCC == CalleeCC;
7653
7654 // When using the Windows calling convention on a non-windows OS, we want
7655 // to back up and restore X18 in such functions; we can't do a tail call
7656 // from those functions.
7657 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
7658 CalleeCC != CallingConv::Win64)
7659 return false;
7660
7661 // Byval parameters hand the function a pointer directly into the stack area
7662 // we want to reuse during a tail call. Working around this *is* possible (see
7663 // X86) but less efficient and uglier in LowerCall.
7664 for (Function::const_arg_iterator i = CallerF.arg_begin(),
7665 e = CallerF.arg_end();
7666 i != e; ++i) {
7667 if (i->hasByValAttr())
7668 return false;
7669
7670 // On Windows, "inreg" attributes signify non-aggregate indirect returns.
7671 // In this case, it is necessary to save/restore X0 in the callee. Tail
7672 // call opt interferes with this. So we disable tail call opt when the
7673 // caller has an argument with "inreg" attribute.
7674
7675 // FIXME: Check whether the callee also has an "inreg" argument.
7676 if (i->hasInRegAttr())
7677 return false;
7678 }
7679
7680 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
7681 return CCMatch;
7682
7683 // Externally-defined functions with weak linkage should not be
7684 // tail-called on AArch64 when the OS does not support dynamic
7685 // pre-emption of symbols, as the AAELF spec requires normal calls
7686 // to undefined weak functions to be replaced with a NOP or jump to the
7687 // next instruction. The behaviour of branch instructions in this
7688 // situation (as used for tail calls) is implementation-defined, so we
7689 // cannot rely on the linker replacing the tail call with a return.
7690 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
7691 const GlobalValue *GV = G->getGlobal();
7692 const Triple &TT = getTargetMachine().getTargetTriple();
7693 if (GV->hasExternalWeakLinkage() &&
7694 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
7695 return false;
7696 }
7697
7698 // Now we search for cases where we can use a tail call without changing the
7699 // ABI. Sibcall is used in some places (particularly gcc) to refer to this
7700 // concept.
7701
7702 // I want anyone implementing a new calling convention to think long and hard
7703 // about this assert.
7704 assert((!IsVarArg || CalleeCC == CallingConv::C) &&
7705 "Unexpected variadic calling convention");
7706
7707 LLVMContext &C = *DAG.getContext();
7708 // Check that the call results are passed in the same way.
7709 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
7710 CCAssignFnForCall(CalleeCC, IsVarArg),
7711 CCAssignFnForCall(CallerCC, IsVarArg)))
7712 return false;
7713 // The callee has to preserve all registers the caller needs to preserve.
7714 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
7715 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
7716 if (!CCMatch) {
7717 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
7718 if (Subtarget->hasCustomCallingConv()) {
7719 TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
7720 TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
7721 }
7722 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
7723 return false;
7724 }
7725
7726 // Nothing more to check if the callee is taking no arguments
7727 if (Outs.empty())
7728 return true;
7729
7730 SmallVector<CCValAssign, 16> ArgLocs;
7731 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
7732
7733 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
7734
7735 if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
7736 // When we are musttail, additional checks have been done and we can safely ignore this check
7737 // At least two cases here: if caller is fastcc then we can't have any
7738 // memory arguments (we'd be expected to clean up the stack afterwards). If
7739 // caller is C then we could potentially use its argument area.
7740
7741 // FIXME: for now we take the most conservative of these in both cases:
7742 // disallow all variadic memory operands.
7743 for (const CCValAssign &ArgLoc : ArgLocs)
7744 if (!ArgLoc.isRegLoc())
7745 return false;
7746 }
7747
7748 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7749
7750 // If any of the arguments is passed indirectly, it must be SVE, so the
7751 // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
7752 // allocate space on the stack. That is why we determine explicitly here that
7753 // such a call cannot be a tail call.
7754 if (llvm::any_of(ArgLocs, [&](CCValAssign &A) {
7755 assert((A.getLocInfo() != CCValAssign::Indirect ||
7756 A.getValVT().isScalableVector() ||
7757 Subtarget->isWindowsArm64EC()) &&
7758 "Expected value to be scalable");
7759 return A.getLocInfo() == CCValAssign::Indirect;
7760 }))
7761 return false;
7762
7763 // If the stack arguments for this call do not fit into our own save area then
7764 // the call cannot be made tail.
7765 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
7766 return false;
7767
7768 const MachineRegisterInfo &MRI = MF.getRegInfo();
7769 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
7770 return false;
7771
7772 return true;
7773}
7774
7775SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
7776 SelectionDAG &DAG,
7777 MachineFrameInfo &MFI,
7778 int ClobberedFI) const {
7779 SmallVector<SDValue, 8> ArgChains;
7780 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
7781 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
7782
7783 // Include the original chain at the beginning of the list. When this is
7784 // used by target LowerCall hooks, this helps legalize find the
7785 // CALLSEQ_BEGIN node.
7786 ArgChains.push_back(Chain);
7787
7788 // Add a chain value for each stack-argument load that overlaps the clobbered slot.
7789 for (SDNode *U : DAG.getEntryNode().getNode()->uses())
7790 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
7791 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
7792 if (FI->getIndex() < 0) {
7793 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
7794 int64_t InLastByte = InFirstByte;
7795 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
7796
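// The two byte ranges overlap if either range's start lies inside the other;
// only those overlapping incoming-argument loads need to be chained in.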
7797 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
7798 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
7799 ArgChains.push_back(SDValue(L, 1));
7800 }
7801
7802 // Build a tokenfactor for all the chains.
7803 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
7804}
7805
7806bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
7807 bool TailCallOpt) const {
7808 return (CallCC == CallingConv::Fast && TailCallOpt) ||
7809 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
7810}
7811
7812// Check if the value is zero-extended from i1 to i8
7813static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
7814 unsigned SizeInBits = Arg.getValueType().getSizeInBits();
7815 if (SizeInBits < 8)
7816 return false;
7817
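// A value is already a valid zero-extended i1 when every bit of the low byte
// above bit 0 (mask 0xFE) is known to be zero; the depth argument of 4 limits
// how far the known-bits analysis recurses.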
7818 APInt RequiredZero(SizeInBits, 0xFE);
7819 KnownBits Bits = DAG.computeKnownBits(Arg, 4);
7820 bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
7821 return ZExtBool;
7822}
7823
7824void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
7825 SDNode *Node) const {
7826 // Live-in physreg copies that are glued to SMSTART are applied as
7827 // implicit-def's in the InstrEmitter. Here we remove them, allowing the
7828 // register allocator to pass call args in callee saved regs, without extra
7829 // copies to avoid these fake clobbers of actually-preserved GPRs.
7830 if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
7831 MI.getOpcode() == AArch64::MSRpstatePseudo) {
7832 for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
7833 if (MachineOperand &MO = MI.getOperand(I);
7834 MO.isReg() && MO.isImplicit() && MO.isDef() &&
7835 (AArch64::GPR32RegClass.contains(MO.getReg()) ||
7836 AArch64::GPR64RegClass.contains(MO.getReg())))
7837 MI.removeOperand(I);
7838
7839 // The SVE vector length can change when entering/leaving streaming mode.
7840 if (MI.getOperand(0).getImm() == AArch64SVCR::SVCRSM ||
7841 MI.getOperand(0).getImm() == AArch64SVCR::SVCRSMZA) {
7842 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
7843 /*IsImplicit=*/true));
7844 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/true,
7845 /*IsImplicit=*/true));
7846 }
7847 }
7848
7849 // Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
7850 // have nothing to do with VG, were it not that they are used to materialise a
7851 // frame-address. If they contain a frame-index to a scalable vector, this
7852 // will likely require an ADDVL instruction to materialise the address, thus
7853 // reading VG.
7854 const MachineFunction &MF = *MI.getMF();
7855 if (MF.getInfo<AArch64FunctionInfo>()->hasStreamingModeChanges() &&
7856 (MI.getOpcode() == AArch64::ADDXri ||
7857 MI.getOpcode() == AArch64::SUBXri)) {
7858 const MachineOperand &MO = MI.getOperand(1);
7859 if (MO.isFI() && MF.getFrameInfo().getStackID(MO.getIndex()) ==
7860 TargetStackID::ScalableVector)
7861 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
7862 /*IsImplicit=*/true));
7863 }
7864}
7865
7866 SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
7867 bool Enable, SDValue Chain,
7868 SDValue InGlue,
7869 unsigned Condition,
7870 SDValue PStateSM) const {
7871 MachineFunction &MF = DAG.getMachineFunction();
7872 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7873 FuncInfo->setHasStreamingModeChanges(true);
7874
7875 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
7876 SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
7877 SDValue MSROp =
7878 DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);
7879 SDValue ConditionOp = DAG.getTargetConstant(Condition, DL, MVT::i64);
7880 SmallVector<SDValue> Ops = {Chain, MSROp, ConditionOp};
7881 if (Condition != AArch64SME::Always) {
7882 assert(PStateSM && "PStateSM should be defined");
7883 Ops.push_back(PStateSM);
7884 }
7885 Ops.push_back(RegMask);
7886
7887 if (InGlue)
7888 Ops.push_back(InGlue);
7889
7890 unsigned Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
7891 return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
7892}
7893
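// Decide whether a streaming-mode change around a call must happen
// unconditionally, or only when the caller's runtime PSTATE.SM differs from
// what the callee requires (the streaming-compatible caller case).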
7894static unsigned getSMCondition(const SMEAttrs &CallerAttrs,
7895 const SMEAttrs &CalleeAttrs) {
7896 if (!CallerAttrs.hasStreamingCompatibleInterface() ||
7897 CallerAttrs.hasStreamingBody())
7898 return AArch64SME::Always;
7899 if (CalleeAttrs.hasNonStreamingInterface())
7900 return AArch64SME::IfCallerIsStreaming;
7901 if (CalleeAttrs.hasStreamingInterface())
7902 return AArch64SME::IfCallerIsNonStreaming;
7903
7904 llvm_unreachable("Unsupported attributes");
7905}
7906
7907/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
7908/// and add input and output parameter nodes.
7909SDValue
7910AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
7911 SmallVectorImpl<SDValue> &InVals) const {
7912 SelectionDAG &DAG = CLI.DAG;
7913 SDLoc &DL = CLI.DL;
7914 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7915 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
7916 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
7917 SDValue Chain = CLI.Chain;
7918 SDValue Callee = CLI.Callee;
7919 bool &IsTailCall = CLI.IsTailCall;
7920 CallingConv::ID &CallConv = CLI.CallConv;
7921 bool IsVarArg = CLI.IsVarArg;
7922
7923 MachineFunction &MF = DAG.getMachineFunction();
7924 MachineFunction::CallSiteInfo CSInfo;
7925 bool IsThisReturn = false;
7926
7927 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7928 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
7929 bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
7930 bool IsSibCall = false;
7931 bool GuardWithBTI = false;
7932
7933 if (CLI.CB && CLI.CB->hasFnAttr(Attribute::ReturnsTwice) &&
7934 !Subtarget->noBTIAtReturnTwice()) {
7935 GuardWithBTI = FuncInfo->branchTargetEnforcement();
7936 }
7937
7938 // Analyze operands of the call, assigning locations to each operand.
7939 SmallVector<CCValAssign, 16> ArgLocs;
7940 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
7941
7942 if (IsVarArg) {
7943 unsigned NumArgs = Outs.size();
7944
7945 for (unsigned i = 0; i != NumArgs; ++i) {
7946 if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector())
7947 report_fatal_error("Passing SVE types to variadic functions is "
7948 "currently not supported");
7949 }
7950 }
7951
7952 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
7953
7954 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
7955 // Assign locations to each value returned by this call.
7956 SmallVector<CCValAssign, 16> RVLocs;
7957 CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
7958 *DAG.getContext());
7959 RetCCInfo.AnalyzeCallResult(Ins, RetCC);
7960
7961 // Check callee args/returns for SVE registers and set calling convention
7962 // accordingly.
7963 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
7964 auto HasSVERegLoc = [](CCValAssign &Loc) {
7965 if (!Loc.isRegLoc())
7966 return false;
7967 return AArch64::ZPRRegClass.contains(Loc.getLocReg()) ||
7968 AArch64::PPRRegClass.contains(Loc.getLocReg());
7969 };
7970 if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc))
7971 CallConv = CallingConv::AArch64_SVE_VectorCall;
7972 }
7973
7974 if (IsTailCall) {
7975 // Check if it's really possible to do a tail call.
7976 IsTailCall = isEligibleForTailCallOptimization(CLI);
7977
7978 // A sibling call is one where we're under the usual C ABI and not planning
7979 // to change that but can still do a tail call:
7980 if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail &&
7981 CallConv != CallingConv::SwiftTail)
7982 IsSibCall = true;
7983
7984 if (IsTailCall)
7985 ++NumTailCalls;
7986 }
7987
7988 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
7989 report_fatal_error("failed to perform tail call elimination on a call "
7990 "site marked musttail");
7991
7992 // Get a count of how many bytes are to be pushed on the stack.
7993 unsigned NumBytes = CCInfo.getStackSize();
7994
7995 if (IsSibCall) {
7996 // Since we're not changing the ABI to make this a tail call, the memory
7997 // operands are already available in the caller's incoming argument space.
7998 NumBytes = 0;
7999 }
8000
8001 // FPDiff is the byte offset of the call's argument area from the callee's.
8002 // Stores to callee stack arguments will be placed in FixedStackSlots offset
8003 // by this amount for a tail call. In a sibling call it must be 0 because the
8004 // caller will deallocate the entire stack and the callee still expects its
8005 // arguments to begin at SP+0. Completely unused for non-tail calls.
8006 int FPDiff = 0;
8007
8008 if (IsTailCall && !IsSibCall) {
8009 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
8010
8011 // Since callee will pop argument stack as a tail call, we must keep the
8012 // popped size 16-byte aligned.
8013 NumBytes = alignTo(NumBytes, 16);
8014
8015 // FPDiff will be negative if this tail call requires more space than we
8016 // would automatically have in our incoming argument space. Positive if we
8017 // can actually shrink the stack.
8018 FPDiff = NumReusableBytes - NumBytes;
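// For example (illustrative numbers): with 32 bytes of reusable incoming
// argument space and a tail call needing 48 bytes of outgoing arguments,
// FPDiff is -16 and the reserved tail-call area below grows by 16 bytes.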
8019
8020 // Update the required reserved area if this is the tail call requiring the
8021 // most argument stack space.
8022 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
8023 FuncInfo->setTailCallReservedStack(-FPDiff);
8024
8025 // The stack pointer must be 16-byte aligned at all times it's used for a
8026 // memory operation, which in practice means at *all* times and in
8027 // particular across call boundaries. Therefore our own arguments started at
8028 // a 16-byte aligned SP and the delta applied for the tail call should
8029 // satisfy the same constraint.
8030 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
8031 }
8032
8033 // Determine whether we need any streaming mode changes.
8034 SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction());
8035 if (CLI.CB)
8036 CalleeAttrs = SMEAttrs(*CLI.CB);
8037 else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
8038 CalleeAttrs = SMEAttrs(ES->getSymbol());
8039
8040 auto DescribeCallsite =
8041 [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & {
8042 R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '";
8043 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
8044 R << ore::NV("Callee", ES->getSymbol());
8045 else if (CLI.CB && CLI.CB->getCalledFunction())
8046 R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName());
8047 else
8048 R << "unknown callee";
8049 R << "'";
8050 return R;
8051 };
8052
8053 bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs);
8054 if (RequiresLazySave) {
8055 unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj();
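// The TPIDR2 block holds the ZA save-buffer pointer in its first eight bytes
// and a 16-bit count of ZA save slices at offset 8; RDSVL #1 below yields the
// streaming vector length in bytes, which equals the number of ZA slices.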
8056 MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj);
8057 SDValue TPIDR2ObjAddr = DAG.getFrameIndex(TPIDR2Obj,
8058 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
8059 SDValue NumZaSaveSlicesAddr =
8060 DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr,
8061 DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType()));
8062 SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8063 DAG.getConstant(1, DL, MVT::i32));
8064 Chain = DAG.getTruncStore(Chain, DL, NumZaSaveSlices, NumZaSaveSlicesAddr,
8065 MPI, MVT::i16);
8066 Chain = DAG.getNode(
8067 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
8068 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
8069 TPIDR2ObjAddr);
8070 OptimizationRemarkEmitter ORE(&MF.getFunction());
8071 ORE.emit([&]() {
8072 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
8073 CLI.CB)
8074 : OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
8075 &MF.getFunction());
8076 return DescribeCallsite(R) << " sets up a lazy save for ZA";
8077 });
8078 }
8079
8080 SDValue PStateSM;
8081 bool RequiresSMChange = CallerAttrs.requiresSMChange(CalleeAttrs);
8082 if (RequiresSMChange) {
8083 if (CallerAttrs.hasStreamingInterfaceOrBody())
8084 PStateSM = DAG.getConstant(1, DL, MVT::i64);
8085 else if (CallerAttrs.hasNonStreamingInterface())
8086 PStateSM = DAG.getConstant(0, DL, MVT::i64);
8087 else
8088 PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
8089 OptimizationRemarkEmitter ORE(&MF.getFunction());
8090 ORE.emit([&]() {
8091 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
8092 CLI.CB)
8093 : OptimizationRemarkAnalysis("sme", "SMETransition",
8094 &MF.getFunction());
8095 DescribeCallsite(R) << " requires a streaming mode transition";
8096 return R;
8097 });
8098 }
8099
8100 SDValue ZTFrameIdx;
8101 MachineFrameInfo &MFI = MF.getFrameInfo();
8102 bool ShouldPreserveZT0 = CallerAttrs.requiresPreservingZT0(CalleeAttrs);
8103
8104 // If the caller has ZT0 state which will not be preserved by the callee,
8105 // spill ZT0 before the call.
8106 if (ShouldPreserveZT0) {
8107 unsigned ZTObj = MFI.CreateSpillStackObject(64, Align(16));
8108 ZTFrameIdx = DAG.getFrameIndex(
8109 ZTObj,
8110 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
8111
8112 Chain = DAG.getNode(AArch64ISD::SAVE_ZT, DL, DAG.getVTList(MVT::Other),
8113 {Chain, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
8114 }
8115
8116 // If caller shares ZT0 but the callee is not shared ZA, we need to stop
8117 // PSTATE.ZA before the call if there is no lazy-save active.
8118 bool DisableZA = CallerAttrs.requiresDisablingZABeforeCall(CalleeAttrs);
8119 assert((!DisableZA || !RequiresLazySave) &&
8120 "Lazy-save should have PSTATE.SM=1 on entry to the function");
8121
8122 if (DisableZA)
8123 Chain = DAG.getNode(
8124 AArch64ISD::SMSTOP, DL, MVT::Other, Chain,
8125 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
8126 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
8127
8128 // Adjust the stack pointer for the new arguments...
8129 // These operations are automatically eliminated by the prolog/epilog pass
8130 if (!IsSibCall)
8131 Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
8132
8133 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
8134 getPointerTy(DAG.getDataLayout()));
8135
8136 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
8137 SmallSet<unsigned, 8> RegsUsed;
8138 SmallVector<SDValue, 8> MemOpChains;
8139 auto PtrVT = getPointerTy(DAG.getDataLayout());
8140
8141 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
8142 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
8143 for (const auto &F : Forwards) {
8144 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
8145 RegsToPass.emplace_back(F.PReg, Val);
8146 }
8147 }
8148
8149 // Walk the register/memloc assignments, inserting copies/loads.
8150 unsigned ExtraArgLocs = 0;
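// Indirectly-passed tuples consume several Outs entries but only a single
// entry in ArgLocs; ExtraArgLocs tracks the difference so VA below stays in
// sync with the current output value.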
8151 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
8152 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
8153 SDValue Arg = OutVals[i];
8154 ISD::ArgFlagsTy Flags = Outs[i].Flags;
8155
8156 // Promote the value if needed.
8157 switch (VA.getLocInfo()) {
8158 default:
8159 llvm_unreachable("Unknown loc info!");
8160 case CCValAssign::Full:
8161 break;
8162 case CCValAssign::SExt:
8163 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
8164 break;
8165 case CCValAssign::ZExt:
8166 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
8167 break;
8168 case CCValAssign::AExt:
8169 if (Outs[i].ArgVT == MVT::i1) {
8170 // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
8171 //
8172 // Check if we actually have to do this, because the value may
8173 // already be zero-extended.
8174 //
8175 // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
8176 // and rely on DAGCombiner to fold this, because the following
8177 // (anyext i32) is combined with (zext i8) in DAG.getNode:
8178 //
8179 // (ext (zext x)) -> (zext x)
8180 //
8181 // This will give us (zext i32), which we cannot remove, so
8182 // try to check this beforehand.
8183 if (!checkZExtBool(Arg, DAG)) {
8184 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
8185 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
8186 }
8187 }
8188 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
8189 break;
8190 case CCValAssign::AExtUpper:
8191 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
8192 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
8193 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
8194 DAG.getConstant(32, DL, VA.getLocVT()));
8195 break;
8196 case CCValAssign::BCvt:
8197 Arg = DAG.getBitcast(VA.getLocVT(), Arg);
8198 break;
8199 case CCValAssign::Trunc:
8200 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
8201 break;
8202 case CCValAssign::FPExt:
8203 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
8204 break;
8205 case CCValAssign::Indirect:
8206 bool isScalable = VA.getValVT().isScalableVT();
8207 assert((isScalable || Subtarget->isWindowsArm64EC()) &&
8208 "Indirect arguments should be scalable on most subtargets");
8209
8210 uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue();
8211 uint64_t PartSize = StoreSize;
8212 unsigned NumParts = 1;
8213 if (Outs[i].Flags.isInConsecutiveRegs()) {
8214 assert(!Outs[i].Flags.isInConsecutiveRegsLast());
8215 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
8216 ++NumParts;
8217 StoreSize *= NumParts;
8218 }
8219
8220 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
8221 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
8222 MachineFrameInfo &MFI = MF.getFrameInfo();
8223 int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
8224 if (isScalable)
8225 MFI.setStackID(FI, TargetStackID::ScalableVector);
8226
8227 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
8228 SDValue Ptr = DAG.getFrameIndex(
8229 FI, getPointerTy(DAG.getDataLayout(), DAG.getDataLayout().getAllocaAddrSpace()));
8230 SDValue SpillSlot = Ptr;
8231
8232 // Ensure we generate all stores for each tuple part, whilst updating the
8233 // pointer after each store correctly using vscale.
8234 while (NumParts) {
8235 SDValue Store = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
8236 MemOpChains.push_back(Store);
8237
8238 NumParts--;
8239 if (NumParts > 0) {
8240 SDValue BytesIncrement;
8241 if (isScalable) {
8242 BytesIncrement = DAG.getVScale(
8243 DL, Ptr.getValueType(),
8244 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
8245 } else {
8246 BytesIncrement = DAG.getConstant(
8247 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
8248 Ptr.getValueType());
8249 }
8250 SDNodeFlags Flags;
8251 Flags.setNoUnsignedWrap(true);
8252
8253 MPI = MachinePointerInfo(MPI.getAddrSpace());
8254 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8255 BytesIncrement, Flags);
8256 ExtraArgLocs++;
8257 i++;
8258 }
8259 }
8260
8261 Arg = SpillSlot;
8262 break;
8263 }
8264
8265 if (VA.isRegLoc()) {
8266 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
8267 Outs[0].VT == MVT::i64) {
8268 assert(VA.getLocVT() == MVT::i64 &&
8269 "unexpected calling convention register assignment");
8270 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
8271 "unexpected use of 'returned'");
8272 IsThisReturn = true;
8273 }
8274 if (RegsUsed.count(VA.getLocReg())) {
8275 // If this register has already been used then we're trying to pack
8276 // parts of an [N x i32] into an X-register. The extension type will
8277 // take care of putting the two halves in the right place but we have to
8278 // combine them.
8279 SDValue &Bits =
8280 llvm::find_if(RegsToPass,
8281 [=](const std::pair<unsigned, SDValue> &Elt) {
8282 return Elt.first == VA.getLocReg();
8283 })
8284 ->second;
8285 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
8286 // Call site info is used for function's parameter entry value
8287 // tracking. For now we track only simple cases when parameter
8288 // is transferred through whole register.
8289 llvm::erase_if(CSInfo.ArgRegPairs,
8290 [&VA](MachineFunction::ArgRegPair ArgReg) {
8291 return ArgReg.Reg == VA.getLocReg();
8292 });
8293 } else {
8294 // Add an extra level of indirection for streaming mode changes by
8295 // using a pseudo copy node that cannot be rematerialised between a
8296 // smstart/smstop and the call by the simple register coalescer.
8297 if (RequiresSMChange && isPassedInFPR(Arg.getValueType()))
8298 Arg = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8299 Arg.getValueType(), Arg);
8300 RegsToPass.emplace_back(VA.getLocReg(), Arg);
8301 RegsUsed.insert(VA.getLocReg());
8302 const TargetOptions &Options = DAG.getTarget().Options;
8303 if (Options.EmitCallSiteInfo)
8304 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
8305 }
8306 } else {
8307 assert(VA.isMemLoc());
8308
8309 SDValue DstAddr;
8310 MachinePointerInfo DstInfo;
8311
8312 // FIXME: This works on big-endian for composite byvals, which are the
8313 // common case. It should also work for fundamental types too.
8314 uint32_t BEAlign = 0;
8315 unsigned OpSize;
8316 if (VA.getLocInfo() == CCValAssign::Indirect ||
8317 VA.getValVT().isScalableVector())
8318 OpSize = VA.getLocVT().getFixedSizeInBits();
8319 else
8320 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
8321 : VA.getValVT().getSizeInBits();
8322 OpSize = (OpSize + 7) / 8;
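// On big-endian targets a sub-8-byte stack argument occupies the high-order
// bytes of its 8-byte slot, so bump the store address by the unused bytes.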
8323 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
8324 !Flags.isInConsecutiveRegs()) {
8325 if (OpSize < 8)
8326 BEAlign = 8 - OpSize;
8327 }
8328 unsigned LocMemOffset = VA.getLocMemOffset();
8329 int32_t Offset = LocMemOffset + BEAlign;
8330 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
8331 PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
8332
8333 if (IsTailCall) {
8334 Offset = Offset + FPDiff;
8335 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
8336
8337 DstAddr = DAG.getFrameIndex(FI, PtrVT);
8338 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
8339
8340 // Make sure any stack arguments overlapping with where we're storing
8341 // are loaded before this eventual operation. Otherwise they'll be
8342 // clobbered.
8343 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
8344 } else {
8345 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
8346
8347 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
8348 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
8349 }
8350
8351 if (Outs[i].Flags.isByVal()) {
8352 SDValue SizeNode =
8353 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
8354 SDValue Cpy = DAG.getMemcpy(
8355 Chain, DL, DstAddr, Arg, SizeNode,
8356 Outs[i].Flags.getNonZeroByValAlign(),
8357 /*isVol = */ false, /*AlwaysInline = */ false,
8358 /*isTailCall = */ false, DstInfo, MachinePointerInfo());
8359
8360 MemOpChains.push_back(Cpy);
8361 } else {
8362 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
8363 // promoted to a legal register type i32, we should truncate Arg back to
8364 // i1/i8/i16.
8365 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
8366 VA.getValVT() == MVT::i16)
8367 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
8368
8369 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
8370 MemOpChains.push_back(Store);
8371 }
8372 }
8373 }
8374
8375 if (IsVarArg && Subtarget->isWindowsArm64EC()) {
8376 SDValue ParamPtr = StackPtr;
8377 if (IsTailCall) {
8378 // Create a dummy object at the top of the stack that can be used to get
8379 // the SP after the epilogue
8380 int FI = MF.getFrameInfo().CreateFixedObject(1, FPDiff, true);
8381 ParamPtr = DAG.getFrameIndex(FI, PtrVT);
8382 }
8383
8384 // For vararg calls, the Arm64EC ABI requires values in x4 and x5
8385 // describing the argument list. x4 contains the address of the
8386 // first stack parameter. x5 contains the size in bytes of all parameters
8387 // passed on the stack.
8388 RegsToPass.emplace_back(AArch64::X4, ParamPtr);
8389 RegsToPass.emplace_back(AArch64::X5,
8390 DAG.getConstant(NumBytes, DL, MVT::i64));
8391 }
8392
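// Merge all pending argument stores into one token so the call emitted below
// is ordered after every one of them.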
8393 if (!MemOpChains.empty())
8394 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
8395
8396 SDValue InGlue;
8397 if (RequiresSMChange) {
8398 SDValue NewChain = changeStreamingMode(
8399 DAG, DL, CalleeAttrs.hasStreamingInterface(), Chain, InGlue,
8400 getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
8401 Chain = NewChain.getValue(0);
8402 InGlue = NewChain.getValue(1);
8403 }
8404
8405 // Build a sequence of copy-to-reg nodes chained together with token chain
8406 // and flag operands which copy the outgoing args into the appropriate regs.
8407 for (auto &RegToPass : RegsToPass) {
8408 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
8409 RegToPass.second, InGlue);
8410 InGlue = Chain.getValue(1);
8411 }
8412
8413 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
8414 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
8415 // node so that legalize doesn't hack it.
8416 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
8417 auto GV = G->getGlobal();
8418 unsigned OpFlags =
8419 Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine());
8420 if (OpFlags & AArch64II::MO_GOT) {
8421 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
8422 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
8423 } else {
8424 const GlobalValue *GV = G->getGlobal();
8425 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
8426 }
8427 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
8428 bool UseGot = (getTargetMachine().getCodeModel() == CodeModel::Large &&
8429 Subtarget->isTargetMachO()) ||
8430 MF.getFunction().getParent()->getRtLibUseGOT();
8431 const char *Sym = S->getSymbol();
8432 if (UseGot) {
8433 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
8434 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
8435 } else {
8436 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
8437 }
8438 }
8439
8440 // We don't usually want to end the call-sequence here because we would tidy
8441 // the frame up *after* the call, however in the ABI-changing tail-call case
8442 // we've carefully laid out the parameters so that when sp is reset they'll be
8443 // in the correct location.
8444 if (IsTailCall && !IsSibCall) {
8445 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL);
8446 InGlue = Chain.getValue(1);
8447 }
8448
8449 std::vector<SDValue> Ops;
8450 Ops.push_back(Chain);
8451 Ops.push_back(Callee);
8452
8453 if (IsTailCall) {
8454 // Each tail call may have to adjust the stack by a different amount, so
8455 // this information must travel along with the operation for eventual
8456 // consumption by emitEpilogue.
8457 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
8458 }
8459
8460 // Add argument registers to the end of the list so that they are known live
8461 // into the call.
8462 for (auto &RegToPass : RegsToPass)
8463 Ops.push_back(DAG.getRegister(RegToPass.first,
8464 RegToPass.second.getValueType()));
8465
8466 // Add a register mask operand representing the call-preserved registers.
8467 const uint32_t *Mask;
8468 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8469 if (IsThisReturn) {
8470 // For 'this' returns, use the X0-preserving mask if applicable
8471 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
8472 if (!Mask) {
8473 IsThisReturn = false;
8474 Mask = TRI->getCallPreservedMask(MF, CallConv);
8475 }
8476 } else
8477 Mask = TRI->getCallPreservedMask(MF, CallConv);
8478
8479 if (Subtarget->hasCustomCallingConv())
8480 TRI->UpdateCustomCallPreservedMask(MF, &Mask);
8481
8482 if (TRI->isAnyArgRegReserved(MF))
8483 TRI->emitReservedArgRegCallError(MF);
8484
8485 assert(Mask && "Missing call preserved mask for calling convention");
8486 Ops.push_back(DAG.getRegisterMask(Mask));
8487
8488 if (InGlue.getNode())
8489 Ops.push_back(InGlue);
8490
8491 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
8492
8493 // If we're doing a tail call, use a TC_RETURN here rather than an
8494 // actual call instruction.
8495 if (IsTailCall) {
8496 MF.getFrameInfo().setHasTailCall();
8497 SDValue Ret = DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
8498
8499 if (IsCFICall)
8500 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
8501
8502 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
8503 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
8504 return Ret;
8505 }
8506
8507 unsigned CallOpc = AArch64ISD::CALL;
8508 // Calls with operand bundle "clang.arc.attachedcall" are special. They should
8509 // be expanded to the call, directly followed by a special marker sequence and
8510 // a call to an ObjC library function. Use CALL_RVMARKER to do that.
8511 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
8512 assert(!IsTailCall &&
8513 "tail calls cannot be marked with clang.arc.attachedcall");
8514 CallOpc = AArch64ISD::CALL_RVMARKER;
8515
8516 // Add a target global address for the retainRV/claimRV runtime function
8517 // just before the call target.
8518 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
8519 auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
8520 Ops.insert(Ops.begin() + 1, GA);
8521 } else if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
8522 CallOpc = AArch64ISD::CALL_ARM64EC_TO_X64;
8523 } else if (GuardWithBTI) {
8524 CallOpc = AArch64ISD::CALL_BTI;
8525 }
8526
8527 // Returns a chain and a flag for retval copy to use.
8528 Chain = DAG.getNode(CallOpc, DL, NodeTys, Ops);
8529
8530 if (IsCFICall)
8531 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
8532
8533 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
8534 InGlue = Chain.getValue(1);
8535 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
8536
8537 uint64_t CalleePopBytes =
8538 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
8539
8540 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, DL);
8541 InGlue = Chain.getValue(1);
8542
8543 // Handle result values, copying them out of physregs into vregs that we
8544 // return.
8545 SDValue Result = LowerCallResult(
8546 Chain, InGlue, CallConv, IsVarArg, RVLocs, DL, DAG, InVals, IsThisReturn,
8547 IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange);
8548
8549 if (!Ins.empty())
8550 InGlue = Result.getValue(Result->getNumValues() - 1);
8551
8552 if (RequiresSMChange) {
8553 assert(PStateSM && "Expected a PStateSM to be set");
8554 Result = changeStreamingMode(
8555 DAG, DL, !CalleeAttrs.hasStreamingInterface(), Result, InGlue,
8556 getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
8557 }
8558
8559 if (CallerAttrs.requiresEnablingZAAfterCall(CalleeAttrs))
8560 // Unconditionally resume ZA.
8561 Result = DAG.getNode(
8562 AArch64ISD::SMSTART, DL, MVT::Other, Result,
8563 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
8564 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
8565
8566 if (ShouldPreserveZT0)
8567 Result =
8568 DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
8569 {Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
8570
8571 if (RequiresLazySave) {
8572 // Conditionally restore the lazy save using a pseudo node.
8573 unsigned FI = FuncInfo->getLazySaveTPIDR2Obj();
8574 SDValue RegMask = DAG.getRegisterMask(
8575 TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
8576 SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
8577 "__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout()));
8578 SDValue TPIDR2_EL0 = DAG.getNode(
8579 ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result,
8580 DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
8581
8582 // Copy the address of the TPIDR2 block into X0 before 'calling' the
8583 // RESTORE_ZA pseudo.
8584 SDValue Glue;
8585 SDValue TPIDR2Block = DAG.getFrameIndex(
8586 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
8587 Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
8588 Result =
8589 DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
8590 {Result, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64),
8591 RestoreRoutine, RegMask, Result.getValue(1)});
8592
8593 // Finally reset the TPIDR2_EL0 register to 0.
8594 Result = DAG.getNode(
8595 ISD::INTRINSIC_VOID, DL, MVT::Other, Result,
8596 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
8597 DAG.getConstant(0, DL, MVT::i64));
8598 }
8599
8600 if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0) {
8601 for (unsigned I = 0; I < InVals.size(); ++I) {
8602 // The smstart/smstop is chained as part of the call, but when the
8603 // resulting chain is discarded (which happens when the call is not part
8604 // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
8605 // smstart/smstop is chained to the result value. We can do that by doing
8606 // a vreg -> vreg copy.
8607 Register Reg = MF.getRegInfo().createVirtualRegister(
8608 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
8609 SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]);
8610 InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
8611 InVals[I].getValueType());
8612 }
8613 }
8614
8615 return Result;
8616}
8617
8618bool AArch64TargetLowering::CanLowerReturn(
8619 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
8620 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
8621 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
8622 SmallVector<CCValAssign, 16> RVLocs;
8623 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
8624 return CCInfo.CheckReturn(Outs, RetCC);
8625}
8626
8627SDValue
8628AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
8629 bool isVarArg,
8630 const SmallVectorImpl<ISD::OutputArg> &Outs,
8631 const SmallVectorImpl<SDValue> &OutVals,
8632 const SDLoc &DL, SelectionDAG &DAG) const {
8633 auto &MF = DAG.getMachineFunction();
8634 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8635
8636 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
8637 SmallVector<CCValAssign, 16> RVLocs;
8638 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
8639 CCInfo.AnalyzeReturn(Outs, RetCC);
8640
8641 // Copy the result values into the output registers.
8642 SDValue Glue;
8643 SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
8644 SmallSet<unsigned, 4> RegsUsed;
8645 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
8646 ++i, ++realRVLocIdx) {
8647 CCValAssign &VA = RVLocs[i];
8648 assert(VA.isRegLoc() && "Can only return in registers!");
8649 SDValue Arg = OutVals[realRVLocIdx];
8650
8651 switch (VA.getLocInfo()) {
8652 default:
8653 llvm_unreachable("Unknown loc info!");
8654 case CCValAssign::Full:
8655 if (Outs[i].ArgVT == MVT::i1) {
8656 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
8657 // value. This is strictly redundant on Darwin (which uses "zeroext
8658 // i1"), but will be optimised out before ISel.
8659 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
8660 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
8661 }
8662 break;
8663 case CCValAssign::BCvt:
8664 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
8665 break;
8666 case CCValAssign::AExt:
8667 case CCValAssign::ZExt:
8668 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
8669 break;
8670 case CCValAssign::AExtUpper:
8671 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
8672 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
8673 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
8674 DAG.getConstant(32, DL, VA.getLocVT()));
8675 break;
8676 }
8677
8678 if (RegsUsed.count(VA.getLocReg())) {
8679 SDValue &Bits =
8680 llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
8681 return Elt.first == VA.getLocReg();
8682 })->second;
8683 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
8684 } else {
8685 RetVals.emplace_back(VA.getLocReg(), Arg);
8686 RegsUsed.insert(VA.getLocReg());
8687 }
8688 }
8689
8690 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8691
8692 // Emit SMSTOP before returning from a locally streaming function
8693 SMEAttrs FuncAttrs(MF.getFunction());
8694 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
8695 if (FuncAttrs.hasStreamingCompatibleInterface()) {
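// A streaming-compatible locally-streaming function only switched into
// streaming mode on entry if the caller was not already streaming, so the
// SMSTOP here is conditional on the entry PSTATE.SM value saved in PStateSMReg.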
8696 Register Reg = FuncInfo->getPStateSMReg();
8697 assert(Reg.isValid() && "PStateSM Register is invalid");
8698 SDValue PStateSM = DAG.getCopyFromReg(Chain, DL, Reg, MVT::i64);
8699 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
8700 /*Glue*/ SDValue(),
8701 AArch64SME::IfCallerIsNonStreaming, PStateSM);
8702 } else
8703 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
8704 /*Glue*/ SDValue(), AArch64SME::Always);
8705 Glue = Chain.getValue(1);
8706 }
8707
8708 SmallVector<SDValue, 4> RetOps(1, Chain);
8709 for (auto &RetVal : RetVals) {
8710 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface() &&
8711 isPassedInFPR(RetVal.second.getValueType()))
8712 RetVal.second = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8713 RetVal.second.getValueType(), RetVal.second);
8714 Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Glue);
8715 Glue = Chain.getValue(1);
8716 RetOps.push_back(
8717 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
8718 }
8719
8720 // Windows AArch64 ABIs require that for returning structs by value we copy
8721 // the sret argument into X0 for the return.
8722 // We saved the argument into a virtual register in the entry block,
8723 // so now we copy the value out and into X0.
8724 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
8725 SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
8726 getPointerTy(MF.getDataLayout()));
8727
8728 unsigned RetValReg = AArch64::X0;
8729 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
8730 RetValReg = AArch64::X8;
8731 Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Glue);
8732 Glue = Chain.getValue(1);
8733
8734 RetOps.push_back(
8735 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
8736 }
8737
8738 const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
8739 if (I) {
8740 for (; *I; ++I) {
8741 if (AArch64::GPR64RegClass.contains(*I))
8742 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
8743 else if (AArch64::FPR64RegClass.contains(*I))
8744 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
8745 else
8746 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
8747 }
8748 }
8749
8750 RetOps[0] = Chain; // Update chain.
8751
8752 // Add the glue if we have it.
8753 if (Glue.getNode())
8754 RetOps.push_back(Glue);
8755
8756 if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
8757 // ARM64EC entry thunks use a special return sequence: instead of a regular
8758 // "ret" instruction, they need to explicitly call the emulator.
8759 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8760 SDValue Arm64ECRetDest =
8761 DAG.getExternalSymbol("__os_arm64x_dispatch_ret", PtrVT);
8762 Arm64ECRetDest =
8763 getAddr(cast<ExternalSymbolSDNode>(Arm64ECRetDest), DAG, 0);
8764 Arm64ECRetDest = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Arm64ECRetDest,
8765 MachinePointerInfo());
8766 RetOps.insert(RetOps.begin() + 1, Arm64ECRetDest);
8767 RetOps.insert(RetOps.begin() + 2, DAG.getTargetConstant(0, DL, MVT::i32));
8768 return DAG.getNode(AArch64ISD::TC_RETURN, DL, MVT::Other, RetOps);
8769 }
8770
8771 return DAG.getNode(AArch64ISD::RET_GLUE, DL, MVT::Other, RetOps);
8772}
8773
8774//===----------------------------------------------------------------------===//
8775// Other Lowering Code
8776//===----------------------------------------------------------------------===//
8777
8778SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
8779 SelectionDAG &DAG,
8780 unsigned Flag) const {
8781 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
8782 N->getOffset(), Flag);
8783}
8784
8785SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
8786 SelectionDAG &DAG,
8787 unsigned Flag) const {
8788 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
8789}
8790
8791SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
8792 SelectionDAG &DAG,
8793 unsigned Flag) const {
8794 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
8795 N->getOffset(), Flag);
8796}
8797
8798SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
8799 SelectionDAG &DAG,
8800 unsigned Flag) const {
8801 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
8802}
8803
8804SDValue AArch64TargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
8805 SelectionDAG &DAG,
8806 unsigned Flag) const {
8807 return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
8808}
8809
8810// (loadGOT sym)
8811template <class NodeTy>
8812SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
8813 unsigned Flags) const {
8814 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
8815 SDLoc DL(N);
8816 EVT Ty = getPointerTy(DAG.getDataLayout());
8817 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
8818 // FIXME: Once remat is capable of dealing with instructions with register
8819 // operands, expand this into two nodes instead of using a wrapper node.
8820 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
8821}
8822
8823// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
8824template <class NodeTy>
8825SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
8826 unsigned Flags) const {
8827 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
8828 SDLoc DL(N);
8829 EVT Ty = getPointerTy(DAG.getDataLayout());
8830 const unsigned char MO_NC = AArch64II::MO_NC;
8831 return DAG.getNode(
8832 AArch64ISD::WrapperLarge, DL, Ty,
8833 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
8834 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
8835 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
8836 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
8837}
8838
8839// (addlow (adrp %hi(sym)) %lo(sym))
8840template <class NodeTy>
8841SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
8842 unsigned Flags) const {
8843 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
8844 SDLoc DL(N);
8845 EVT Ty = getPointerTy(DAG.getDataLayout());
8846 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
8847 SDValue Lo = getTargetNode(N, Ty, DAG,
8848 AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
8849 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
8850 return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
8851}
8852
8853// (adr sym)
8854template <class NodeTy>
8855SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
8856 unsigned Flags) const {
8857 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
8858 SDLoc DL(N);
8859 EVT Ty = getPointerTy(DAG.getDataLayout());
8860 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
8861 return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
8862}
8863
8864SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
8865 SelectionDAG &DAG) const {
8866 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
8867 const GlobalValue *GV = GN->getGlobal();
8868 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
8869
8870 if (OpFlags != AArch64II::MO_NO_FLAG)
8871 assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
8872 "unexpected offset in global node");
8873
8874 // This also catches the large code model case for Darwin, and tiny code
8875 // model with got relocations.
8876 if ((OpFlags & AArch64II::MO_GOT) != 0) {
8877 return getGOT(GN, DAG, OpFlags);
8878 }
8879
8880 SDValue Result;
8881 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
8882 !getTargetMachine().isPositionIndependent()) {
8883 Result = getAddrLarge(GN, DAG, OpFlags);
8884 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
8885 Result = getAddrTiny(GN, DAG, OpFlags);
8886 } else {
8887 Result = getAddr(GN, DAG, OpFlags);
8888 }
8889 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8890 SDLoc DL(GN);
8891 if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB))
8892 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
8893 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
8894 return Result;
8895}
8896
8897/// Convert a TLS address reference into the correct sequence of loads
8898/// and calls to compute the variable's address (for Darwin, currently) and
8899/// return an SDValue containing the final node.
8900
8901/// Darwin only has one TLS scheme which must be capable of dealing with the
8902/// fully general situation, in the worst case. This means:
8903/// + "extern __thread" declaration.
8904/// + Defined in a possibly unknown dynamic library.
8905///
8906/// The general system is that each __thread variable has a [3 x i64] descriptor
8907/// which contains information used by the runtime to calculate the address. The
8908/// only part of this the compiler needs to know about is the first xword, which
8909/// contains a function pointer that must be called with the address of the
8910/// entire descriptor in "x0".
8911///
8912/// Since this descriptor may be in a different unit, in general even the
8913/// descriptor must be accessed via an indirect load. The "ideal" code sequence
8914/// is:
8915/// adrp x0, _var@TLVPPAGE
8916/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
8917/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
8918/// ; the function pointer
8919/// blr x1 ; Uses descriptor address in x0
8920/// ; Address of _var is now in x0.
8921///
8922/// If the address of _var's descriptor *is* known to the linker, then it can
8923/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
8924/// a slight efficiency gain.
8925SDValue
8926AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
8927 SelectionDAG &DAG) const {
8928 assert(Subtarget->isTargetDarwin() &&
8929 "This function expects a Darwin target");
8930
8931 SDLoc DL(Op);
8932 MVT PtrVT = getPointerTy(DAG.getDataLayout());
8933 MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
8934 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
8935
8936 SDValue TLVPAddr =
8937 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
8938 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
8939
8940 // The first entry in the descriptor is a function pointer that we must call
8941 // to obtain the address of the variable.
8942 SDValue Chain = DAG.getEntryNode();
8943 SDValue FuncTLVGet = DAG.getLoad(
8944 PtrMemVT, DL, Chain, DescAddr,
8945 MachinePointerInfo::getGOT(DAG.getMachineFunction()),
8946 Align(PtrMemVT.getSizeInBits() / 8),
8947 MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
8948 Chain = FuncTLVGet.getValue(1);
8949
8950 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
8951 FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
8952
8953 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
8954 MFI.setAdjustsStack(true);
8955
8956 // TLS calls preserve all registers except those that absolutely must be
8957 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
8958 // silly).
8959 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8960 const uint32_t *Mask = TRI->getTLSCallPreservedMask();
8961 if (Subtarget->hasCustomCallingConv())
8962 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
8963
8964 // Finally, we can make the call. This is just a degenerate version of a
8965 // normal AArch64 call node: x0 takes the address of the descriptor, and
8966 // returns the address of the variable in this thread.
8967 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
8968 Chain =
8969 DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
8970 Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
8971 DAG.getRegisterMask(Mask), Chain.getValue(1));
8972 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
8973}
8974
8975/// Convert a thread-local variable reference into a sequence of instructions to
8976/// compute the variable's address for the local exec TLS model of ELF targets.
8977/// The sequence depends on the maximum TLS area size.
8978SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
8979 SDValue ThreadBase,
8980 const SDLoc &DL,
8981 SelectionDAG &DAG) const {
8982 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8983 SDValue TPOff, Addr;
8984
8985 switch (DAG.getTarget().Options.TLSSize) {
8986 default:
8987 llvm_unreachable("Unexpected TLS size");
8988
8989 case 12: {
8990 // mrs x0, TPIDR_EL0
8991 // add x0, x0, :tprel_lo12:a
8992 SDValue Var = DAG.getTargetGlobalAddress(
8993 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
8994 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
8995 Var,
8996 DAG.getTargetConstant(0, DL, MVT::i32)),
8997 0);
8998 }
8999
9000 case 24: {
9001 // mrs x0, TPIDR_EL0
9002 // add x0, x0, :tprel_hi12:a
9003 // add x0, x0, :tprel_lo12_nc:a
9004 SDValue HiVar = DAG.getTargetGlobalAddress(
9005 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
9006 SDValue LoVar = DAG.getTargetGlobalAddress(
9007 GV, DL, PtrVT, 0,
9008 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9009 Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
9010 HiVar,
9011 DAG.getTargetConstant(0, DL, MVT::i32)),
9012 0);
9013 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
9014 LoVar,
9015 DAG.getTargetConstant(0, DL, MVT::i32)),
9016 0);
9017 }
9018
9019 case 32: {
9020 // mrs x1, TPIDR_EL0
9021 // movz x0, #:tprel_g1:a
9022 // movk x0, #:tprel_g0_nc:a
9023 // add x0, x1, x0
9024 SDValue HiVar = DAG.getTargetGlobalAddress(
9025 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
9026 SDValue LoVar = DAG.getTargetGlobalAddress(
9027 GV, DL, PtrVT, 0,
9028 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
9029 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
9030 DAG.getTargetConstant(16, DL, MVT::i32)),
9031 0);
9032 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
9033 DAG.getTargetConstant(0, DL, MVT::i32)),
9034 0);
9035 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
9036 }
9037
9038 case 48: {
9039 // mrs x1, TPIDR_EL0
9040 // movz x0, #:tprel_g2:a
9041 // movk x0, #:tprel_g1_nc:a
9042 // movk x0, #:tprel_g0_nc:a
9043 // add x0, x1, x0
9044 SDValue HiVar = DAG.getTargetGlobalAddress(
9045 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
9046 SDValue MiVar = DAG.getTargetGlobalAddress(
9047 GV, DL, PtrVT, 0,
9048 AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
9049 SDValue LoVar = DAG.getTargetGlobalAddress(
9050 GV, DL, PtrVT, 0,
9051 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
9052 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
9053 DAG.getTargetConstant(32, DL, MVT::i32)),
9054 0);
9055 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
9056 DAG.getTargetConstant(16, DL, MVT::i32)),
9057 0);
9058 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
9059 DAG.getTargetConstant(0, DL, MVT::i32)),
9060 0);
9061 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
9062 }
9063 }
9064}
9065
9066/// When accessing thread-local variables under either the general-dynamic or
9067/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
9068/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
9069/// is a function pointer to carry out the resolution.
9070///
9071/// The sequence is:
9072/// adrp x0, :tlsdesc:var
9073/// ldr x1, [x0, #:tlsdesc_lo12:var]
9074/// add x0, x0, #:tlsdesc_lo12:var
9075/// .tlsdesccall var
9076/// blr x1
9077/// (TPIDR_EL0 offset now in x0)
9078///
9079/// The above sequence must be produced unscheduled, to enable the linker to
9080/// optimize/relax this sequence.
9081/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
9082/// above sequence, and expanded really late in the compilation flow, to ensure
9083/// the sequence is produced as per above.
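///
/// Keeping the four instructions and their relocations together is also what
/// lets a linker relax the descriptor call, for example into an initial-exec
/// (ADRP + LDR from the GOT) or local-exec (MOVZ/MOVK or ADD) access when the
/// variable turns out to be resolvable at static link time.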
9084SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
9085 const SDLoc &DL,
9086 SelectionDAG &DAG) const {
9087 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9088
9089 SDValue Chain = DAG.getEntryNode();
9090 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
9091
9092 Chain =
9093 DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
9094 SDValue Glue = Chain.getValue(1);
9095
9096 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
9097}
9098
9099SDValue
9100AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
9101 SelectionDAG &DAG) const {
9102 assert(Subtarget->isTargetELF() && "This function expects an ELF target");
9103
9104  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
9105
9106  TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
9107
9108  if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
9109    if (Model == TLSModel::LocalDynamic)
9110      Model = TLSModel::GeneralDynamic;
9111  }
9112
9113  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
9114      Model != TLSModel::LocalExec)
9115 report_fatal_error("ELF TLS only supported in small memory model or "
9116 "in local exec TLS model");
9117 // Different choices can be made for the maximum size of the TLS area for a
9118 // module. For the small address model, the default TLS size is 16MiB and the
9119 // maximum TLS size is 4GiB.
9120 // FIXME: add tiny and large code model support for TLS access models other
9121 // than local exec. We currently generate the same code as small for tiny,
9122 // which may be larger than needed.
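  // The sequence is selected from DAG.getTarget().Options.TLSSize (see
  // LowerELFTLSLocalExec above): 12- and 24-bit offsets use ADD immediates,
  // while 32- and 48-bit offsets are built with MOVZ/MOVK.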
9123
9124 SDValue TPOff;
9125 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9126 SDLoc DL(Op);
9127 const GlobalValue *GV = GA->getGlobal();
9128
9129 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
9130
9131 if (Model == TLSModel::LocalExec) {
9132 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
9133 } else if (Model == TLSModel::InitialExec) {
9134 TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
9135 TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
9136 } else if (Model == TLSModel::LocalDynamic) {
9137 // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
9138 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
9139 // the beginning of the module's TLS region, followed by a DTPREL offset
9140 // calculation.
9141
9142 // These accesses will need deduplicating if there's more than one.
9143    AArch64FunctionInfo *MFI =
9144        DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
9145    MFI->incNumLocalDynamicTLSAccesses();
9146
9147 // The call needs a relocation too for linker relaxation. It doesn't make
9148 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
9149 // the address.
9150    SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
9151                                                  AArch64II::MO_TLS);
9152
9153 // Now we can calculate the offset from TPIDR_EL0 to this module's
9154 // thread-local area.
9155 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
9156
9157 // Now use :dtprel_whatever: operations to calculate this variable's offset
9158 // in its thread-storage area.
9159 SDValue HiVar = DAG.getTargetGlobalAddress(
9160 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
9161 SDValue LoVar = DAG.getTargetGlobalAddress(
9162        GV, DL, MVT::i64, 0,
9163        AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9164
9165 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
9166 DAG.getTargetConstant(0, DL, MVT::i32)),
9167 0);
9168 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
9169 DAG.getTargetConstant(0, DL, MVT::i32)),
9170 0);
9171 } else if (Model == TLSModel::GeneralDynamic) {
9172 // The call needs a relocation too for linker relaxation. It doesn't make
9173 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
9174 // the address.
9175 SDValue SymAddr =
9176 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
9177
9178 // Finally we can make a call to calculate the offset from tpidr_el0.
9179 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
9180 } else
9181 llvm_unreachable("Unsupported ELF TLS access model");
9182
9183 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
9184}
9185
9186SDValue
9187AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
9188 SelectionDAG &DAG) const {
9189 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
9190
9191 SDValue Chain = DAG.getEntryNode();
9192 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9193 SDLoc DL(Op);
9194
9195 SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
9196
9197 // Load the ThreadLocalStoragePointer from the TEB
9198 // A pointer to the TLS array is located at offset 0x58 from the TEB.
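  // In rough C terms the access computed below is:
  //   char *TlsArray = *(char **)(TEB + 0x58);
  //   char *TlsBase  = *(char **)(TlsArray + _tls_index * 8);
  //   addr           = TlsBase + secrel(var);  // hi12/lo12 of the .tls offset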
9199 SDValue TLSArray =
9200 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
9201 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
9202 Chain = TLSArray.getValue(1);
9203
9204 // Load the TLS index from the C runtime;
9205 // This does the same as getAddr(), but without having a GlobalAddressSDNode.
9206 // This also does the same as LOADgot, but using a generic i32 load,
9207 // while LOADgot only loads i64.
9208 SDValue TLSIndexHi =
9209 DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
9210 SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
9211 "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9212 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
9213 SDValue TLSIndex =
9214 DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
9215 TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
9216 Chain = TLSIndex.getValue(1);
9217
9218  // The pointer to the thread's TLS data area is stored at offset
9219  // (TLS index * 8) into the TLS array.
9220 TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
9221 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
9222 DAG.getConstant(3, DL, PtrVT));
9223 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
9224                            DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
9225                            MachinePointerInfo());
9226  Chain = TLS.getValue(1);
9227
9228 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
9229 const GlobalValue *GV = GA->getGlobal();
9230 SDValue TGAHi = DAG.getTargetGlobalAddress(
9231 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
9232 SDValue TGALo = DAG.getTargetGlobalAddress(
9233      GV, DL, PtrVT, 0,
9234      AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9235
9236 // Add the offset from the start of the .tls section (section base).
9237 SDValue Addr =
9238 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
9239 DAG.getTargetConstant(0, DL, MVT::i32)),
9240 0);
9241 Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
9242 return Addr;
9243}
9244
9245SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
9246 SelectionDAG &DAG) const {
9247 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
9248 if (DAG.getTarget().useEmulatedTLS())
9249 return LowerToTLSEmulatedModel(GA, DAG);
9250
9251 if (Subtarget->isTargetDarwin())
9252 return LowerDarwinGlobalTLSAddress(Op, DAG);
9253 if (Subtarget->isTargetELF())
9254 return LowerELFGlobalTLSAddress(Op, DAG);
9255 if (Subtarget->isTargetWindows())
9256 return LowerWindowsGlobalTLSAddress(Op, DAG);
9257
9258 llvm_unreachable("Unexpected platform trying to use TLS");
9259}
9260
9261// Looks through \param Val to determine the bit that can be used to
9262// check the sign of the value. It returns the unextended value and
9263// the sign bit position.
9264std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
9265 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
9266 return {Val.getOperand(0),
9267 cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
9268 1};
9269
9270 if (Val.getOpcode() == ISD::SIGN_EXTEND)
9271 return {Val.getOperand(0),
9272 Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
9273
9274 return {Val, Val.getValueSizeInBits() - 1};
9275}
9276
9277SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
9278 SDValue Chain = Op.getOperand(0);
9279 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
9280 SDValue LHS = Op.getOperand(2);
9281 SDValue RHS = Op.getOperand(3);
9282 SDValue Dest = Op.getOperand(4);
9283 SDLoc dl(Op);
9284
9285  MachineFunction &MF = DAG.getMachineFunction();
9286  // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
9287 // will not be produced, as they are conditional branch instructions that do
9288 // not set flags.
9289 bool ProduceNonFlagSettingCondBr =
9290 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
9291
9292 // Handle f128 first, since lowering it will result in comparing the return
9293 // value of a libcall against zero, which is just what the rest of LowerBR_CC
9294 // is expecting to deal with.
9295 if (LHS.getValueType() == MVT::f128) {
9296 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
9297
9298 // If softenSetCCOperands returned a scalar, we need to compare the result
9299 // against zero to select between true and false values.
9300 if (!RHS.getNode()) {
9301 RHS = DAG.getConstant(0, dl, LHS.getValueType());
9302 CC = ISD::SETNE;
9303 }
9304 }
9305
9306 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
9307 // instruction.
9308 if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
9309 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
9310 // Only lower legal XALUO ops.
9311 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
9312 return SDValue();
9313
9314 // The actual operation with overflow check.
9315    AArch64CC::CondCode OFCC;
9316    SDValue Value, Overflow;
9317 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
9318
9319 if (CC == ISD::SETNE)
9320 OFCC = getInvertedCondCode(OFCC);
9321 SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);
9322
9323 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
9324 Overflow);
9325 }
9326
9327 if (LHS.getValueType().isInteger()) {
9328 assert((LHS.getValueType() == RHS.getValueType()) &&
9329 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
9330
9331 // If the RHS of the comparison is zero, we can potentially fold this
9332 // to a specialized branch.
9333 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
9334 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
9335 if (CC == ISD::SETEQ) {
9336 // See if we can use a TBZ to fold in an AND as well.
9337 // TBZ has a smaller branch displacement than CBZ. If the offset is
9338 // out of bounds, a late MI-layer pass rewrites branches.
9339 // 403.gcc is an example that hits this case.
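        // For example, (brcond (seteq (and x, 4), 0), dest) becomes
        // "tbz x, #2, dest".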
9340 if (LHS.getOpcode() == ISD::AND &&
9341 isa<ConstantSDNode>(LHS.getOperand(1)) &&
9342 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
9343 SDValue Test = LHS.getOperand(0);
9344 uint64_t Mask = LHS.getConstantOperandVal(1);
9345 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
9346 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
9347 Dest);
9348 }
9349
9350 return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
9351 } else if (CC == ISD::SETNE) {
9352 // See if we can use a TBZ to fold in an AND as well.
9353 // TBZ has a smaller branch displacement than CBZ. If the offset is
9354 // out of bounds, a late MI-layer pass rewrites branches.
9355 // 403.gcc is an example that hits this case.
9356 if (LHS.getOpcode() == ISD::AND &&
9357 isa<ConstantSDNode>(LHS.getOperand(1)) &&
9358 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
9359 SDValue Test = LHS.getOperand(0);
9360 uint64_t Mask = LHS.getConstantOperandVal(1);
9361 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
9362 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
9363 Dest);
9364 }
9365
9366 return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
9367 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
9368 // Don't combine AND since emitComparison converts the AND to an ANDS
9369 // (a.k.a. TST) and the test in the test bit and branch instruction
9370 // becomes redundant. This would also increase register pressure.
9371 uint64_t SignBitPos;
9372 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
9373 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
9374 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
9375 }
9376 }
9377 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
9378 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
9379 // Don't combine AND since emitComparison converts the AND to an ANDS
9380 // (a.k.a. TST) and the test in the test bit and branch instruction
9381 // becomes redundant. This would also increase register pressure.
9382 uint64_t SignBitPos;
9383 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
9384 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
9385 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
9386 }
9387
9388 SDValue CCVal;
9389 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
9390 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
9391 Cmp);
9392 }
9393
9394 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
9395 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
9396
9397 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
9398 // clean. Some of them require two branches to implement.
9399 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
9400 AArch64CC::CondCode CC1, CC2;
9401 changeFPCCToAArch64CC(CC, CC1, CC2);
9402 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9403 SDValue BR1 =
9404 DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
9405 if (CC2 != AArch64CC::AL) {
9406 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
9407 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
9408 Cmp);
9409 }
9410
9411 return BR1;
9412}
9413
9414SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
9415 SelectionDAG &DAG) const {
9416 if (!Subtarget->hasNEON())
9417 return SDValue();
9418
9419 EVT VT = Op.getValueType();
9420 EVT IntVT = VT.changeTypeToInteger();
9421 SDLoc DL(Op);
9422
9423 SDValue In1 = Op.getOperand(0);
9424 SDValue In2 = Op.getOperand(1);
9425 EVT SrcVT = In2.getValueType();
9426
9427 if (!SrcVT.bitsEq(VT))
9428 In2 = DAG.getFPExtendOrRound(In2, DL, VT);
9429
9430 if (VT.isScalableVector())
9431    IntVT =
9432        getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger());
9433
9434 if (VT.isFixedLengthVector() &&
9435 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
9436 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
9437
9438 In1 = convertToScalableVector(DAG, ContainerVT, In1);
9439 In2 = convertToScalableVector(DAG, ContainerVT, In2);
9440
9441 SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2);
9442 return convertFromScalableVector(DAG, VT, Res);
9443 }
9444
9445 auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
9446 if (VT.isScalableVector())
9447 return getSVESafeBitCast(VT, Op, DAG);
9448
9449 return DAG.getBitcast(VT, Op);
9450 };
9451
9452 SDValue VecVal1, VecVal2;
9453 EVT VecVT;
9454 auto SetVecVal = [&](int Idx = -1) {
9455 if (!VT.isVector()) {
9456 VecVal1 =
9457 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1);
9458 VecVal2 =
9459 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2);
9460 } else {
9461 VecVal1 = BitCast(VecVT, In1, DAG);
9462 VecVal2 = BitCast(VecVT, In2, DAG);
9463 }
9464 };
9465 if (VT.isVector()) {
9466 VecVT = IntVT;
9467 SetVecVal();
9468 } else if (VT == MVT::f64) {
9469 VecVT = MVT::v2i64;
9470 SetVecVal(AArch64::dsub);
9471 } else if (VT == MVT::f32) {
9472 VecVT = MVT::v4i32;
9473 SetVecVal(AArch64::ssub);
9474 } else if (VT == MVT::f16 || VT == MVT::bf16) {
9475 VecVT = MVT::v8i16;
9476 SetVecVal(AArch64::hsub);
9477 } else {
9478 llvm_unreachable("Invalid type for copysign!");
9479 }
9480
9481 unsigned BitWidth = In1.getScalarValueSizeInBits();
9482 SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT);
9483
9484 // We want to materialize a mask with every bit but the high bit set, but the
9485 // AdvSIMD immediate moves cannot materialize that in a single instruction for
9486 // 64-bit elements. Instead, materialize all bits set and then negate that.
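  // FNEG on the all-ones pattern simply flips the sign bit, leaving
  // 0x7fffffffffffffff in each 64-bit lane, i.e. every bit set except the sign.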
9487 if (VT == MVT::f64 || VT == MVT::v2f64) {
9488 SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT);
9489 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
9490 SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
9491 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
9492 }
9493
9494 SDValue BSP =
9495 DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
9496 if (VT == MVT::f16 || VT == MVT::bf16)
9497 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
9498 if (VT == MVT::f32)
9499 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
9500 if (VT == MVT::f64)
9501 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);
9502
9503 return BitCast(VT, BSP, DAG);
9504}
9505
9506SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
9507 SelectionDAG &DAG) const {
9508  if (DAG.getMachineFunction().getFunction().hasFnAttribute(
9509          Attribute::NoImplicitFloat))
9510 return SDValue();
9511
9512 if (!Subtarget->hasNEON())
9513 return SDValue();
9514
9515 bool IsParity = Op.getOpcode() == ISD::PARITY;
9516 SDValue Val = Op.getOperand(0);
9517 SDLoc DL(Op);
9518 EVT VT = Op.getValueType();
9519
9520  // For i32, a general parity computation using EORs is more efficient than
9521  // going through the floating-point/SIMD registers.
9522 if (VT == MVT::i32 && IsParity)
9523 return SDValue();
9524
9525 // If there is no CNT instruction available, GPR popcount can
9526 // be more efficiently lowered to the following sequence that uses
9527 // AdvSIMD registers/instructions as long as the copies to/from
9528 // the AdvSIMD registers are cheap.
9529 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
9530 // CNT V0.8B, V0.8B // 8xbyte pop-counts
9531 // ADDV B0, V0.8B // sum 8xbyte pop-counts
9532 // UMOV X0, V0.B[0] // copy byte result back to integer reg
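  // The lowering below expresses this as a v8i8/v16i8 ISD::CTPOP followed by
  // AArch64ISD::UADDLV, which sums the per-byte counts into lane 0 of a v4i32
  // from which the scalar result is extracted.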
9533 if (VT == MVT::i32 || VT == MVT::i64) {
9534 if (VT == MVT::i32)
9535 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
9536 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
9537
9538 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
9539 SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop);
9540 UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV,
9541 DAG.getConstant(0, DL, MVT::i64));
9542
9543 if (IsParity)
9544 UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
9545 DAG.getConstant(1, DL, MVT::i32));
9546
9547 if (VT == MVT::i64)
9548 UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
9549 return UaddLV;
9550 } else if (VT == MVT::i128) {
9551 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
9552
9553 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
9554 SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop);
9555 UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV,
9556 DAG.getConstant(0, DL, MVT::i64));
9557
9558 if (IsParity)
9559 UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
9560 DAG.getConstant(1, DL, MVT::i32));
9561
9562 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV);
9563 }
9564
9565 assert(!IsParity && "ISD::PARITY of vector types not supported");
9566
9567 if (VT.isScalableVector() ||
9568      useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
9569    return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
9570
9571 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
9572 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
9573 "Unexpected type for custom ctpop lowering");
9574
9575 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
9576 Val = DAG.getBitcast(VT8Bit, Val);
9577 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
9578
9579 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
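  // For example, for v4i32 the v16i8 byte counts are combined with uaddlp into
  // v8i16 and then again into v4i32, doubling the element width and halving
  // the element count on each step.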
9580 unsigned EltSize = 8;
9581 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
9582 while (EltSize != VT.getScalarSizeInBits()) {
9583 EltSize *= 2;
9584 NumElts /= 2;
9585 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
9586 Val = DAG.getNode(
9587 ISD::INTRINSIC_WO_CHAIN, DL, WidenVT,
9588 DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val);
9589 }
9590
9591 return Val;
9592}
9593
9594SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
9595 EVT VT = Op.getValueType();
9596 assert(VT.isScalableVector() ||
9597         useSVEForFixedLengthVectorVT(
9598             VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
9599
9600 SDLoc DL(Op);
9601 SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
9602 return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
9603}
9604
9605SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
9606 SelectionDAG &DAG) const {
9607
9608 EVT VT = Op.getValueType();
9609 SDLoc DL(Op);
9610 unsigned Opcode = Op.getOpcode();
9611  ISD::CondCode CC;
9612  switch (Opcode) {
9613 default:
9614 llvm_unreachable("Wrong instruction");
9615 case ISD::SMAX:
9616 CC = ISD::SETGT;
9617 break;
9618 case ISD::SMIN:
9619 CC = ISD::SETLT;
9620 break;
9621 case ISD::UMAX:
9622 CC = ISD::SETUGT;
9623 break;
9624 case ISD::UMIN:
9625 CC = ISD::SETULT;
9626 break;
9627 }
9628
9629 if (VT.isScalableVector() ||
9630          useSVEForFixedLengthVectorVT(
9631              VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
9632 switch (Opcode) {
9633 default:
9634 llvm_unreachable("Wrong instruction");
9635 case ISD::SMAX:
9636 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
9637 case ISD::SMIN:
9638 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
9639 case ISD::UMAX:
9640 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
9641 case ISD::UMIN:
9642 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
9643 }
9644 }
9645
9646 SDValue Op0 = Op.getOperand(0);
9647 SDValue Op1 = Op.getOperand(1);
9648 SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC);
9649 return DAG.getSelect(DL, VT, Cond, Op0, Op1);
9650}
9651
9652SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
9653 SelectionDAG &DAG) const {
9654 EVT VT = Op.getValueType();
9655
9656 if (VT.isScalableVector() ||
9657          useSVEForFixedLengthVectorVT(
9658              VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
9659 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
9660
9661 SDLoc DL(Op);
9662 SDValue REVB;
9663 MVT VST;
9664
9665 switch (VT.getSimpleVT().SimpleTy) {
9666 default:
9667 llvm_unreachable("Invalid type for bitreverse!");
9668
9669 case MVT::v2i32: {
9670 VST = MVT::v8i8;
9671 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
9672
9673 break;
9674 }
9675
9676 case MVT::v4i32: {
9677 VST = MVT::v16i8;
9678 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
9679
9680 break;
9681 }
9682
9683 case MVT::v1i64: {
9684 VST = MVT::v8i8;
9685 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
9686
9687 break;
9688 }
9689
9690 case MVT::v2i64: {
9691 VST = MVT::v16i8;
9692 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
9693
9694 break;
9695 }
9696 }
9697
9698 return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
9699 DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
9700}
9701
9702// Check whether this is a continuous comparison sequence (ORs over XOR compares).
9703static bool
9704isOrXorChain(SDValue N, unsigned &Num,
9705 SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
9706 if (Num == MaxXors)
9707 return false;
9708
9709 // Skip the one-use zext
9710 if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
9711 N = N->getOperand(0);
9712
9713 // The leaf node must be XOR
9714 if (N->getOpcode() == ISD::XOR) {
9715 WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1)));
9716 Num++;
9717 return true;
9718 }
9719
9720 // All the non-leaf nodes must be OR.
9721 if (N->getOpcode() != ISD::OR || !N->hasOneUse())
9722 return false;
9723
9724 if (isOrXorChain(N->getOperand(0), Num, WorkList) &&
9725 isOrXorChain(N->getOperand(1), Num, WorkList))
9726 return true;
9727 return false;
9728}
9729
9730// Transform chains of ORs and XORs, which are usually outlined by memcmp/bcmp.
9731static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG) {
9732  SDValue LHS = N->getOperand(0);
9733 SDValue RHS = N->getOperand(1);
9734 SDLoc DL(N);
9735 EVT VT = N->getValueType(0);
9736  SmallVector<std::pair<SDValue, SDValue>, 16> WorkList;
9737
9738 // Only handle integer compares.
9739 if (N->getOpcode() != ISD::SETCC)
9740 return SDValue();
9741
9742 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
9743 // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
9744 // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
9745 unsigned NumXors = 0;
9746 if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
9747 LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
9748 isOrXorChain(LHS, NumXors, WorkList)) {
9749 SDValue XOR0, XOR1;
9750 std::tie(XOR0, XOR1) = WorkList[0];
9751 unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
9752 SDValue Cmp = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
9753 for (unsigned I = 1; I < WorkList.size(); I++) {
9754 std::tie(XOR0, XOR1) = WorkList[I];
9755 SDValue CmpChain = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
9756 Cmp = DAG.getNode(LogicOp, DL, VT, Cmp, CmpChain);
9757 }
9758
9759    // Exit early by inverting the condition, which helps reduce indentation.
9760 return Cmp;
9761 }
9762
9763 return SDValue();
9764}
9765
9766SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
9767
9768 if (Op.getValueType().isVector())
9769 return LowerVSETCC(Op, DAG);
9770
9771 bool IsStrict = Op->isStrictFPOpcode();
9772 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
9773 unsigned OpNo = IsStrict ? 1 : 0;
9774 SDValue Chain;
9775 if (IsStrict)
9776 Chain = Op.getOperand(0);
9777 SDValue LHS = Op.getOperand(OpNo + 0);
9778 SDValue RHS = Op.getOperand(OpNo + 1);
9779 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
9780 SDLoc dl(Op);
9781
9782 // We chose ZeroOrOneBooleanContents, so use zero and one.
9783 EVT VT = Op.getValueType();
9784 SDValue TVal = DAG.getConstant(1, dl, VT);
9785 SDValue FVal = DAG.getConstant(0, dl, VT);
9786
9787 // Handle f128 first, since one possible outcome is a normal integer
9788 // comparison which gets picked up by the next if statement.
9789 if (LHS.getValueType() == MVT::f128) {
9790 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain,
9791 IsSignaling);
9792
9793 // If softenSetCCOperands returned a scalar, use it.
9794 if (!RHS.getNode()) {
9795 assert(LHS.getValueType() == Op.getValueType() &&
9796 "Unexpected setcc expansion!");
9797 return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS;
9798 }
9799 }
9800
9801 if (LHS.getValueType().isInteger()) {
9802 SDValue CCVal;
9803    SDValue Cmp = getAArch64Cmp(
9804        LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl);
9805
9806 // Note that we inverted the condition above, so we reverse the order of
9807 // the true and false operands here. This will allow the setcc to be
9808 // matched to a single CSINC instruction.
9809 SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
9810 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
9811 }
9812
9813 // Now we know we're dealing with FP values.
9814 assert(LHS.getValueType() == MVT::bf16 || LHS.getValueType() == MVT::f16 ||
9815 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
9816
9817 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
9818 // and do the comparison.
9819 SDValue Cmp;
9820 if (IsStrict)
9821 Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling);
9822 else
9823 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
9824
9825 AArch64CC::CondCode CC1, CC2;
9826 changeFPCCToAArch64CC(CC, CC1, CC2);
9827 SDValue Res;
9828 if (CC2 == AArch64CC::AL) {
9829 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
9830 CC2);
9831 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9832
9833 // Note that we inverted the condition above, so we reverse the order of
9834 // the true and false operands here. This will allow the setcc to be
9835 // matched to a single CSINC instruction.
9836 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
9837 } else {
9838 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
9839 // totally clean. Some of them require two CSELs to implement. As is in
9840 // this case, we emit the first CSEL and then emit a second using the output
9841 // of the first as the RHS. We're effectively OR'ing the two CC's together.
9842
9843 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
9844 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9845 SDValue CS1 =
9846 DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
9847
9848 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
9849 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
9850 }
9851 return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res;
9852}
9853
9854SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
9855 SelectionDAG &DAG) const {
9856
9857 SDValue LHS = Op.getOperand(0);
9858 SDValue RHS = Op.getOperand(1);
9859 EVT VT = LHS.getValueType();
9860 if (VT != MVT::i32 && VT != MVT::i64)
9861 return SDValue();
9862
9863 SDLoc DL(Op);
9864 SDValue Carry = Op.getOperand(2);
9865 // SBCS uses a carry not a borrow so the carry flag should be inverted first.
9866 SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
9867 SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, MVT::Glue),
9868 LHS, RHS, InvCarry);
9869
9870 EVT OpVT = Op.getValueType();
9871 SDValue TVal = DAG.getConstant(1, DL, OpVT);
9872 SDValue FVal = DAG.getConstant(0, DL, OpVT);
9873
9874 ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get();
9875  ISD::CondCode CondInv = ISD::getSetCCInverse(Cond, VT);
9876  SDValue CCVal =
9877 DAG.getConstant(changeIntCCToAArch64CC(CondInv), DL, MVT::i32);
9878 // Inputs are swapped because the condition is inverted. This will allow
9879 // matching with a single CSINC instruction.
9880 return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal,
9881 Cmp.getValue(1));
9882}
9883
9884SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
9885 SDValue RHS, SDValue TVal,
9886 SDValue FVal, const SDLoc &dl,
9887 SelectionDAG &DAG) const {
9888 // Handle f128 first, because it will result in a comparison of some RTLIB
9889 // call result against zero.
9890 if (LHS.getValueType() == MVT::f128) {
9891 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
9892
9893 // If softenSetCCOperands returned a scalar, we need to compare the result
9894 // against zero to select between true and false values.
9895 if (!RHS.getNode()) {
9896 RHS = DAG.getConstant(0, dl, LHS.getValueType());
9897 CC = ISD::SETNE;
9898 }
9899 }
9900
9901 // Also handle f16, for which we need to do a f32 comparison.
9902 if ((LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
9903 LHS.getValueType() == MVT::bf16) {
9904 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
9905 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
9906 }
9907
9908 // Next, handle integers.
9909 if (LHS.getValueType().isInteger()) {
9910 assert((LHS.getValueType() == RHS.getValueType()) &&
9911 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
9912
9913 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
9914 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
9915 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
9916 // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform
9917    // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
9918 // supported types.
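    // For example, for i32 this emits (ASR lhs, 31), which is 0 when lhs is
    // non-negative and -1 otherwise; ORing in 1 then yields +1 or -1.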
9919 if (CC == ISD::SETGT && RHSC && RHSC->isAllOnes() && CTVal && CFVal &&
9920 CTVal->isOne() && CFVal->isAllOnes() &&
9921 LHS.getValueType() == TVal.getValueType()) {
9922 EVT VT = LHS.getValueType();
9923 SDValue Shift =
9924 DAG.getNode(ISD::SRA, dl, VT, LHS,
9925 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
9926 return DAG.getNode(ISD::OR, dl, VT, Shift, DAG.getConstant(1, dl, VT));
9927 }
9928
9929 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
9930 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
9931 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
9932    // Both require fewer instructions than a compare and conditional select.
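    // For example, for i64 smax(lhs, 0): (SRA lhs, 63) is all-ones exactly when
    // lhs is negative, so ANDing lhs with the inverted shift keeps lhs when it
    // is non-negative and yields 0 otherwise.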
9933 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal &&
9934 RHSC && RHSC->isZero() && CFVal && CFVal->isZero() &&
9935 LHS.getValueType() == RHS.getValueType()) {
9936 EVT VT = LHS.getValueType();
9937 SDValue Shift =
9938 DAG.getNode(ISD::SRA, dl, VT, LHS,
9939 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
9940
9941 if (CC == ISD::SETGT)
9942 Shift = DAG.getNOT(dl, Shift, VT);
9943
9944 return DAG.getNode(ISD::AND, dl, VT, LHS, Shift);
9945 }
9946
9947 unsigned Opcode = AArch64ISD::CSEL;
9948
9949 // If both the TVal and the FVal are constants, see if we can swap them in
9950 // order to for a CSINV or CSINC out of them.
9951 if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
9952 std::swap(TVal, FVal);
9953 std::swap(CTVal, CFVal);
9954 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9955 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
9956 std::swap(TVal, FVal);
9957 std::swap(CTVal, CFVal);
9958 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9959 } else if (TVal.getOpcode() == ISD::XOR) {
9960 // If TVal is a NOT we want to swap TVal and FVal so that we can match
9961 // with a CSINV rather than a CSEL.
9962 if (isAllOnesConstant(TVal.getOperand(1))) {
9963 std::swap(TVal, FVal);
9964 std::swap(CTVal, CFVal);
9965 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9966 }
9967 } else if (TVal.getOpcode() == ISD::SUB) {
9968 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
9969 // that we can match with a CSNEG rather than a CSEL.
9970 if (isNullConstant(TVal.getOperand(0))) {
9971 std::swap(TVal, FVal);
9972 std::swap(CTVal, CFVal);
9973 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
9974 }
9975 } else if (CTVal && CFVal) {
9976 const int64_t TrueVal = CTVal->getSExtValue();
9977 const int64_t FalseVal = CFVal->getSExtValue();
9978 bool Swap = false;
9979
9980 // If both TVal and FVal are constants, see if FVal is the
9981 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
9982 // instead of a CSEL in that case.
9983 if (TrueVal == ~FalseVal) {
9984 Opcode = AArch64ISD::CSINV;
9985 } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
9986 TrueVal == -FalseVal) {
9987 Opcode = AArch64ISD::CSNEG;
9988 } else if (TVal.getValueType() == MVT::i32) {
9989 // If our operands are only 32-bit wide, make sure we use 32-bit
9990 // arithmetic for the check whether we can use CSINC. This ensures that
9991 // the addition in the check will wrap around properly in case there is
9992 // an overflow (which would not be the case if we do the check with
9993 // 64-bit arithmetic).
9994 const uint32_t TrueVal32 = CTVal->getZExtValue();
9995 const uint32_t FalseVal32 = CFVal->getZExtValue();
9996
9997 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
9998 Opcode = AArch64ISD::CSINC;
9999
10000 if (TrueVal32 > FalseVal32) {
10001 Swap = true;
10002 }
10003 }
10004 } else {
10005 // 64-bit check whether we can use CSINC.
10006 const uint64_t TrueVal64 = TrueVal;
10007 const uint64_t FalseVal64 = FalseVal;
10008
10009 if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
10010 Opcode = AArch64ISD::CSINC;
10011
10012 if (TrueVal > FalseVal) {
10013 Swap = true;
10014 }
10015 }
10016 }
10017
10018 // Swap TVal and FVal if necessary.
10019 if (Swap) {
10020 std::swap(TVal, FVal);
10021 std::swap(CTVal, CFVal);
10022 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
10023 }
10024
10025 if (Opcode != AArch64ISD::CSEL) {
10026 // Drop FVal since we can get its value by simply inverting/negating
10027 // TVal.
10028 FVal = TVal;
10029 }
10030 }
10031
10032 // Avoid materializing a constant when possible by reusing a known value in
10033 // a register. However, don't perform this optimization if the known value
10034 // is one, zero or negative one in the case of a CSEL. We can always
10035 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
10036 // FVal, respectively.
10037 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
10038 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
10039 !RHSVal->isZero() && !RHSVal->isAllOnes()) {
10040      AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
10041      // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
10042 // "a != C ? x : a" to avoid materializing C.
10043 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
10044 TVal = LHS;
10045 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
10046 FVal = LHS;
10047 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
10048 assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
10049 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
10050 // avoid materializing C.
10051      AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
10052      if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
10053 Opcode = AArch64ISD::CSINV;
10054 TVal = LHS;
10055 FVal = DAG.getConstant(0, dl, FVal.getValueType());
10056 }
10057 }
10058
10059 SDValue CCVal;
10060 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
10061 EVT VT = TVal.getValueType();
10062 return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
10063 }
10064
10065 // Now we know we're dealing with FP values.
10066 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
10067 LHS.getValueType() == MVT::f64);
10068 assert(LHS.getValueType() == RHS.getValueType());
10069 EVT VT = TVal.getValueType();
10070 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
10071
10072 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
10073 // clean. Some of them require two CSELs to implement.
10074 AArch64CC::CondCode CC1, CC2;
10075 changeFPCCToAArch64CC(CC, CC1, CC2);
10076
10077 if (DAG.getTarget().Options.UnsafeFPMath) {
10078 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
10079 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
10080 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
10081 if (RHSVal && RHSVal->isZero()) {
10082 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
10083 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
10084
10085 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
10086 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
10087 TVal = LHS;
10088 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
10089 CFVal && CFVal->isZero() &&
10090 FVal.getValueType() == LHS.getValueType())
10091 FVal = LHS;
10092 }
10093 }
10094
10095 // Emit first, and possibly only, CSEL.
10096 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
10097 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
10098
10099 // If we need a second CSEL, emit it, using the output of the first as the
10100 // RHS. We're effectively OR'ing the two CC's together.
10101 if (CC2 != AArch64CC::AL) {
10102 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
10103 return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
10104 }
10105
10106 // Otherwise, return the output of the first CSEL.
10107 return CS1;
10108}
10109
10110SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
10111 SelectionDAG &DAG) const {
10112 EVT Ty = Op.getValueType();
10113 auto Idx = Op.getConstantOperandAPInt(2);
10114 int64_t IdxVal = Idx.getSExtValue();
10115 assert(Ty.isScalableVector() &&
10116 "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
10117
10118 // We can use the splice instruction for certain index values where we are
10119 // able to efficiently generate the correct predicate. The index will be
10120 // inverted and used directly as the input to the ptrue instruction, i.e.
10121 // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
10122 // splice predicate. However, we can only do this if we can guarantee that
10123 // there are enough elements in the vector, hence we check the index <= min
10124 // number of elements.
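  // For example, splice(a, b, -2) uses a "ptrue vl2" predicate reversed so that
  // only the last two elements of the first operand are active, and the SPLICE
  // node then prepends those elements to the second operand.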
10125 std::optional<unsigned> PredPattern;
10126 if (Ty.isScalableVector() && IdxVal < 0 &&
10127 (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) !=
10128 std::nullopt) {
10129 SDLoc DL(Op);
10130
10131 // Create a predicate where all but the last -IdxVal elements are false.
10132 EVT PredVT = Ty.changeVectorElementType(MVT::i1);
10133 SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
10134 Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);
10135
10136 // Now splice the two inputs together using the predicate.
10137 return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
10138 Op.getOperand(1));
10139 }
10140
10141 // We can select to an EXT instruction when indexing the first 256 bytes.
10142  unsigned BlockSize = AArch64::SVEBitsPerBlock / Ty.getVectorMinNumElements();
10143  if (IdxVal >= 0 && (IdxVal * BlockSize / 8) < 256)
10144 return Op;
10145
10146 return SDValue();
10147}
10148
10149SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
10150 SelectionDAG &DAG) const {
10151 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
10152 SDValue LHS = Op.getOperand(0);
10153 SDValue RHS = Op.getOperand(1);
10154 SDValue TVal = Op.getOperand(2);
10155 SDValue FVal = Op.getOperand(3);
10156 SDLoc DL(Op);
10157 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
10158}
10159
10160SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
10161 SelectionDAG &DAG) const {
10162 SDValue CCVal = Op->getOperand(0);
10163 SDValue TVal = Op->getOperand(1);
10164 SDValue FVal = Op->getOperand(2);
10165 SDLoc DL(Op);
10166
10167 EVT Ty = Op.getValueType();
10168 if (Ty == MVT::aarch64svcount) {
10169 TVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, TVal);
10170 FVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, FVal);
10171 SDValue Sel =
10172 DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal);
10173 return DAG.getNode(ISD::BITCAST, DL, Ty, Sel);
10174 }
10175
10176 if (Ty.isScalableVector()) {
10177 MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
10178 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, CCVal);
10179 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
10180 }
10181
10182 if (useSVEForFixedLengthVectorVT(Ty, !Subtarget->isNeonAvailable())) {
10183 // FIXME: Ideally this would be the same as above using i1 types, however
10184 // for the moment we can't deal with fixed i1 vector types properly, so
10185 // instead extend the predicate to a result type sized integer vector.
10186 MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
10187 MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
10188 SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
10189 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
10190 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
10191 }
10192
10193 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
10194 // instruction.
10195 if (ISD::isOverflowIntrOpRes(CCVal)) {
10196 // Only lower legal XALUO ops.
10197 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
10198 return SDValue();
10199
10200    AArch64CC::CondCode OFCC;
10201    SDValue Value, Overflow;
10202 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
10203 SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);
10204
10205 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
10206 CCVal, Overflow);
10207 }
10208
10209 // Lower it the same way as we would lower a SELECT_CC node.
10210  ISD::CondCode CC;
10211  SDValue LHS, RHS;
10212 if (CCVal.getOpcode() == ISD::SETCC) {
10213 LHS = CCVal.getOperand(0);
10214 RHS = CCVal.getOperand(1);
10215 CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
10216 } else {
10217 LHS = CCVal;
10218 RHS = DAG.getConstant(0, DL, CCVal.getValueType());
10219 CC = ISD::SETNE;
10220 }
10221
10222  // If we are lowering an f16/bf16 and do not have full fp16 support, convert
10223  // it to an f32 in order to use FCSELSrrr.
10224 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
10225 TVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
10226 DAG.getUNDEF(MVT::f32), TVal);
10227 FVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
10228 DAG.getUNDEF(MVT::f32), FVal);
10229 }
10230
10231 SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
10232
10233 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
10234 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
10235 }
10236
10237 return Res;
10238}
10239
10240SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
10241 SelectionDAG &DAG) const {
10242 // Jump table entries as PC relative offsets. No additional tweaking
10243 // is necessary here. Just get the address of the jump table.
10244 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
10245
10246  CodeModel::Model CM = getTargetMachine().getCodeModel();
10247  if (CM == CodeModel::Large && !getTargetMachine().isPositionIndependent() &&
10248      !Subtarget->isTargetMachO())
10249 return getAddrLarge(JT, DAG);
10250 if (CM == CodeModel::Tiny)
10251 return getAddrTiny(JT, DAG);
10252 return getAddr(JT, DAG);
10253}
10254
10255SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
10256 SelectionDAG &DAG) const {
10257 // Jump table entries as PC relative offsets. No additional tweaking
10258 // is necessary here. Just get the address of the jump table.
10259 SDLoc DL(Op);
10260 SDValue JT = Op.getOperand(1);
10261 SDValue Entry = Op.getOperand(2);
10262 int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
10263
10264 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10265 AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
10266
10267 SDNode *Dest =
10268 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
10269 Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
10270 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Op.getOperand(0), DL);
10271 return DAG.getNode(ISD::BRIND, DL, MVT::Other, JTInfo, SDValue(Dest, 0));
10272}
10273
10274SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
10275 SelectionDAG &DAG) const {
10276 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
10277  CodeModel::Model CM = getTargetMachine().getCodeModel();
10278  if (CM == CodeModel::Large) {
10279 // Use the GOT for the large code model on iOS.
10280 if (Subtarget->isTargetMachO()) {
10281 return getGOT(CP, DAG);
10282 }
10283    if (!getTargetMachine().isPositionIndependent())
10284      return getAddrLarge(CP, DAG);
10285 } else if (CM == CodeModel::Tiny) {
10286 return getAddrTiny(CP, DAG);
10287 }
10288 return getAddr(CP, DAG);
10289}
10290
10291SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
10292 SelectionDAG &DAG) const {
10293 BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op);
10294  CodeModel::Model CM = getTargetMachine().getCodeModel();
10295  if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
10296    if (!getTargetMachine().isPositionIndependent())
10297      return getAddrLarge(BA, DAG);
10298 } else if (CM == CodeModel::Tiny) {
10299 return getAddrTiny(BA, DAG);
10300 }
10301 return getAddr(BA, DAG);
10302}
10303
10304SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
10305 SelectionDAG &DAG) const {
10306 AArch64FunctionInfo *FuncInfo =
10307      DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10308
10309 SDLoc DL(Op);
10310 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
10311                                 getPointerTy(DAG.getDataLayout()));
10312  FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
10313 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10314 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
10315 MachinePointerInfo(SV));
10316}
10317
10318SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
10319 SelectionDAG &DAG) const {
10320  MachineFunction &MF = DAG.getMachineFunction();
10321  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10322
10323 SDLoc DL(Op);
10324 SDValue FR;
10325 if (Subtarget->isWindowsArm64EC()) {
10326 // With the Arm64EC ABI, we compute the address of the varargs save area
10327 // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
10328 // but calls from an entry thunk can pass in a different address.
10329 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
10330 SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
10331    uint64_t StackOffset;
10332    if (FuncInfo->getVarArgsGPRSize() > 0)
10333 StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
10334 else
10335 StackOffset = FuncInfo->getVarArgsStackOffset();
10336 FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
10337 DAG.getConstant(StackOffset, DL, MVT::i64));
10338 } else {
10339 FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
10340 ? FuncInfo->getVarArgsGPRIndex()
10341 : FuncInfo->getVarArgsStackIndex(),
10342                           getPointerTy(DAG.getDataLayout()));
10343  }
10344 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10345 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
10346 MachinePointerInfo(SV));
10347}
10348
10349SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
10350 SelectionDAG &DAG) const {
10351 // The layout of the va_list struct is specified in the AArch64 Procedure Call
10352 // Standard, section B.3.
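  // That layout is:
  //   struct va_list {
  //     void *__stack;   // next stacked argument
  //     void *__gr_top;  // end of the GP register save area
  //     void *__vr_top;  // end of the FP/SIMD register save area
  //     int   __gr_offs; // negative offset of the next GP register argument
  //     int   __vr_offs; // negative offset of the next FP/SIMD register argument
  //   };
  // and each field is initialised in turn below.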
10353  MachineFunction &MF = DAG.getMachineFunction();
10354  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10355  unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
10356 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
10357 auto PtrVT = getPointerTy(DAG.getDataLayout());
10358 SDLoc DL(Op);
10359
10360 SDValue Chain = Op.getOperand(0);
10361 SDValue VAList = Op.getOperand(1);
10362 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10363  SmallVector<SDValue, 4> MemOps;
10364
10365 // void *__stack at offset 0
10366 unsigned Offset = 0;
10367 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
10368 Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
10369 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
10370 MachinePointerInfo(SV), Align(PtrSize)));
10371
10372 // void *__gr_top at offset 8 (4 on ILP32)
10373 Offset += PtrSize;
10374 int GPRSize = FuncInfo->getVarArgsGPRSize();
10375 if (GPRSize > 0) {
10376 SDValue GRTop, GRTopAddr;
10377
10378 GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10379 DAG.getConstant(Offset, DL, PtrVT));
10380
10381 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
10382 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
10383 DAG.getConstant(GPRSize, DL, PtrVT));
10384 GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);
10385
10386 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
10387                                  MachinePointerInfo(SV, Offset),
10388                                  Align(PtrSize)));
10389 }
10390
10391 // void *__vr_top at offset 16 (8 on ILP32)
10392 Offset += PtrSize;
10393 int FPRSize = FuncInfo->getVarArgsFPRSize();
10394 if (FPRSize > 0) {
10395 SDValue VRTop, VRTopAddr;
10396 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10397 DAG.getConstant(Offset, DL, PtrVT));
10398
10399 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
10400 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
10401 DAG.getConstant(FPRSize, DL, PtrVT));
10402 VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);
10403
10404 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
10405                                  MachinePointerInfo(SV, Offset),
10406                                  Align(PtrSize)));
10407 }
10408
10409 // int __gr_offs at offset 24 (12 on ILP32)
10410 Offset += PtrSize;
10411 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10412 DAG.getConstant(Offset, DL, PtrVT));
10413 MemOps.push_back(
10414 DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32),
10415 GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
10416
10417 // int __vr_offs at offset 28 (16 on ILP32)
10418 Offset += 4;
10419 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10420 DAG.getConstant(Offset, DL, PtrVT));
10421 MemOps.push_back(
10422 DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32),
10423 VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
10424
10425 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
10426}
10427
10428SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
10429 SelectionDAG &DAG) const {
10430  MachineFunction &MF = DAG.getMachineFunction();
10431
10432 if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()))
10433 return LowerWin64_VASTART(Op, DAG);
10434 else if (Subtarget->isTargetDarwin())
10435 return LowerDarwin_VASTART(Op, DAG);
10436 else
10437 return LowerAAPCS_VASTART(Op, DAG);
10438}
10439
10440SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
10441 SelectionDAG &DAG) const {
10442  // AAPCS has three pointers and two ints (= 32 bytes); Darwin has a single
10443  // pointer.
10444 SDLoc DL(Op);
10445 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
10446 unsigned VaListSize =
10447 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
10448 ? PtrSize
10449 : Subtarget->isTargetILP32() ? 20 : 32;
10450 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
10451 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
10452
10453 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
10454 DAG.getConstant(VaListSize, DL, MVT::i32),
10455 Align(PtrSize), false, false, false,
10456 MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
10457}
10458
10459SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
10460 assert(Subtarget->isTargetDarwin() &&
10461 "automatic va_arg instruction only works on Darwin");
10462
10463 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10464 EVT VT = Op.getValueType();
10465 SDLoc DL(Op);
10466 SDValue Chain = Op.getOperand(0);
10467 SDValue Addr = Op.getOperand(1);
10468 MaybeAlign Align(Op.getConstantOperandVal(3));
10469 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
10470 auto PtrVT = getPointerTy(DAG.getDataLayout());
10471 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
10472 SDValue VAList =
10473 DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
10474 Chain = VAList.getValue(1);
10475 VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
10476
10477 if (VT.isScalableVector())
10478 report_fatal_error("Passing SVE types to variadic functions is "
10479 "currently not supported");
10480
10481 if (Align && *Align > MinSlotSize) {
10482 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10483 DAG.getConstant(Align->value() - 1, DL, PtrVT));
10484 VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
10485 DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT));
10486 }
10487
10488 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
10489 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
10490
10491 // Scalar integer and FP values smaller than 64 bits are implicitly extended
10492 // up to 64 bits. At the very least, we have to increase the striding of the
10493 // vaargs list to match this, and for FP values we need to introduce
10494 // FP_ROUND nodes as well.
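  // For example, a float vararg occupies a full 8-byte slot holding a double
  // (C default argument promotion), so it is loaded here as an f64 and then
  // narrowed back to f32 with FP_ROUND.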
10495 if (VT.isInteger() && !VT.isVector())
10496 ArgSize = std::max(ArgSize, MinSlotSize);
10497 bool NeedFPTrunc = false;
10498 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
10499 ArgSize = 8;
10500 NeedFPTrunc = true;
10501 }
10502
10503 // Increment the pointer, VAList, to the next vaarg
10504 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10505 DAG.getConstant(ArgSize, DL, PtrVT));
10506 VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
10507
10508 // Store the incremented VAList to the legalized pointer
10509 SDValue APStore =
10510 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
10511
10512 // Load the actual argument out of the pointer VAList
10513 if (NeedFPTrunc) {
10514 // Load the value as an f64.
10515 SDValue WideFP =
10516 DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
10517 // Round the value down to an f32.
10518 SDValue NarrowFP =
10519 DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
10520 DAG.getIntPtrConstant(1, DL, /*isTarget=*/true));
10521 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
10522 // Merge the rounded value with the chain output of the load.
10523 return DAG.getMergeValues(Ops, DL);
10524 }
10525
10526 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
10527}
10528
10529SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
10530 SelectionDAG &DAG) const {
10531  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10532  MFI.setFrameAddressIsTaken(true);
10533
10534 EVT VT = Op.getValueType();
10535 SDLoc DL(Op);
10536 unsigned Depth = Op.getConstantOperandVal(0);
10537 SDValue FrameAddr =
10538 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
10539 while (Depth--)
10540 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
10541                            MachinePointerInfo());
10542
10543 if (Subtarget->isTargetILP32())
10544 FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
10545 DAG.getValueType(VT));
10546
10547 return FrameAddr;
10548}
10549
10550SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
10551 SelectionDAG &DAG) const {
10552  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10553
10554 EVT VT = getPointerTy(DAG.getDataLayout());
10555 SDLoc DL(Op);
10556 int FI = MFI.CreateFixedObject(4, 0, false);
10557 return DAG.getFrameIndex(FI, VT);
10558}
10559
10560#define GET_REGISTER_MATCHER
10561#include "AArch64GenAsmMatcher.inc"
10562
10563// FIXME? Maybe this could be a TableGen attribute on some registers and
10564// this table could be generated automatically from RegInfo.
10565Register AArch64TargetLowering::
10566getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
10567 Register Reg = MatchRegisterName(RegName);
10568 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
10569 const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo();
10570 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
10571 if (!Subtarget->isXRegisterReserved(DwarfRegNum) &&
10572 !MRI->isReservedReg(MF, Reg))
10573 Reg = 0;
10574 }
10575 if (Reg)
10576 return Reg;
10577 report_fatal_error(Twine("Invalid register name \""
10578 + StringRef(RegName) + "\"."));
10579}
10580
10581SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
10582 SelectionDAG &DAG) const {
10583 DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
10584
10585 EVT VT = Op.getValueType();
10586 SDLoc DL(Op);
10587
10588 SDValue FrameAddr =
10589 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
10590 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
10591
10592 return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
10593}
10594
10595SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
10596 SelectionDAG &DAG) const {
10597 MachineFunction &MF = DAG.getMachineFunction();
10598 MachineFrameInfo &MFI = MF.getFrameInfo();
10599 MFI.setReturnAddressIsTaken(true);
10600
10601 EVT VT = Op.getValueType();
10602 SDLoc DL(Op);
10603 unsigned Depth = Op.getConstantOperandVal(0);
10604 SDValue ReturnAddress;
10605 if (Depth) {
10606 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
10607 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
10608 ReturnAddress = DAG.getLoad(
10609 VT, DL, DAG.getEntryNode(),
10610 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
10611 } else {
10612 // Return LR, which contains the return address. Mark it an implicit
10613 // live-in.
10614 Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
10615 ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
10616 }
10617
10618 // The XPACLRI instruction assembles to a hint-space instruction before
10619 // Armv8.3-A, so it can safely be used on any pre-Armv8.3-A
10620 // architecture. On Armv8.3-A and onwards XPACI is available, so use
10621 // that instead.
10622 SDNode *St;
10623 if (Subtarget->hasPAuth()) {
10624 St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
10625 } else {
10626 // XPACLRI operates on LR therefore we must move the operand accordingly.
10627 SDValue Chain =
10628 DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
10629 St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
10630 }
10631 return SDValue(St, 0);
10632}
10633
10634 /// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which return two
10635 /// i32 values and take a 2 x i32 value to shift plus a shift amount.
10636SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
10637 SelectionDAG &DAG) const {
10638 SDValue Lo, Hi;
10639 expandShiftParts(Op.getNode(), Lo, Hi, DAG);
10640 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
10641}
10642
10643 bool AArch64TargetLowering::isOffsetFoldingLegal(
10644 const GlobalAddressSDNode *GA) const {
10645 // Offsets are folded in the DAG combine rather than here so that we can
10646 // intelligently choose an offset based on the uses.
10647 return false;
10648}
10649
10650 bool AArch64TargetLowering::isFPImmLegal(EVT VT, const APFloat &Imm,
10651 bool OptForSize) const {
10652 bool IsLegal = false;
10653 // We can materialize #0.0 as fmov $Rd, XZR for the 64-bit and 32-bit cases,
10654 // and for the 16-bit case when the target has full fp16 support.
10655 // We encode bf16 bit patterns as if they were fp16. This results in very
10656 // strange looking assembly but should populate the register with appropriate
10657 // values. Let's say we wanted to encode 0xR3FC0 which is 1.5 in BF16. We will
10658 // end up encoding this as the imm8 0x7f. This imm8 will be expanded to the
10659 // FP16 1.9375 which shares the same bit pattern as BF16 1.5.
10660 // FIXME: We should be able to handle f128 as well with a clever lowering.
10661 const APInt ImmInt = Imm.bitcastToAPInt();
10662 if (VT == MVT::f64)
10663 IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
10664 else if (VT == MVT::f32)
10665 IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
10666 else if (VT == MVT::f16 || VT == MVT::bf16)
10667 IsLegal =
10668 (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(ImmInt) != -1) ||
10669 Imm.isPosZero();
10670
10671 // If we cannot materialize the value in fmov's immediate field, check if the
10672 // value can be encoded as the immediate operand of a logical instruction.
10673 // The immediate value will be created with either MOVZ, MOVN, or ORR.
10674 // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
10675 // generate that fmov.
10676 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
10677 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
10678 // however the mov+fmov sequence is always better because of the reduced
10679 // cache pressure. The timings are still the same if you consider
10680 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
10681 // movw+movk is fused). So we limit ourselves to at most 2 instructions.
10682 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
10683 AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(), Insn);
10684 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2));
10685 IsLegal = Insn.size() <= Limit;
10686 }
10687
10688 LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT
10689 << " imm value: "; Imm.dump(););
10690 return IsLegal;
10691}
10692
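// Editor's illustrative sketch (not part of the original AArch64ISelLowering.cpp
// source): the FMOV (immediate) expansion behind the bf16-as-fp16 trick
// described above. imm8 abcdefgh expands to the half-precision pattern
// a : NOT(b) : b : b : cdefgh : 000000, so imm8 0x7f produces 0x3fc0 -- the
// BF16 bit pattern for 1.5, which reads as 1.9375 when interpreted as FP16.
static constexpr uint16_t expandFP16Imm8(uint8_t Imm8) {
  uint16_t A = (Imm8 >> 7) & 1, B = (Imm8 >> 6) & 1, Rest = Imm8 & 0x3f;
  return (A << 15) | ((B ^ 1) << 14) | (B << 13) | (B << 12) | (Rest << 6);
}
static_assert(expandFP16Imm8(0x7f) == 0x3fc0, "matches the worked example above");
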
10693//===----------------------------------------------------------------------===//
10694// AArch64 Optimization Hooks
10695//===----------------------------------------------------------------------===//
10696
10697static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
10698 SDValue Operand, SelectionDAG &DAG,
10699 int &ExtraSteps) {
10700 EVT VT = Operand.getValueType();
10701 if ((ST->hasNEON() &&
10702 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
10703 VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
10704 VT == MVT::v4f32)) ||
10705 (ST->hasSVE() &&
10706 (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
10707 if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) {
10708 // For the reciprocal estimates, convergence is quadratic, so the number
10709 // of digits is doubled after each iteration. In ARMv8, the accuracy of
10710 // the initial estimate is 2^-8. Thus the number of extra steps to refine
10711 // the result for float (23 mantissa bits) is 2 and for double (52
10712 // mantissa bits) is 3.
10713 constexpr unsigned AccurateBits = 8;
10714 unsigned DesiredBits =
10715 APFloat::semanticsPrecision(DAG.EVTToAPFloatSemantics(VT.getScalarType()));
10716 ExtraSteps = DesiredBits <= AccurateBits
10717 ? 0
10718 : Log2_64_Ceil(DesiredBits) - Log2_64_Ceil(AccurateBits);
10719 }
10720
10721 return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
10722 }
10723
10724 return SDValue();
10725}
10726
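// Editor's illustrative sketch (not part of the original AArch64ISelLowering.cpp
// source): the step-count rule used above. FRECPE/FRSQRTE start from roughly 8
// accurate bits and each Newton step doubles them, so float (23 mantissa bits)
// needs ceil(log2 23) - ceil(log2 8) = 2 extra steps and double (52 bits) needs 3.
static inline int extraRefinementStepsSketch(unsigned MantissaBits,
                                             unsigned AccurateBits = 8) {
  if (MantissaBits <= AccurateBits)
    return 0;
  return int(Log2_64_Ceil(MantissaBits)) - int(Log2_64_Ceil(AccurateBits));
}
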
10727SDValue
10728AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
10729 const DenormalMode &Mode) const {
10730 SDLoc DL(Op);
10731 EVT VT = Op.getValueType();
10732 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
10733 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
10734 return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
10735}
10736
10737SDValue
10738AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
10739 SelectionDAG &DAG) const {
10740 return Op;
10741}
10742
10743SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
10744 SelectionDAG &DAG, int Enabled,
10745 int &ExtraSteps,
10746 bool &UseOneConst,
10747 bool Reciprocal) const {
10748 if (Enabled == ReciprocalEstimate::Enabled ||
10749 (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
10750 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
10751 DAG, ExtraSteps)) {
10752 SDLoc DL(Operand);
10753 EVT VT = Operand.getValueType();
10754
10755 SDNodeFlags Flags;
10756 Flags.setAllowReassociation(true);
10757
10758 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
10759 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
10760 for (int i = ExtraSteps; i > 0; --i) {
10761 SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
10762 Flags);
10763 Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
10764 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
10765 }
10766 if (!Reciprocal)
10767 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
10768
10769 ExtraSteps = 0;
10770 return Estimate;
10771 }
10772
10773 return SDValue();
10774}
10775
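// Editor's illustrative sketch (not part of the original AArch64ISelLowering.cpp
// source): the scalar arithmetic of the FRSQRTE/FRSQRTS loop built above.
// FRSQRTS computes (3 - M * N) / 2, so each step is E = E * (3 - X * E * E) / 2
// and roughly doubles the number of correct bits of the 1/sqrt(X) estimate.
static inline double refineRSqrtSketch(double X, double E, int Steps) {
  for (int I = 0; I < Steps; ++I)
    E = E * (3.0 - X * E * E) * 0.5;
  return E; // ~1/sqrt(X); the non-reciprocal path multiplies by X to get sqrt(X)
}
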
10776SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
10777 SelectionDAG &DAG, int Enabled,
10778 int &ExtraSteps) const {
10779 if (Enabled == ReciprocalEstimate::Enabled)
10780 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
10781 DAG, ExtraSteps)) {
10782 SDLoc DL(Operand);
10783 EVT VT = Operand.getValueType();
10784
10785 SDNodeFlags Flags;
10786 Flags.setAllowReassociation(true);
10787
10788 // Newton reciprocal iteration: E * (2 - X * E)
10789 // AArch64 reciprocal iteration instruction: (2 - M * N)
10790 for (int i = ExtraSteps; i > 0; --i) {
10791 SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
10792 Estimate, Flags);
10793 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
10794 }
10795
10796 ExtraSteps = 0;
10797 return Estimate;
10798 }
10799
10800 return SDValue();
10801}
10802
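// Editor's illustrative sketch (not part of the original AArch64ISelLowering.cpp
// source): the scalar arithmetic of the FRECPE/FRECPS loop built above. FRECPS
// computes 2 - M * N, so each step is E = E * (2 - X * E).
static inline double refineRecipSketch(double X, double E, int Steps) {
  for (int I = 0; I < Steps; ++I)
    E = E * (2.0 - X * E);
  return E; // ~1/X
}
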
10803//===----------------------------------------------------------------------===//
10804// AArch64 Inline Assembly Support
10805//===----------------------------------------------------------------------===//
10806
10807// Table of Constraints
10808// TODO: This is the current set of constraints supported by ARM for the
10809 // compiler; not all of them may make sense.
10810//
10811// r - A general register
10812// w - An FP/SIMD register of some size in the range v0-v31
10813// x - An FP/SIMD register of some size in the range v0-v15
10814// I - Constant that can be used with an ADD instruction
10815// J - Constant that can be used with a SUB instruction
10816// K - Constant that can be used with a 32-bit logical instruction
10817// L - Constant that can be used with a 64-bit logical instruction
10818// M - Constant that can be used as a 32-bit MOV immediate
10819// N - Constant that can be used as a 64-bit MOV immediate
10820// Q - A memory reference with base register and no offset
10821// S - A symbolic address
10822// Y - Floating point constant zero
10823// Z - Integer constant zero
10824//
10825// Note that general register operands will be output using their 64-bit x
10826// register name, whatever the size of the variable, unless the asm operand
10827// is prefixed by the %w modifier. Floating-point and SIMD register operands
10828// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
10829// %q modifier.
10830const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
10831 // At this point, we have to lower this constraint to something else, so we
10832 // lower it to an "r" or "w". However, by doing this we will force the result
10833 // to be in register, while the X constraint is much more permissive.
10834 //
10835 // Although we are correct (we are free to emit anything, without
10836 // constraints), we might break use cases that would expect us to be more
10837 // efficient and emit something else.
10838 if (!Subtarget->hasFPARMv8())
10839 return "r";
10840
10841 if (ConstraintVT.isFloatingPoint())
10842 return "w";
10843
10844 if (ConstraintVT.isVector() &&
10845 (ConstraintVT.getSizeInBits() == 64 ||
10846 ConstraintVT.getSizeInBits() == 128))
10847 return "w";
10848
10849 return "r";
10850}
10851
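// Editor's illustrative usage sketch (not part of the original
// AArch64ISelLowering.cpp source): how the constraints and modifiers documented
// above look in user code. "w" requests any FP/SIMD register and the %s
// modifier prints its 32-bit S-register name; this only assembles when
// targeting AArch64.
static inline float inlineAsmAddSketch(float A, float B) {
  float R;
  asm("fadd %s0, %s1, %s2" : "=w"(R) : "w"(A), "w"(B));
  return R;
}
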
10852 enum class PredicateConstraint { Uph, Upl, Upa };
10853
10854static std::optional<PredicateConstraint>
10855 parsePredicateConstraint(StringRef Constraint) {
10856 return StringSwitch<std::optional<PredicateConstraint>>(Constraint)
10857 .Case("Uph", PredicateConstraint::Uph)
10858 .Case("Upl", PredicateConstraint::Upl)
10859 .Case("Upa", PredicateConstraint::Upa)
10860 .Default(std::nullopt);
10861}
10862
10863static const TargetRegisterClass *
10864 getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT) {
10865 if (VT != MVT::aarch64svcount &&
10866 (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1))
10867 return nullptr;
10868
10869 switch (Constraint) {
10870 case PredicateConstraint::Uph:
10871 return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass
10872 : &AArch64::PPR_p8to15RegClass;
10873 case PredicateConstraint::Upl:
10874 return VT == MVT::aarch64svcount ? &AArch64::PNR_3bRegClass
10875 : &AArch64::PPR_3bRegClass;
10876 case PredicateConstraint::Upa:
10877 return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass
10878 : &AArch64::PPRRegClass;
10879 }
10880
10881 llvm_unreachable("Missing PredicateConstraint!");
10882}
10883
10884 enum class ReducedGprConstraint { Uci, Ucj };
10885
10886static std::optional<ReducedGprConstraint>
10887 parseReducedGprConstraint(StringRef Constraint) {
10888 return StringSwitch<std::optional<ReducedGprConstraint>>(Constraint)
10889 .Case("Uci", ReducedGprConstraint::Uci)
10890 .Case("Ucj", ReducedGprConstraint::Ucj)
10891 .Default(std::nullopt);
10892}
10893
10894static const TargetRegisterClass *
10895 getReducedGprRegisterClass(ReducedGprConstraint Constraint, EVT VT) {
10896 if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64)
10897 return nullptr;
10898
10899 switch (Constraint) {
10900 case ReducedGprConstraint::Uci:
10901 return &AArch64::MatrixIndexGPR32_8_11RegClass;
10902 case ReducedGprConstraint::Ucj:
10903 return &AArch64::MatrixIndexGPR32_12_15RegClass;
10904 }
10905
10906 llvm_unreachable("Missing ReducedGprConstraint!");
10907}
10908
10909// The set of cc code supported is from
10910// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands
10911 static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint) {
10912 AArch64CC::CondCode Cond = StringSwitch<AArch64CC::CondCode>(Constraint)
10913 .Case("{@cchi}", AArch64CC::HI)
10914 .Case("{@cccs}", AArch64CC::HS)
10915 .Case("{@cclo}", AArch64CC::LO)
10916 .Case("{@ccls}", AArch64CC::LS)
10917 .Case("{@cccc}", AArch64CC::LO)
10918 .Case("{@cceq}", AArch64CC::EQ)
10919 .Case("{@ccgt}", AArch64CC::GT)
10920 .Case("{@ccge}", AArch64CC::GE)
10921 .Case("{@cclt}", AArch64CC::LT)
10922 .Case("{@ccle}", AArch64CC::LE)
10923 .Case("{@cchs}", AArch64CC::HS)
10924 .Case("{@ccne}", AArch64CC::NE)
10925 .Case("{@ccvc}", AArch64CC::VC)
10926 .Case("{@ccpl}", AArch64CC::PL)
10927 .Case("{@ccvs}", AArch64CC::VS)
10928 .Case("{@ccmi}", AArch64CC::MI)
10929 .Default(AArch64CC::Invalid);
10930 return Cond;
10931}
10932
10933/// Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR,
10934/// WZR, invert(<cond>)'.
10935 static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL,
10936 SelectionDAG &DAG) {
10937 return DAG.getNode(
10938 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
10939 DAG.getConstant(0, DL, MVT::i32),
10940 DAG.getConstant(getInvertedCondCode(CC), DL, MVT::i32), NZCV);
10941}
10942
10943// Lower @cc flag output via getSETCC.
10944SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
10945 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
10946 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
10947 AArch64CC::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
10948 if (Cond == AArch64CC::Invalid)
10949 return SDValue();
10950 // The output variable should be a scalar integer.
10951 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
10952 OpInfo.ConstraintVT.getSizeInBits() < 8)
10953 report_fatal_error("Flag output operand is of invalid type");
10954
10955 // Get NZCV register. Only update chain when copyfrom is glued.
10956 if (Glue.getNode()) {
10957 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32, Glue);
10958 Chain = Glue.getValue(1);
10959 } else
10960 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32);
10961 // Extract CC code.
10962 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
10963
10964 SDValue Result;
10965
10966 // Truncate or ZERO_EXTEND based on value types.
10967 if (OpInfo.ConstraintVT.getSizeInBits() <= 32)
10968 Result = DAG.getNode(ISD::TRUNCATE, DL, OpInfo.ConstraintVT, CC);
10969 else
10970 Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
10971
10972 return Result;
10973}
10974
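// Editor's illustrative usage sketch (not part of the original
// AArch64ISelLowering.cpp source): a flag-output operand as handled above.
// "=@cceq" asks for the EQ condition of NZCV, which is materialized into a
// scalar integer by the CSINC/CSET sequence that getSETCC builds; this only
// assembles when targeting AArch64.
static inline int compareEqualSketch(long A, long B) {
  int Eq;
  asm("cmp %1, %2" : "=@cceq"(Eq) : "r"(A), "r"(B));
  return Eq; // 1 if A == B, otherwise 0
}
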
10975/// getConstraintType - Given a constraint letter, return the type of
10976/// constraint it is for this target.
10978AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
10979 if (Constraint.size() == 1) {
10980 switch (Constraint[0]) {
10981 default:
10982 break;
10983 case 'x':
10984 case 'w':
10985 case 'y':
10986 return C_RegisterClass;
10987 // An address with a single base register. Due to the way we
10988 // currently handle addresses it is the same as 'r'.
10989 case 'Q':
10990 return C_Memory;
10991 case 'I':
10992 case 'J':
10993 case 'K':
10994 case 'L':
10995 case 'M':
10996 case 'N':
10997 case 'Y':
10998 case 'Z':
10999 return C_Immediate;
11000 case 'z':
11001 case 'S': // A symbol or label reference with a constant offset
11002 return C_Other;
11003 }
11004 } else if (parsePredicateConstraint(Constraint))
11005 return C_RegisterClass;
11006 else if (parseReducedGprConstraint(Constraint))
11007 return C_RegisterClass;
11008 else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
11009 return C_Other;
11010 return TargetLowering::getConstraintType(Constraint);
11011}
11012
11013/// Examine constraint type and operand type and determine a weight value.
11014/// This object must already have been set up with the operand type
11015/// and the current alternative constraint selected.
11016 TargetLowering::ConstraintWeight
11017 AArch64TargetLowering::getSingleConstraintMatchWeight(
11018 AsmOperandInfo &info, const char *constraint) const {
11019 ConstraintWeight weight = CW_Invalid;
11020 Value *CallOperandVal = info.CallOperandVal;
11021 // If we don't have a value, we can't do a match,
11022 // but allow it at the lowest weight.
11023 if (!CallOperandVal)
11024 return CW_Default;
11025 Type *type = CallOperandVal->getType();
11026 // Look at the constraint type.
11027 switch (*constraint) {
11028 default:
11029 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
11030 break;
11031 case 'x':
11032 case 'w':
11033 case 'y':
11034 if (type->isFloatingPointTy() || type->isVectorTy())
11035 weight = CW_Register;
11036 break;
11037 case 'z':
11038 weight = CW_Constant;
11039 break;
11040 case 'U':
11041 if (parsePredicateConstraint(constraint) ||
11042 parseReducedGprConstraint(constraint))
11043 weight = CW_Register;
11044 break;
11045 }
11046 return weight;
11047}
11048
11049std::pair<unsigned, const TargetRegisterClass *>
11050AArch64TargetLowering::getRegForInlineAsmConstraint(
11051 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
11052 if (Constraint.size() == 1) {
11053 switch (Constraint[0]) {
11054 case 'r':
11055 if (VT.isScalableVector())
11056 return std::make_pair(0U, nullptr);
11057 if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
11058 return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
11059 if (VT.getFixedSizeInBits() == 64)
11060 return std::make_pair(0U, &AArch64::GPR64commonRegClass);
11061 return std::make_pair(0U, &AArch64::GPR32commonRegClass);
11062 case 'w': {
11063 if (!Subtarget->hasFPARMv8())
11064 break;
11065 if (VT.isScalableVector()) {
11066 if (VT.getVectorElementType() != MVT::i1)
11067 return std::make_pair(0U, &AArch64::ZPRRegClass);
11068 return std::make_pair(0U, nullptr);
11069 }
11070 uint64_t VTSize = VT.getFixedSizeInBits();
11071 if (VTSize == 16)
11072 return std::make_pair(0U, &AArch64::FPR16RegClass);
11073 if (VTSize == 32)
11074 return std::make_pair(0U, &AArch64::FPR32RegClass);
11075 if (VTSize == 64)
11076 return std::make_pair(0U, &AArch64::FPR64RegClass);
11077 if (VTSize == 128)
11078 return std::make_pair(0U, &AArch64::FPR128RegClass);
11079 break;
11080 }
11081 // The instructions that this constraint is designed for can
11082 // only take 128-bit registers so just use that regclass.
11083 case 'x':
11084 if (!Subtarget->hasFPARMv8())
11085 break;
11086 if (VT.isScalableVector())
11087 return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
11088 if (VT.getSizeInBits() == 128)
11089 return std::make_pair(0U, &AArch64::FPR128_loRegClass);
11090 break;
11091 case 'y':
11092 if (!Subtarget->hasFPARMv8())
11093 break;
11094 if (VT.isScalableVector())
11095 return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
11096 break;
11097 }
11098 } else {
11099 if (const auto PC = parsePredicateConstraint(Constraint))
11100 if (const auto *RegClass = getPredicateRegisterClass(*PC, VT))
11101 return std::make_pair(0U, RegClass);
11102
11103 if (const auto RGC = parseReducedGprConstraint(Constraint))
11104 if (const auto *RegClass = getReducedGprRegisterClass(*RGC, VT))
11105 return std::make_pair(0U, RegClass);
11106 }
11107 if (StringRef("{cc}").equals_insensitive(Constraint) ||
11108 parseConstraintCode(Constraint) != AArch64CC::Invalid)
11109 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
11110
11111 if (Constraint == "{za}") {
11112 return std::make_pair(unsigned(AArch64::ZA), &AArch64::MPRRegClass);
11113 }
11114
11115 if (Constraint == "{zt0}") {
11116 return std::make_pair(unsigned(AArch64::ZT0), &AArch64::ZTRRegClass);
11117 }
11118
11119 // Use the default implementation in TargetLowering to convert the register
11120 // constraint into a member of a register class.
11121 std::pair<unsigned, const TargetRegisterClass *> Res;
11122 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
11123
11124 // Not found as a standard register?
11125 if (!Res.second) {
11126 unsigned Size = Constraint.size();
11127 if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
11128 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
11129 int RegNo;
11130 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
11131 if (!Failed && RegNo >= 0 && RegNo <= 31) {
11132 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
11133 // By default we'll emit v0-v31 for this unless there's a modifier where
11134 // we'll emit the correct register as well.
11135 if (VT != MVT::Other && VT.getSizeInBits() == 64) {
11136 Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
11137 Res.second = &AArch64::FPR64RegClass;
11138 } else {
11139 Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
11140 Res.second = &AArch64::FPR128RegClass;
11141 }
11142 }
11143 }
11144 }
11145
11146 if (Res.second && !Subtarget->hasFPARMv8() &&
11147 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
11148 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
11149 return std::make_pair(0U, nullptr);
11150
11151 return Res;
11152}
11153
11154 EVT AArch64TargetLowering::getAsmOperandValueType(const DataLayout &DL,
11155 llvm::Type *Ty,
11156 bool AllowUnknown) const {
11157 if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
11158 return EVT(MVT::i64x8);
11159
11160 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
11161}
11162
11163/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
11164/// vector. If it is invalid, don't add anything to Ops.
11165void AArch64TargetLowering::LowerAsmOperandForConstraint(
11166 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
11167 SelectionDAG &DAG) const {
11168 SDValue Result;
11169
11170 // Currently only support length 1 constraints.
11171 if (Constraint.size() != 1)
11172 return;
11173
11174 char ConstraintLetter = Constraint[0];
11175 switch (ConstraintLetter) {
11176 default:
11177 break;
11178
11179 // This set of constraints deal with valid constants for various instructions.
11180 // Validate and return a target constant for them if we can.
11181 case 'z': {
11182 // 'z' maps to xzr or wzr so it needs an input of 0.
11183 if (!isNullConstant(Op))
11184 return;
11185
11186 if (Op.getValueType() == MVT::i64)
11187 Result = DAG.getRegister(AArch64::XZR, MVT::i64);
11188 else
11189 Result = DAG.getRegister(AArch64::WZR, MVT::i32);
11190 break;
11191 }
11192 case 'S':
11193 // Use the generic code path for "s". In GCC's aarch64 port, "S" is
11194 // supported for PIC while "s" isn't, making "s" less useful. We implement
11195 // "S" but not "s".
11196 TargetLowering::LowerAsmOperandForConstraint(Op, "s", Ops, DAG);
11197 break;
11198
11199 case 'I':
11200 case 'J':
11201 case 'K':
11202 case 'L':
11203 case 'M':
11204 case 'N':
11205 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
11206 if (!C)
11207 return;
11208
11209 // Grab the value and do some validation.
11210 uint64_t CVal = C->getZExtValue();
11211 switch (ConstraintLetter) {
11212 // The I constraint applies only to simple ADD or SUB immediate operands:
11213 // i.e. 0 to 4095 with optional shift by 12
11214 // The J constraint applies only to ADD or SUB immediates that would be
11215 // valid when negated, i.e. if [an add pattern] were to be output as a SUB
11216 // instruction [or vice versa], in other words -1 to -4095 with optional
11217 // left shift by 12.
11218 case 'I':
11219 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
11220 break;
11221 return;
11222 case 'J': {
11223 uint64_t NVal = -C->getSExtValue();
11224 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
11225 CVal = C->getSExtValue();
11226 break;
11227 }
11228 return;
11229 }
11230 // The K and L constraints apply *only* to logical immediates, including
11231 // what used to be the MOVI alias for ORR (though the MOVI alias has now
11232 // been removed and MOV should be used). So these constraints have to
11233 // distinguish between bit patterns that are valid 32-bit or 64-bit
11234 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
11235 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
11236 // versa.
11237 case 'K':
11238 if (AArch64_AM::isLogicalImmediate(CVal, 32))
11239 break;
11240 return;
11241 case 'L':
11242 if (AArch64_AM::isLogicalImmediate(CVal, 64))
11243 break;
11244 return;
11245 // The M and N constraints are a superset of K and L respectively, for use
11246 // with the MOV (immediate) alias. As well as the logical immediates they
11247 // also match 32 or 64-bit immediates that can be loaded either using a
11248 // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
11249 // (M) or 64-bit 0x1234000000000000 (N) etc.
11250 // As a note some of this code is liberally stolen from the asm parser.
11251 case 'M': {
11252 if (!isUInt<32>(CVal))
11253 return;
11254 if (AArch64_AM::isLogicalImmediate(CVal, 32))
11255 break;
11256 if ((CVal & 0xFFFF) == CVal)
11257 break;
11258 if ((CVal & 0xFFFF0000ULL) == CVal)
11259 break;
11260 uint64_t NCVal = ~(uint32_t)CVal;
11261 if ((NCVal & 0xFFFFULL) == NCVal)
11262 break;
11263 if ((NCVal & 0xFFFF0000ULL) == NCVal)
11264 break;
11265 return;
11266 }
11267 case 'N': {
11268 if (AArch64_AM::isLogicalImmediate(CVal, 64))
11269 break;
11270 if ((CVal & 0xFFFFULL) == CVal)
11271 break;
11272 if ((CVal & 0xFFFF0000ULL) == CVal)
11273 break;
11274 if ((CVal & 0xFFFF00000000ULL) == CVal)
11275 break;
11276 if ((CVal & 0xFFFF000000000000ULL) == CVal)
11277 break;
11278 uint64_t NCVal = ~CVal;
11279 if ((NCVal & 0xFFFFULL) == NCVal)
11280 break;
11281 if ((NCVal & 0xFFFF0000ULL) == NCVal)
11282 break;
11283 if ((NCVal & 0xFFFF00000000ULL) == NCVal)
11284 break;
11285 if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
11286 break;
11287 return;
11288 }
11289 default:
11290 return;
11291 }
11292
11293 // All assembler immediates are 64-bit integers.
11294 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
11295 break;
11296 }
11297
11298 if (Result.getNode()) {
11299 Ops.push_back(Result);
11300 return;
11301 }
11302
11303 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
11304}
11305
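// Editor's illustrative sketch (not part of the original AArch64ISelLowering.cpp
// source): the arithmetic behind the 'I' and 'J' checks above. A valid ADD/SUB
// immediate is a 12-bit value, optionally shifted left by 12; 'J' applies the
// same test to the negated value.
static inline bool isAddSubImmediateSketch(uint64_t V) {
  return V < (1u << 12) || ((V & 0xfff) == 0 && (V >> 12) < (1u << 12));
}
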
11306//===----------------------------------------------------------------------===//
11307// AArch64 Advanced SIMD Support
11308//===----------------------------------------------------------------------===//
11309
11310/// WidenVector - Given a value in the V64 register class, produce the
11311/// equivalent value in the V128 register class.
11313 EVT VT = V64Reg.getValueType();
11314 unsigned NarrowSize = VT.getVectorNumElements();
11315 MVT EltTy = VT.getVectorElementType().getSimpleVT();
11316 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
11317 SDLoc DL(V64Reg);
11318
11319 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
11320 V64Reg, DAG.getConstant(0, DL, MVT::i64));
11321}
11322
11323/// getExtFactor - Determine the adjustment factor for the position when
11324/// generating an "extract from vector registers" instruction.
11325static unsigned getExtFactor(SDValue &V) {
11326 EVT EltType = V.getValueType().getVectorElementType();
11327 return EltType.getSizeInBits() / 8;
11328}
11329
11330// Check if a vector is built from one vector via extracted elements of
11331// another together with an AND mask, ensuring that all elements fit
11332// within range. This can be reconstructed using AND and NEON's TBL1.
11333 static SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG) {
11334 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
11335 SDLoc dl(Op);
11336 EVT VT = Op.getValueType();
11337 assert(!VT.isScalableVector() &&
11338 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
11339
11340 // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map
11341 // directly to TBL1.
11342 if (VT != MVT::v16i8 && VT != MVT::v8i8)
11343 return SDValue();
11344
11345 unsigned NumElts = VT.getVectorNumElements();
11346 assert((NumElts == 8 || NumElts == 16) &&
11347 "Need to have exactly 8 or 16 elements in vector.");
11348
11349 SDValue SourceVec;
11350 SDValue MaskSourceVec;
11351 SmallVector<SDValue, 16> AndMaskConstants;
11352
11353 for (unsigned i = 0; i < NumElts; ++i) {
11354 SDValue V = Op.getOperand(i);
11355 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11356 return SDValue();
11357
11358 SDValue OperandSourceVec = V.getOperand(0);
11359 if (!SourceVec)
11360 SourceVec = OperandSourceVec;
11361 else if (SourceVec != OperandSourceVec)
11362 return SDValue();
11363
11364 // This only looks at shuffles with elements that are
11365 // a) truncated by a constant AND mask extracted from a mask vector, or
11366 // b) extracted directly from a mask vector.
11367 SDValue MaskSource = V.getOperand(1);
11368 if (MaskSource.getOpcode() == ISD::AND) {
11369 if (!isa<ConstantSDNode>(MaskSource.getOperand(1)))
11370 return SDValue();
11371
11372 AndMaskConstants.push_back(MaskSource.getOperand(1));
11373 MaskSource = MaskSource->getOperand(0);
11374 } else if (!AndMaskConstants.empty()) {
11375 // Either all or no operands should have an AND mask.
11376 return SDValue();
11377 }
11378
11379 // An ANY_EXTEND may be inserted between the AND and the source vector
11380 // extraction. We don't care about that, so we can just skip it.
11381 if (MaskSource.getOpcode() == ISD::ANY_EXTEND)
11382 MaskSource = MaskSource.getOperand(0);
11383
11384 if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11385 return SDValue();
11386
11387 SDValue MaskIdx = MaskSource.getOperand(1);
11388 if (!isa<ConstantSDNode>(MaskIdx) ||
11389 !cast<ConstantSDNode>(MaskIdx)->getConstantIntValue()->equalsInt(i))
11390 return SDValue();
11391
11392 // We only apply this if all elements come from the same vector with the
11393 // same vector type.
11394 if (!MaskSourceVec) {
11395 MaskSourceVec = MaskSource->getOperand(0);
11396 if (MaskSourceVec.getValueType() != VT)
11397 return SDValue();
11398 } else if (MaskSourceVec != MaskSource->getOperand(0)) {
11399 return SDValue();
11400 }
11401 }
11402
11403 // We need a v16i8 for TBL, so we extend the source with a placeholder vector
11404 // for v8i8 to get a v16i8. As the pattern we are replacing is extract +
11405 // insert, we know that the index in the mask must be smaller than the number
11406 // of elements in the source, or we would have an out-of-bounds access.
11407 if (NumElts == 8)
11408 SourceVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, SourceVec,
11409 DAG.getUNDEF(VT));
11410
11411 // Preconditions met, so we can use a vector (AND +) TBL to build this vector.
11412 if (!AndMaskConstants.empty())
11413 MaskSourceVec = DAG.getNode(ISD::AND, dl, VT, MaskSourceVec,
11414 DAG.getBuildVector(VT, dl, AndMaskConstants));
11415
11416 return DAG.getNode(
11417 ISD::INTRINSIC_WO_CHAIN, dl, VT,
11418 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, dl, MVT::i32), SourceVec,
11419 MaskSourceVec);
11420}
11421
11422// Gather data to see if the operation can be modelled as a
11423// shuffle in combination with VEXTs.
11424 SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
11425 SelectionDAG &DAG) const {
11426 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
11427 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
11428 SDLoc dl(Op);
11429 EVT VT = Op.getValueType();
11430 assert(!VT.isScalableVector() &&
11431 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
11432 unsigned NumElts = VT.getVectorNumElements();
11433
11434 struct ShuffleSourceInfo {
11435 SDValue Vec;
11436 unsigned MinElt;
11437 unsigned MaxElt;
11438
11439 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
11440 // be compatible with the shuffle we intend to construct. As a result
11441 // ShuffleVec will be some sliding window into the original Vec.
11442 SDValue ShuffleVec;
11443
11444 // Code should guarantee that element i in Vec starts at element "WindowBase
11445 // + i * WindowScale in ShuffleVec".
11446 int WindowBase;
11447 int WindowScale;
11448
11449 ShuffleSourceInfo(SDValue Vec)
11450 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
11451 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
11452
11453 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
11454 };
11455
11456 // First gather all vectors used as an immediate source for this BUILD_VECTOR
11457 // node.
11459 for (unsigned i = 0; i < NumElts; ++i) {
11460 SDValue V = Op.getOperand(i);
11461 if (V.isUndef())
11462 continue;
11463 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11464 !isa<ConstantSDNode>(V.getOperand(1)) ||
11465 V.getOperand(0).getValueType().isScalableVector()) {
11466 LLVM_DEBUG(
11467 dbgs() << "Reshuffle failed: "
11468 "a shuffle can only come from building a vector from "
11469 "various elements of other fixed-width vectors, provided "
11470 "their indices are constant\n");
11471 return SDValue();
11472 }
11473
11474 // Add this element source to the list if it's not already there.
11475 SDValue SourceVec = V.getOperand(0);
11476 auto Source = find(Sources, SourceVec);
11477 if (Source == Sources.end())
11478 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
11479
11480 // Update the minimum and maximum lane number seen.
11481 unsigned EltNo = V.getConstantOperandVal(1);
11482 Source->MinElt = std::min(Source->MinElt, EltNo);
11483 Source->MaxElt = std::max(Source->MaxElt, EltNo);
11484 }
11485
11486 // If we have 3 or 4 sources, try to generate a TBL, which will at least be
11487 // better than moving to/from gpr registers for larger vectors.
11488 if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
11489 // Construct a mask for the tbl. We may need to adjust the index for types
11490 // larger than i8.
11492 unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
11493 for (unsigned I = 0; I < NumElts; ++I) {
11494 SDValue V = Op.getOperand(I);
11495 if (V.isUndef()) {
11496 for (unsigned OF = 0; OF < OutputFactor; OF++)
11497 Mask.push_back(-1);
11498 continue;
11499 }
11500 // Set the Mask lanes adjusted for the size of the input and output
11501 // lanes. The Mask is always i8, so it will set OutputFactor lanes per
11502 // output element, adjusted in their positions per input and output types.
11503 unsigned Lane = V.getConstantOperandVal(1);
11504 for (unsigned S = 0; S < Sources.size(); S++) {
11505 if (V.getOperand(0) == Sources[S].Vec) {
11506 unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
11507 unsigned InputBase = 16 * S + Lane * InputSize / 8;
11508 for (unsigned OF = 0; OF < OutputFactor; OF++)
11509 Mask.push_back(InputBase + OF);
11510 break;
11511 }
11512 }
11513 }
11514
11515 // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
11516 // v16i8, and the TBLMask
11517 SmallVector<SDValue, 16> TBLOperands;
11518 TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
11519 ? Intrinsic::aarch64_neon_tbl3
11520 : Intrinsic::aarch64_neon_tbl4,
11521 dl, MVT::i32));
11522 for (unsigned i = 0; i < Sources.size(); i++) {
11523 SDValue Src = Sources[i].Vec;
11524 EVT SrcVT = Src.getValueType();
11525 Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
11526 assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
11527 "Expected a legally typed vector");
11528 if (SrcVT.is64BitVector())
11529 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, Src,
11530 DAG.getUNDEF(MVT::v8i8));
11531 TBLOperands.push_back(Src);
11532 }
11533
11534 SmallVector<SDValue, 16> TBLMask;
11535 for (unsigned i = 0; i < Mask.size(); i++)
11536 TBLMask.push_back(DAG.getConstant(Mask[i], dl, MVT::i32));
11537 assert((Mask.size() == 8 || Mask.size() == 16) &&
11538 "Expected a v8i8 or v16i8 Mask");
11539 TBLOperands.push_back(
11540 DAG.getBuildVector(Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, dl, TBLMask));
11541
11542 SDValue Shuffle =
11543 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
11544 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
11545 return DAG.getBitcast(VT, Shuffle);
11546 }
11547
11548 if (Sources.size() > 2) {
11549 LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
11550 << "sensible when at most two source vectors are "
11551 << "involved\n");
11552 return SDValue();
11553 }
11554
11555 // Find out the smallest element size among result and two sources, and use
11556 // it as element size to build the shuffle_vector.
11557 EVT SmallestEltTy = VT.getVectorElementType();
11558 for (auto &Source : Sources) {
11559 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
11560 if (SrcEltTy.bitsLT(SmallestEltTy)) {
11561 SmallestEltTy = SrcEltTy;
11562 }
11563 }
11564 unsigned ResMultiplier =
11565 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
11566 uint64_t VTSize = VT.getFixedSizeInBits();
11567 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
11568 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
11569
11570 // If the source vector is too wide or too narrow, we may nevertheless be able
11571 // to construct a compatible shuffle either by concatenating it with UNDEF or
11572 // extracting a suitable range of elements.
11573 for (auto &Src : Sources) {
11574 EVT SrcVT = Src.ShuffleVec.getValueType();
11575
11576 TypeSize SrcVTSize = SrcVT.getSizeInBits();
11577 if (SrcVTSize == TypeSize::getFixed(VTSize))
11578 continue;
11579
11580 // This stage of the search produces a source with the same element type as
11581 // the original, but with a total width matching the BUILD_VECTOR output.
11582 EVT EltVT = SrcVT.getVectorElementType();
11583 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
11584 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
11585
11586 if (SrcVTSize.getFixedValue() < VTSize) {
11587 assert(2 * SrcVTSize == VTSize);
11588 // We can pad out the smaller vector for free, so if it's part of a
11589 // shuffle...
11590 Src.ShuffleVec =
11591 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
11592 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
11593 continue;
11594 }
11595
11596 if (SrcVTSize.getFixedValue() != 2 * VTSize) {
11597 LLVM_DEBUG(
11598 dbgs() << "Reshuffle failed: result vector too small to extract\n");
11599 return SDValue();
11600 }
11601
11602 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
11603 LLVM_DEBUG(
11604 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
11605 return SDValue();
11606 }
11607
11608 if (Src.MinElt >= NumSrcElts) {
11609 // The extraction can just take the second half
11610 Src.ShuffleVec =
11611 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11612 DAG.getConstant(NumSrcElts, dl, MVT::i64));
11613 Src.WindowBase = -NumSrcElts;
11614 } else if (Src.MaxElt < NumSrcElts) {
11615 // The extraction can just take the first half
11616 Src.ShuffleVec =
11617 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11618 DAG.getConstant(0, dl, MVT::i64));
11619 } else {
11620 // An actual VEXT is needed
11621 SDValue VEXTSrc1 =
11622 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11623 DAG.getConstant(0, dl, MVT::i64));
11624 SDValue VEXTSrc2 =
11625 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
11626 DAG.getConstant(NumSrcElts, dl, MVT::i64));
11627 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
11628
11629 if (!SrcVT.is64BitVector()) {
11630 LLVM_DEBUG(
11631 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
11632 "for SVE vectors.");
11633 return SDValue();
11634 }
11635
11636 Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
11637 VEXTSrc2,
11638 DAG.getConstant(Imm, dl, MVT::i32));
11639 Src.WindowBase = -Src.MinElt;
11640 }
11641 }
11642
11643 // Another possible incompatibility occurs from the vector element types. We
11644 // can fix this by bitcasting the source vectors to the same type we intend
11645 // for the shuffle.
11646 for (auto &Src : Sources) {
11647 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
11648 if (SrcEltTy == SmallestEltTy)
11649 continue;
11650 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
11651 if (DAG.getDataLayout().isBigEndian()) {
11652 Src.ShuffleVec =
11653 DAG.getNode(AArch64ISD::NVCAST, dl, ShuffleVT, Src.ShuffleVec);
11654 } else {
11655 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
11656 }
11657 Src.WindowScale =
11658 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
11659 Src.WindowBase *= Src.WindowScale;
11660 }
11661
11662 // Final check before we try to actually produce a shuffle.
11663 LLVM_DEBUG(for (auto Src
11664 : Sources)
11665 assert(Src.ShuffleVec.getValueType() == ShuffleVT););
11666
11667 // The stars all align, our next step is to produce the mask for the shuffle.
11668 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
11669 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
11670 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
11671 SDValue Entry = Op.getOperand(i);
11672 if (Entry.isUndef())
11673 continue;
11674
11675 auto Src = find(Sources, Entry.getOperand(0));
11676 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
11677
11678 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
11679 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
11680 // segment.
11681 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
11682 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
11683 VT.getScalarSizeInBits());
11684 int LanesDefined = BitsDefined / BitsPerShuffleLane;
11685
11686 // This source is expected to fill ResMultiplier lanes of the final shuffle,
11687 // starting at the appropriate offset.
11688 int *LaneMask = &Mask[i * ResMultiplier];
11689
11690 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
11691 ExtractBase += NumElts * (Src - Sources.begin());
11692 for (int j = 0; j < LanesDefined; ++j)
11693 LaneMask[j] = ExtractBase + j;
11694 }
11695
11696 // Final check before we try to produce nonsense...
11697 if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
11698 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
11699 return SDValue();
11700 }
11701
11702 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
11703 for (unsigned i = 0; i < Sources.size(); ++i)
11704 ShuffleOps[i] = Sources[i].ShuffleVec;
11705
11706 SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
11707 ShuffleOps[1], Mask);
11708 SDValue V;
11709 if (DAG.getDataLayout().isBigEndian()) {
11710 V = DAG.getNode(AArch64ISD::NVCAST, dl, VT, Shuffle);
11711 } else {
11712 V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
11713 }
11714
11715 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
11716 dbgs() << "Reshuffle, creating node: "; V.dump(););
11717
11718 return V;
11719}
11720
11721// check if an EXT instruction can handle the shuffle mask when the
11722// vector sources of the shuffle are the same.
11723static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
11724 unsigned NumElts = VT.getVectorNumElements();
11725
11726 // Assume that the first shuffle index is not UNDEF. Fail if it is.
11727 if (M[0] < 0)
11728 return false;
11729
11730 Imm = M[0];
11731
11732 // If this is a VEXT shuffle, the immediate value is the index of the first
11733 // element. The other shuffle indices must be the successive elements after
11734 // the first one.
11735 unsigned ExpectedElt = Imm;
11736 for (unsigned i = 1; i < NumElts; ++i) {
11737 // Increment the expected index. If it wraps around, just follow it
11738 // back to index zero and keep going.
11739 ++ExpectedElt;
11740 if (ExpectedElt == NumElts)
11741 ExpectedElt = 0;
11742
11743 if (M[i] < 0)
11744 continue; // ignore UNDEF indices
11745 if (ExpectedElt != static_cast<unsigned>(M[i]))
11746 return false;
11747 }
11748
11749 return true;
11750}
11751
11752// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
11753// v4i32s. This is really a truncate, which we can construct out of (legal)
11754// concats and truncate nodes.
11755 static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG) {
11756 if (V.getValueType() != MVT::v16i8)
11757 return SDValue();
11758 assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
11759
11760 for (unsigned X = 0; X < 4; X++) {
11761 // Check the first item in each group is an extract from lane 0 of a v4i32
11762 // or v4i16.
11763 SDValue BaseExt = V.getOperand(X * 4);
11764 if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11765 (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
11766 BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
11767 !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
11768 BaseExt.getConstantOperandVal(1) != 0)
11769 return SDValue();
11770 SDValue Base = BaseExt.getOperand(0);
11771 // And check the other items are extracts from the same vector.
11772 for (unsigned Y = 1; Y < 4; Y++) {
11773 SDValue Ext = V.getOperand(X * 4 + Y);
11774 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11775 Ext.getOperand(0) != Base ||
11776 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
11777 Ext.getConstantOperandVal(1) != Y)
11778 return SDValue();
11779 }
11780 }
11781
11782 // Turn the buildvector into a series of truncates and concats, which will
11783 // become uzp1s. Any v4i32s we found get truncated to v4i16, which are
11784 // concatenated together to produce 2 v8i16. These are both truncated and
11785 // concatenated together.
11786 SDLoc DL(V);
11787 SDValue Trunc[4] = {
11788 V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
11789 V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
11790 for (SDValue &V : Trunc)
11791 if (V.getValueType() == MVT::v4i32)
11792 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
11793 SDValue Concat0 =
11794 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
11795 SDValue Concat1 =
11796 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
11797 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
11798 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
11799 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
11800}
11801
11802 /// Check if a vector shuffle corresponds to a DUP instruction with a larger
11803 /// element width than the vector lane type. If that is the case, the function
11804 /// returns true and writes the value of the DUP instruction lane operand into
11805 /// DupLaneOp.
11806static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
11807 unsigned &DupLaneOp) {
11808 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
11809 "Only possible block sizes for wide DUP are: 16, 32, 64");
11810
11811 if (BlockSize <= VT.getScalarSizeInBits())
11812 return false;
11813 if (BlockSize % VT.getScalarSizeInBits() != 0)
11814 return false;
11815 if (VT.getSizeInBits() % BlockSize != 0)
11816 return false;
11817
11818 size_t SingleVecNumElements = VT.getVectorNumElements();
11819 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
11820 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
11821
11822 // We are looking for masks like
11823 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
11824 // might be replaced by 'undefined'. BlockIndices will eventually contain
11825 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
11826 // for the above examples)
11827 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
11828 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
11829 for (size_t I = 0; I < NumEltsPerBlock; I++) {
11830 int Elt = M[BlockIndex * NumEltsPerBlock + I];
11831 if (Elt < 0)
11832 continue;
11833 // For now we don't support shuffles that use the second operand
11834 if ((unsigned)Elt >= SingleVecNumElements)
11835 return false;
11836 if (BlockElts[I] < 0)
11837 BlockElts[I] = Elt;
11838 else if (BlockElts[I] != Elt)
11839 return false;
11840 }
11841
11842 // We found a candidate block (possibly with some undefs). It must be a
11843 // sequence of consecutive integers starting with a value divisible by
11844 // NumEltsPerBlock with some values possibly replaced by undef-s.
11845
11846 // Find first non-undef element
11847 auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
11848 assert(FirstRealEltIter != BlockElts.end() &&
11849 "Shuffle with all-undefs must have been caught by previous cases, "
11850 "e.g. isSplat()");
11851 if (FirstRealEltIter == BlockElts.end()) {
11852 DupLaneOp = 0;
11853 return true;
11854 }
11855
11856 // Index of FirstRealElt in BlockElts
11857 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
11858
11859 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
11860 return false;
11861 // BlockElts[0] must have the following value if it isn't undef:
11862 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
11863
11864 // Check the first element
11865 if (Elt0 % NumEltsPerBlock != 0)
11866 return false;
11867 // Check that the sequence indeed consists of consecutive integers (modulo
11868 // undefs)
11869 for (size_t I = 0; I < NumEltsPerBlock; I++)
11870 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
11871 return false;
11872
11873 DupLaneOp = Elt0 / NumEltsPerBlock;
11874 return true;
11875}
11876
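// Editor's illustrative sketch (not part of the original AArch64ISelLowering.cpp
// source): for a v4i32 shuffle the mask {2, 3, 2, 3} repeats the 64-bit block
// formed by lanes 2 and 3, so the check above accepts it with BlockSize 64 and
// DupLaneOp 1, i.e. a "DUP v.2d, v.d[1]".
static inline bool wideDUPMaskSketch() {
  unsigned DupLane = 0;
  const int Mask[] = {2, 3, 2, 3};
  return isWideDUPMask(Mask, MVT::v4i32, /*BlockSize=*/64, DupLane) &&
         DupLane == 1;
}
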
11877// check if an EXT instruction can handle the shuffle mask when the
11878// vector sources of the shuffle are different.
11879static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
11880 unsigned &Imm) {
11881 // Look for the first non-undef element.
11882 const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
11883
11884 // Benefit from APInt to handle overflow when calculating the expected element.
11885 unsigned NumElts = VT.getVectorNumElements();
11886 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
11887 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
11888 // The following shuffle indices must be the successive elements after the
11889 // first real element.
11890 bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
11891 return Elt != ExpectedElt++ && Elt != -1;
11892 });
11893 if (FoundWrongElt)
11894 return false;
11895
11896 // The index of an EXT is the first element if it is not UNDEF.
11897 // Watch out for the beginning UNDEFs. The EXT index should be the expected
11898 // value of the first element. E.g.
11899 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
11900 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
11901 // ExpectedElt is the last mask index plus 1.
11902 Imm = ExpectedElt.getZExtValue();
11903
11904 // There are two different cases that require reversing the input vectors.
11905 // For example, for vector <4 x i32> we have the following cases,
11906 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
11907 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
11908 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
11909 // to reverse two input vectors.
11910 if (Imm < NumElts)
11911 ReverseEXT = true;
11912 else
11913 Imm -= NumElts;
11914
11915 return true;
11916}
11917
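// Editor's illustrative sketch (not part of the original AArch64ISelLowering.cpp
// source): the two <4 x i32> cases described above. Leading undefs are
// extrapolated from the first real element, and an immediate below NumElts
// means the two inputs must be swapped.
static inline bool extMaskSketch() {
  bool Reverse1 = false, Reverse2 = false;
  unsigned Imm1 = 0, Imm2 = 0;
  const int Case1[] = {-1, -1, -1, 0}; // treated as <5, 6, 7, 0>
  const int Case2[] = {3, 4, 5, 6};    // last lane of V1, first three of V2
  bool OK1 = isEXTMask(Case1, MVT::v4i32, Reverse1, Imm1);
  bool OK2 = isEXTMask(Case2, MVT::v4i32, Reverse2, Imm2);
  // Expect EXT #1 with the inputs swapped, and EXT #3 of {V1, V2}.
  return OK1 && Reverse1 && Imm1 == 1 && OK2 && !Reverse2 && Imm2 == 3;
}
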
11918/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
11919/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
11920/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
11921static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11922 unsigned NumElts = VT.getVectorNumElements();
11923 if (NumElts % 2 != 0)
11924 return false;
11925 WhichResult = (M[0] == 0 ? 0 : 1);
11926 unsigned Idx = WhichResult * NumElts / 2;
11927 for (unsigned i = 0; i != NumElts; i += 2) {
11928 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
11929 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
11930 return false;
11931 Idx += 1;
11932 }
11933
11934 return true;
11935}
11936
11937/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
11938/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
11939/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
11940static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11941 unsigned Half = VT.getVectorNumElements() / 2;
11942 WhichResult = (M[0] == 0 ? 0 : 1);
11943 for (unsigned j = 0; j != 2; ++j) {
11944 unsigned Idx = WhichResult;
11945 for (unsigned i = 0; i != Half; ++i) {
11946 int MIdx = M[i + j * Half];
11947 if (MIdx >= 0 && (unsigned)MIdx != Idx)
11948 return false;
11949 Idx += 2;
11950 }
11951 }
11952
11953 return true;
11954}
11955
11956/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
11957/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
11958/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
11959static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
11960 unsigned NumElts = VT.getVectorNumElements();
11961 if (NumElts % 2 != 0)
11962 return false;
11963 WhichResult = (M[0] == 0 ? 0 : 1);
11964 for (unsigned i = 0; i < NumElts; i += 2) {
11965 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
11966 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
11967 return false;
11968 }
11969 return true;
11970}
11971
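// Editor's illustrative sketch (not part of the original AArch64ISelLowering.cpp
// source): the canonical single-source ("vector_shuffle v, undef") mask shapes
// accepted by the three helpers above, shown for a 4-element vector with
// WhichResult == 0.
static inline bool singleSourceMaskSketch() {
  unsigned Which = 0;
  const int Zip[] = {0, 0, 1, 1}; // zip1 v, v: each low lane duplicated
  const int Uzp[] = {0, 2, 0, 2}; // uzp1 v, v: the even lanes, twice
  const int Trn[] = {0, 0, 2, 2}; // trn1 v, v: even lanes, each duplicated
  return isZIP_v_undef_Mask(Zip, MVT::v4i32, Which) &&
         isUZP_v_undef_Mask(Uzp, MVT::v4i32, Which) &&
         isTRN_v_undef_Mask(Trn, MVT::v4i32, Which);
}
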
11972static bool isINSMask(ArrayRef<int> M, int NumInputElements,
11973 bool &DstIsLeft, int &Anomaly) {
11974 if (M.size() != static_cast<size_t>(NumInputElements))
11975 return false;
11976
11977 int NumLHSMatch = 0, NumRHSMatch = 0;
11978 int LastLHSMismatch = -1, LastRHSMismatch = -1;
11979
11980 for (int i = 0; i < NumInputElements; ++i) {
11981 if (M[i] == -1) {
11982 ++NumLHSMatch;
11983 ++NumRHSMatch;
11984 continue;
11985 }
11986
11987 if (M[i] == i)
11988 ++NumLHSMatch;
11989 else
11990 LastLHSMismatch = i;
11991
11992 if (M[i] == i + NumInputElements)
11993 ++NumRHSMatch;
11994 else
11995 LastRHSMismatch = i;
11996 }
11997
11998 if (NumLHSMatch == NumInputElements - 1) {
11999 DstIsLeft = true;
12000 Anomaly = LastLHSMismatch;
12001 return true;
12002 } else if (NumRHSMatch == NumInputElements - 1) {
12003 DstIsLeft = false;
12004 Anomaly = LastRHSMismatch;
12005 return true;
12006 }
12007
12008 return false;
12009}
12010
12011static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
12012 if (VT.getSizeInBits() != 128)
12013 return false;
12014
12015 unsigned NumElts = VT.getVectorNumElements();
12016
12017 for (int I = 0, E = NumElts / 2; I != E; I++) {
12018 if (Mask[I] != I)
12019 return false;
12020 }
12021
12022 int Offset = NumElts / 2;
12023 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
12024 if (Mask[I] != I + SplitLHS * Offset)
12025 return false;
12026 }
12027
12028 return true;
12029}
12030
12031 static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
12032 SDLoc DL(Op);
12033 EVT VT = Op.getValueType();
12034 SDValue V0 = Op.getOperand(0);
12035 SDValue V1 = Op.getOperand(1);
12036 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
12037
12038 if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
12039 VT.getVectorElementType() != V1.getValueType().getVectorElementType())
12040 return SDValue();
12041
12042 bool SplitV0 = V0.getValueSizeInBits() == 128;
12043
12044 if (!isConcatMask(Mask, VT, SplitV0))
12045 return SDValue();
12046
12047 EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
12048 if (SplitV0) {
12049 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
12050 DAG.getConstant(0, DL, MVT::i64));
12051 }
12052 if (V1.getValueSizeInBits() == 128) {
12053 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
12054 DAG.getConstant(0, DL, MVT::i64));
12055 }
12056 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
12057}
12058
12059/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
12060/// the specified operations to build the shuffle. ID is the perfect-shuffle
12061 /// ID, V1 and V2 are the original shuffle inputs. PFEntry is the perfect-shuffle
12062 /// table entry and LHS/RHS are the immediate inputs for this stage of the
12063 /// shuffle.
12064 static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1,
12065 SDValue V2, unsigned PFEntry, SDValue LHS,
12066 SDValue RHS, SelectionDAG &DAG,
12067 const SDLoc &dl) {
12068 unsigned OpNum = (PFEntry >> 26) & 0x0F;
12069 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
12070 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
12071
12072 enum {
12073 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
12074 OP_VREV,
12075 OP_VDUP0,
12076 OP_VDUP1,
12077 OP_VDUP2,
12078 OP_VDUP3,
12079 OP_VEXT1,
12080 OP_VEXT2,
12081 OP_VEXT3,
12082 OP_VUZPL, // VUZP, left result
12083 OP_VUZPR, // VUZP, right result
12084 OP_VZIPL, // VZIP, left result
12085 OP_VZIPR, // VZIP, right result
12086 OP_VTRNL, // VTRN, left result
12087 OP_VTRNR, // VTRN, right result
12088 OP_MOVLANE // Move lane. RHSID is the lane to move into
12089 };
12090
12091 if (OpNum == OP_COPY) {
12092 if (LHSID == (1 * 9 + 2) * 9 + 3)
12093 return LHS;
12094 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
12095 return RHS;
12096 }
12097
12098 if (OpNum == OP_MOVLANE) {
12099 // Decompose a PerfectShuffle ID to get the Mask for lane Elt
12100 auto getPFIDLane = [](unsigned ID, int Elt) -> int {
12101 assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
12102 Elt = 3 - Elt;
12103 while (Elt > 0) {
12104 ID /= 9;
12105 Elt--;
12106 }
12107 return (ID % 9 == 8) ? -1 : ID % 9;
12108 };
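// For example, if ID encodes the mask <0,1,4,5>, i.e. ((0*9 + 1)*9 + 4)*9 + 5,
// then getPFIDLane(ID, 2) divides the trailing digit away once and returns 4,
// the value of lane 2.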
12109
12110 // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
12111 // get the lane to move from the PFID, which is always from the
12112 // original vectors (V1 or V2).
12113 SDValue OpLHS = GeneratePerfectShuffle(
12114 LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
12115 EVT VT = OpLHS.getValueType();
12116 assert(RHSID < 8 && "Expected a lane index for RHSID!");
12117 unsigned ExtLane = 0;
12118 SDValue Input;
12119
12120 // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs
12121 // convert into a higher type.
12122 if (RHSID & 0x4) {
12123 int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
12124 if (MaskElt == -1)
12125 MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
12126 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
12127 ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
12128 Input = MaskElt < 2 ? V1 : V2;
12129 if (VT.getScalarSizeInBits() == 16) {
12130 Input = DAG.getBitcast(MVT::v2f32, Input);
12131 OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
12132 } else {
12133 assert(VT.getScalarSizeInBits() == 32 &&
12134 "Expected 16 or 32 bit shuffle elemements");
12135 Input = DAG.getBitcast(MVT::v2f64, Input);
12136 OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
12137 }
12138 } else {
12139 int MaskElt = getPFIDLane(ID, RHSID);
12140 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
12141 ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
12142 Input = MaskElt < 4 ? V1 : V2;
12143 // Be careful about creating illegal types. Use f16 instead of i16.
12144 if (VT == MVT::v4i16) {
12145 Input = DAG.getBitcast(MVT::v4f16, Input);
12146 OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
12147 }
12148 }
12149 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
12150 Input.getValueType().getVectorElementType(),
12151 Input, DAG.getVectorIdxConstant(ExtLane, dl));
12152 SDValue Ins =
12153 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Input.getValueType(), OpLHS,
12154 Ext, DAG.getVectorIdxConstant(RHSID & 0x3, dl));
12155 return DAG.getBitcast(VT, Ins);
12156 }
12157
12158 SDValue OpLHS, OpRHS;
12159 OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
12160 RHS, DAG, dl);
12161 OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
12162 RHS, DAG, dl);
12163 EVT VT = OpLHS.getValueType();
12164
12165 switch (OpNum) {
12166 default:
12167 llvm_unreachable("Unknown shuffle opcode!");
12168 case OP_VREV:
12169 // VREV divides the vector in half and swaps within the half.
12170 if (VT.getVectorElementType() == MVT::i32 ||
12171 VT.getVectorElementType() == MVT::f32)
12172 return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
12173 // vrev <4 x i16> -> REV32
12174 if (VT.getVectorElementType() == MVT::i16 ||
12175 VT.getVectorElementType() == MVT::f16 ||
12176 VT.getVectorElementType() == MVT::bf16)
12177 return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
12178 // vrev <4 x i8> -> REV16
12179 assert(VT.getVectorElementType() == MVT::i8);
12180 return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
12181 case OP_VDUP0:
12182 case OP_VDUP1:
12183 case OP_VDUP2:
12184 case OP_VDUP3: {
12185 EVT EltTy = VT.getVectorElementType();
12186 unsigned Opcode;
12187 if (EltTy == MVT::i8)
12188 Opcode = AArch64ISD::DUPLANE8;
12189 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
12190 Opcode = AArch64ISD::DUPLANE16;
12191 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
12192 Opcode = AArch64ISD::DUPLANE32;
12193 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
12194 Opcode = AArch64ISD::DUPLANE64;
12195 else
12196 llvm_unreachable("Invalid vector element type?");
12197
12198 if (VT.getSizeInBits() == 64)
12199 OpLHS = WidenVector(OpLHS, DAG);
12200 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
12201 return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
12202 }
12203 case OP_VEXT1:
12204 case OP_VEXT2:
12205 case OP_VEXT3: {
12206 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
12207 return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
12208 DAG.getConstant(Imm, dl, MVT::i32));
12209 }
12210 case OP_VUZPL:
12211 return DAG.getNode(AArch64ISD::UZP1, dl, VT, OpLHS, OpRHS);
12212 case OP_VUZPR:
12213 return DAG.getNode(AArch64ISD::UZP2, dl, VT, OpLHS, OpRHS);
12214 case OP_VZIPL:
12215 return DAG.getNode(AArch64ISD::ZIP1, dl, VT, OpLHS, OpRHS);
12216 case OP_VZIPR:
12217 return DAG.getNode(AArch64ISD::ZIP2, dl, VT, OpLHS, OpRHS);
12218 case OP_VTRNL:
12219 return DAG.getNode(AArch64ISD::TRN1, dl, VT, OpLHS, OpRHS);
12220 case OP_VTRNR:
12221 return DAG.getNode(AArch64ISD::TRN2, dl, VT, OpLHS, OpRHS);
12222 }
12223}
12224
12225 static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
12226 SelectionDAG &DAG) {
12227 // Check to see if we can use the TBL instruction.
12228 SDValue V1 = Op.getOperand(0);
12229 SDValue V2 = Op.getOperand(1);
12230 SDLoc DL(Op);
12231
12232 EVT EltVT = Op.getValueType().getVectorElementType();
12233 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
12234
12235 bool Swap = false;
12236 if (V1.isUndef() || isZerosVector(V1.getNode())) {
12237 std::swap(V1, V2);
12238 Swap = true;
12239 }
12240
12241 // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
12242 // out of range values with 0s. We do need to make sure that any out-of-range
12243 // values are really out-of-range for a v16i8 vector.
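// For example, a v8i8 shuffle of (V1, undef) with mask <0,3,5,6,8,11,13,14>
// builds the index bytes {0,3,5,6,255,255,255,255}: the last four indices
// refer to the undef V2, so they are forced to 255 and the tbl1 writes zeros
// for those lanes.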
12244 bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode());
12245 MVT IndexVT = MVT::v8i8;
12246 unsigned IndexLen = 8;
12247 if (Op.getValueSizeInBits() == 128) {
12248 IndexVT = MVT::v16i8;
12249 IndexLen = 16;
12250 }
12251
12252 SmallVector<SDValue, 8> TBLMask;
12253 for (int Val : ShuffleMask) {
12254 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
12255 unsigned Offset = Byte + Val * BytesPerElt;
12256 if (Swap)
12257 Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
12258 if (IsUndefOrZero && Offset >= IndexLen)
12259 Offset = 255;
12260 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
12261 }
12262 }
12263
12264 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
12265 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
12266
12267 SDValue Shuffle;
12268 if (IsUndefOrZero) {
12269 if (IndexLen == 8)
12270 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
12271 Shuffle = DAG.getNode(
12272 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
12273 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
12274 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
12275 } else {
12276 if (IndexLen == 8) {
12277 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
12278 Shuffle = DAG.getNode(
12279 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
12280 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
12281 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
12282 } else {
12283 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
12284 // cannot currently represent the register constraints on the input
12285 // table registers.
12286 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
12287 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
12288 // IndexLen));
12289 Shuffle = DAG.getNode(
12290 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
12291 DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
12292 V2Cst,
12293 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
12294 }
12295 }
12296 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
12297}
12298
12299static unsigned getDUPLANEOp(EVT EltType) {
12300 if (EltType == MVT::i8)
12301 return AArch64ISD::DUPLANE8;
12302 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
12303 return AArch64ISD::DUPLANE16;
12304 if (EltType == MVT::i32 || EltType == MVT::f32)
12305 return AArch64ISD::DUPLANE32;
12306 if (EltType == MVT::i64 || EltType == MVT::f64)
12307 return AArch64ISD::DUPLANE64;
12308
12309 llvm_unreachable("Invalid vector element type?");
12310}
12311
12312static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
12313 unsigned Opcode, SelectionDAG &DAG) {
12314 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
12315 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
12316 // Match: dup (bitcast (extract_subv X, C)), LaneC
12317 if (BitCast.getOpcode() != ISD::BITCAST ||
12318 BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
12319 return false;
12320
12321 // The extract index must align in the destination type. That may not
12322 // happen if the bitcast is from narrow to wide type.
12323 SDValue Extract = BitCast.getOperand(0);
12324 unsigned ExtIdx = Extract.getConstantOperandVal(1);
12325 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
12326 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
12327 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
12328 if (ExtIdxInBits % CastedEltBitWidth != 0)
12329 return false;
12330
12331 // Can't handle cases where vector size is not 128-bit
12332 if (!Extract.getOperand(0).getValueType().is128BitVector())
12333 return false;
12334
12335 // Update the lane value by offsetting with the scaled extract index.
12336 LaneC += ExtIdxInBits / CastedEltBitWidth;
12337
12338 // Determine the casted vector type of the wide vector input.
12339 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
12340 // Examples:
12341 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
12342 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
12343 unsigned SrcVecNumElts =
12344 Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
12345 CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
12346 SrcVecNumElts);
12347 return true;
12348 };
12349 MVT CastVT;
12350 if (getScaledOffsetDup(V, Lane, CastVT)) {
12351 V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
12352 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
12353 V.getOperand(0).getValueType().is128BitVector()) {
12354 // The lane is incremented by the index of the extract.
12355 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
12356 Lane += V.getConstantOperandVal(1);
12357 V = V.getOperand(0);
12358 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
12359 // The lane is decremented if we are splatting from the 2nd operand.
12360 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
12361 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
12362 Lane -= Idx * VT.getVectorNumElements() / 2;
12363 V = WidenVector(V.getOperand(Idx), DAG);
12364 } else if (VT.getSizeInBits() == 64) {
12365 // Widen the operand to 128-bit register with undef.
12366 V = WidenVector(V, DAG);
12367 }
12368 return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
12369}
12370
12371// Return true if we can get a new shuffle mask by checking the parameter mask
12372// array to test whether every two adjacent mask values are continuous and
12373// starting from an even number.
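// For example, the v8i16 mask <4,5,-1,-1,2,3,0,1> can be halved to
// <2,-1,1,0>, which the caller below then applies to the inputs bitcast to a
// vector with twice-as-wide elements.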
12374 static bool isWideTypeMask(ArrayRef<int> M, EVT VT,
12375 SmallVectorImpl<int> &NewMask) {
12376 unsigned NumElts = VT.getVectorNumElements();
12377 if (NumElts % 2 != 0)
12378 return false;
12379
12380 NewMask.clear();
12381 for (unsigned i = 0; i < NumElts; i += 2) {
12382 int M0 = M[i];
12383 int M1 = M[i + 1];
12384
12385 // If both elements are undef, new mask is undef too.
12386 if (M0 == -1 && M1 == -1) {
12387 NewMask.push_back(-1);
12388 continue;
12389 }
12390
12391 if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) {
12392 NewMask.push_back(M1 / 2);
12393 continue;
12394 }
12395
12396 if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) {
12397 NewMask.push_back(M0 / 2);
12398 continue;
12399 }
12400
12401 NewMask.clear();
12402 return false;
12403 }
12404
12405 assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!");
12406 return true;
12407}
12408
12409// Try to widen element type to get a new mask value for a better permutation
12410// sequence, so that we can use NEON shuffle instructions, such as zip1/2,
12411// UZP1/2, TRN1/2, REV, INS, etc.
12412// For example:
12413// shufflevector <4 x i32> %a, <4 x i32> %b,
12414// <4 x i32> <i32 6, i32 7, i32 2, i32 3>
12415// is equivalent to:
12416// shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
12417// Finally, we can get:
12418// mov v0.d[0], v1.d[1]
12419 static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
12420 SDLoc DL(Op);
12421 EVT VT = Op.getValueType();
12422 EVT ScalarVT = VT.getVectorElementType();
12423 unsigned ElementSize = ScalarVT.getFixedSizeInBits();
12424 SDValue V0 = Op.getOperand(0);
12425 SDValue V1 = Op.getOperand(1);
12426 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
12427
12428 // If combining adjacent elements, like two i16's -> i32, two i32's -> i64 ...
12429 // We need to make sure the wider element type is legal. Thus, ElementSize
12430 // should not be larger than 32 bits, and the i1 type should also be excluded.
12431 if (ElementSize > 32 || ElementSize == 1)
12432 return SDValue();
12433
12434 SmallVector<int, 8> NewMask;
12435 if (isWideTypeMask(Mask, VT, NewMask)) {
12436 MVT NewEltVT = VT.isFloatingPoint()
12437 ? MVT::getFloatingPointVT(ElementSize * 2)
12438 : MVT::getIntegerVT(ElementSize * 2);
12439 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
12440 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
12441 V0 = DAG.getBitcast(NewVT, V0);
12442 V1 = DAG.getBitcast(NewVT, V1);
12443 return DAG.getBitcast(VT,
12444 DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
12445 }
12446 }
12447
12448 return SDValue();
12449}
12450
12451// Try to fold shuffle (tbl2, tbl2) into a single tbl4.
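// The two tbl2 table pairs become the four tbl4 table registers, so mask
// bytes taken from the second tbl2 are rebased by 32 (the size of the first
// tbl2's two 16-byte tables), while bytes taken from the first tbl2 are
// reused as-is.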
12452 static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
12453 ArrayRef<int> ShuffleMask,
12454 SelectionDAG &DAG) {
12455 SDValue Tbl1 = Op->getOperand(0);
12456 SDValue Tbl2 = Op->getOperand(1);
12457 SDLoc dl(Op);
12458 SDValue Tbl2ID =
12459 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, dl, MVT::i64);
12460
12461 EVT VT = Op.getValueType();
12462 if (Tbl1->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
12463 Tbl1->getOperand(0) != Tbl2ID ||
12464 Tbl2->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
12465 Tbl2->getOperand(0) != Tbl2ID)
12466 return SDValue();
12467
12468 if (Tbl1->getValueType(0) != MVT::v16i8 ||
12469 Tbl2->getValueType(0) != MVT::v16i8)
12470 return SDValue();
12471
12472 SDValue Mask1 = Tbl1->getOperand(3);
12473 SDValue Mask2 = Tbl2->getOperand(3);
12474 SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
12475 for (unsigned I = 0; I < 16; I++) {
12476 if (ShuffleMask[I] < 16)
12477 TBLMaskParts[I] = Mask1->getOperand(ShuffleMask[I]);
12478 else {
12479 auto *C =
12480 dyn_cast<ConstantSDNode>(Mask2->getOperand(ShuffleMask[I] - 16));
12481 if (!C)
12482 return SDValue();
12483 TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, dl, MVT::i32);
12484 }
12485 }
12486
12487 SDValue TBLMask = DAG.getBuildVector(VT, dl, TBLMaskParts);
12488 SDValue ID =
12489 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, dl, MVT::i64);
12490
12491 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8,
12492 {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
12493 Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
12494}
12495
12496// Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros,
12497// but we don't have an appropriate instruction,
12498// so custom-lower it as ZIP1-with-zeros.
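// For example, zero-extending the low half of v16i8 <a,b,...,p> to v8i16:
// zip1(src, zeros) produces the v16i8 <a,0,b,0,...,h,0>, and bitcasting that
// to v8i16 yields <a,b,...,h> zero-extended (little-endian lane layout).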
12499SDValue
12500AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
12501 SelectionDAG &DAG) const {
12502 SDLoc dl(Op);
12503 EVT VT = Op.getValueType();
12504 SDValue SrcOp = Op.getOperand(0);
12505 EVT SrcVT = SrcOp.getValueType();
12506 assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
12507 "Unexpected extension factor.");
12508 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
12509 // FIXME: support multi-step zipping?
12510 if (Scale != 2)
12511 return SDValue();
12512 SDValue Zeros = DAG.getConstant(0, dl, SrcVT);
12513 return DAG.getBitcast(VT,
12514 DAG.getNode(AArch64ISD::ZIP1, dl, SrcVT, SrcOp, Zeros));
12515}
12516
12517SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
12518 SelectionDAG &DAG) const {
12519 SDLoc dl(Op);
12520 EVT VT = Op.getValueType();
12521
12522 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
12523
12524 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
12525 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
12526
12527 // Convert shuffles that are directly supported on NEON to target-specific
12528 // DAG nodes, instead of keeping them as shuffles and matching them again
12529 // during code selection. This is more efficient and avoids the possibility
12530 // of inconsistencies between legalization and selection.
12531 ArrayRef<int> ShuffleMask = SVN->getMask();
12532
12533 SDValue V1 = Op.getOperand(0);
12534 SDValue V2 = Op.getOperand(1);
12535
12536 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
12537 assert(ShuffleMask.size() == VT.getVectorNumElements() &&
12538 "Unexpected VECTOR_SHUFFLE mask size!");
12539
12540 if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
12541 return Res;
12542
12543 if (SVN->isSplat()) {
12544 int Lane = SVN->getSplatIndex();
12545 // If this is undef splat, generate it via "just" vdup, if possible.
12546 if (Lane == -1)
12547 Lane = 0;
12548
12549 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
12550 return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
12551 V1.getOperand(0));
12552 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
12553 // constant. If so, we can just reference the lane's definition directly.
12554 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
12555 !isa<ConstantSDNode>(V1.getOperand(Lane)))
12556 return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
12557
12558 // Otherwise, duplicate from the lane of the input vector.
12559 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
12560 return constructDup(V1, Lane, dl, VT, Opcode, DAG);
12561 }
12562
12563 // Check if the mask matches a DUP for a wider element
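// For example, the v8i16 mask <2,3,2,3,2,3,2,3> repeats the 32-bit pair at
// lanes {2,3}, so it should lower as DUPLANE32 of lane 1 after bitcasting V1
// to v4i32 (assuming isWideDUPMask recognises this as a 32-bit wide DUP).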
12564 for (unsigned LaneSize : {64U, 32U, 16U}) {
12565 unsigned Lane = 0;
12566 if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
12567 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
12568 : LaneSize == 32 ? AArch64ISD::DUPLANE32
12569 : AArch64ISD::DUPLANE16;
12570 // Cast V1 to an integer vector with required lane size
12571 MVT NewEltTy = MVT::getIntegerVT(LaneSize);
12572 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
12573 MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
12574 V1 = DAG.getBitcast(NewVecTy, V1);
12575 // Construct the DUP instruction
12576 V1 = constructDup(V1, Lane, dl, NewVecTy, Opcode, DAG);
12577 // Cast back to the original type
12578 return DAG.getBitcast(VT, V1);
12579 }
12580 }
12581
12582 unsigned NumElts = VT.getVectorNumElements();
12583 unsigned EltSize = VT.getScalarSizeInBits();
12584 if (isREVMask(ShuffleMask, EltSize, NumElts, 64))
12585 return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
12586 if (isREVMask(ShuffleMask, EltSize, NumElts, 32))
12587 return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
12588 if (isREVMask(ShuffleMask, EltSize, NumElts, 16))
12589 return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
12590
12591 if (((NumElts == 8 && EltSize == 16) || (NumElts == 16 && EltSize == 8)) &&
12592 ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size())) {
12593 SDValue Rev = DAG.getNode(AArch64ISD::REV64, dl, VT, V1);
12594 return DAG.getNode(AArch64ISD::EXT, dl, VT, Rev, Rev,
12595 DAG.getConstant(8, dl, MVT::i32));
12596 }
12597
12598 bool ReverseEXT = false;
12599 unsigned Imm;
12600 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
12601 if (ReverseEXT)
12602 std::swap(V1, V2);
12603 Imm *= getExtFactor(V1);
12604 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
12605 DAG.getConstant(Imm, dl, MVT::i32));
12606 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
12607 Imm *= getExtFactor(V1);
12608 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
12609 DAG.getConstant(Imm, dl, MVT::i32));
12610 }
12611
12612 unsigned WhichResult;
12613 if (isZIPMask(ShuffleMask, NumElts, WhichResult)) {
12614 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
12615 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
12616 }
12617 if (isUZPMask(ShuffleMask, NumElts, WhichResult)) {
12618 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
12619 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
12620 }
12621 if (isTRNMask(ShuffleMask, NumElts, WhichResult)) {
12622 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
12623 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
12624 }
12625
12626 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
12627 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
12628 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
12629 }
12630 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
12631 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
12632 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
12633 }
12634 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
12635 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
12636 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
12637 }
12638
12639 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
12640 return Concat;
12641
12642 bool DstIsLeft;
12643 int Anomaly;
12644 int NumInputElements = V1.getValueType().getVectorNumElements();
12645 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
12646 SDValue DstVec = DstIsLeft ? V1 : V2;
12647 SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);
12648
12649 SDValue SrcVec = V1;
12650 int SrcLane = ShuffleMask[Anomaly];
12651 if (SrcLane >= NumInputElements) {
12652 SrcVec = V2;
12653 SrcLane -= NumElts;
12654 }
12655 SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);
12656
12657 EVT ScalarVT = VT.getVectorElementType();
12658
12659 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
12660 ScalarVT = MVT::i32;
12661
12662 return DAG.getNode(
12663 ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
12664 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
12665 DstLaneV);
12666 }
12667
12668 if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
12669 return NewSD;
12670
12671 // If the shuffle is not directly supported and it has 4 elements, use
12672 // the PerfectShuffle-generated table to synthesize it from other shuffles.
12673 if (NumElts == 4) {
12674 unsigned PFIndexes[4];
12675 for (unsigned i = 0; i != 4; ++i) {
12676 if (ShuffleMask[i] < 0)
12677 PFIndexes[i] = 8;
12678 else
12679 PFIndexes[i] = ShuffleMask[i];
12680 }
12681
12682 // Compute the index in the perfect shuffle table.
12683 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
12684 PFIndexes[2] * 9 + PFIndexes[3];
12685 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
12686 return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
12687 dl);
12688 }
12689
12690 return GenerateTBL(Op, ShuffleMask, DAG);
12691}
12692
12693SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
12694 SelectionDAG &DAG) const {
12695 EVT VT = Op.getValueType();
12696
12697 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
12698 return LowerToScalableOp(Op, DAG);
12699
12700 assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
12701 "Unexpected vector type!");
12702
12703 // We can handle the constant cases during isel.
12704 if (isa<ConstantSDNode>(Op.getOperand(0)))
12705 return Op;
12706
12707 // There isn't a natural way to handle the general i1 case, so we use some
12708 // trickery with whilelo.
12709 SDLoc DL(Op);
12710 SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
12711 SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
12712 DAG.getValueType(MVT::i1));
12713 SDValue ID =
12714 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
12715 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
12716 if (VT == MVT::nxv1i1)
12717 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
12718 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
12719 Zero, SplatVal),
12720 Zero);
12721 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal);
12722}
12723
12724SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
12725 SelectionDAG &DAG) const {
12726 SDLoc DL(Op);
12727
12728 EVT VT = Op.getValueType();
12729 if (!isTypeLegal(VT) || !VT.isScalableVector())
12730 return SDValue();
12731
12732 // Current lowering only supports the SVE-ACLE types.
12733 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
12734 return SDValue();
12735
12736 // The DUPQ operation is independent of element type so normalise to i64s.
12737 SDValue Idx128 = Op.getOperand(2);
12738
12739 // DUPQ can be used when idx is in range.
12740 auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
12741 if (CIdx && (CIdx->getZExtValue() <= 3)) {
12742 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
12743 return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI);
12744 }
12745
12746 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
12747
12748 // The ACLE says this must produce the same result as:
12749 // svtbl(data, svadd_x(svptrue_b64(),
12750 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
12751 // index * 2))
12752 SDValue One = DAG.getConstant(1, DL, MVT::i64);
12753 SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
12754
12755 // create the vector 0,1,0,1,...
12756 SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
12757 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
12758
12759 // create the vector idx64,idx64+1,idx64,idx64+1,...
12760 SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
12761 SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
12762 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
12763
12764 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
12765 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
12766 return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
12767}
12768
12769
12770static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
12771 APInt &UndefBits) {
12772 EVT VT = BVN->getValueType(0);
12773 APInt SplatBits, SplatUndef;
12774 unsigned SplatBitSize;
12775 bool HasAnyUndefs;
12776 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
12777 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
12778
12779 for (unsigned i = 0; i < NumSplats; ++i) {
12780 CnstBits <<= SplatBitSize;
12781 UndefBits <<= SplatBitSize;
12782 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
12783 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
12784 }
12785
12786 return true;
12787 }
12788
12789 return false;
12790}
12791
12792// Try 64-bit splatted SIMD immediate.
12793static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12794 const APInt &Bits) {
12795 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12796 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12797 EVT VT = Op.getValueType();
12798 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
12799
12800 if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
12801 Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
12802
12803 SDLoc dl(Op);
12804 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12805 DAG.getConstant(Value, dl, MVT::i32));
12806 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12807 }
12808 }
12809
12810 return SDValue();
12811}
12812
12813// Try 32-bit splatted SIMD immediate.
12814static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12815 const APInt &Bits,
12816 const SDValue *LHS = nullptr) {
12817 EVT VT = Op.getValueType();
12818 if (VT.isFixedLengthVector() &&
12819 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
12820 return SDValue();
12821
12822 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12823 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12824 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
12825 bool isAdvSIMDModImm = false;
12826 uint64_t Shift;
12827
12828 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
12829 Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
12830 Shift = 0;
12831 }
12832 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
12833 Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
12834 Shift = 8;
12835 }
12836 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
12837 Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
12838 Shift = 16;
12839 }
12840 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
12841 Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
12842 Shift = 24;
12843 }
12844
12845 if (isAdvSIMDModImm) {
12846 SDLoc dl(Op);
12847 SDValue Mov;
12848
12849 if (LHS)
12850 Mov = DAG.getNode(NewOp, dl, MovTy,
12851 DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS),
12852 DAG.getConstant(Value, dl, MVT::i32),
12853 DAG.getConstant(Shift, dl, MVT::i32));
12854 else
12855 Mov = DAG.getNode(NewOp, dl, MovTy,
12856 DAG.getConstant(Value, dl, MVT::i32),
12857 DAG.getConstant(Shift, dl, MVT::i32));
12858
12859 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12860 }
12861 }
12862
12863 return SDValue();
12864}
12865
12866// Try 16-bit splatted SIMD immediate.
12867static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12868 const APInt &Bits,
12869 const SDValue *LHS = nullptr) {
12870 EVT VT = Op.getValueType();
12871 if (VT.isFixedLengthVector() &&
12872 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
12873 return SDValue();
12874
12875 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12876 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12877 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
12878 bool isAdvSIMDModImm = false;
12879 uint64_t Shift;
12880
12881 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
12882 Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
12883 Shift = 0;
12884 }
12885 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
12886 Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
12887 Shift = 8;
12888 }
12889
12890 if (isAdvSIMDModImm) {
12891 SDLoc dl(Op);
12892 SDValue Mov;
12893
12894 if (LHS)
12895 Mov = DAG.getNode(NewOp, dl, MovTy,
12896 DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS),
12897 DAG.getConstant(Value, dl, MVT::i32),
12898 DAG.getConstant(Shift, dl, MVT::i32));
12899 else
12900 Mov = DAG.getNode(NewOp, dl, MovTy,
12901 DAG.getConstant(Value, dl, MVT::i32),
12902 DAG.getConstant(Shift, dl, MVT::i32));
12903
12904 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12905 }
12906 }
12907
12908 return SDValue();
12909}
12910
12911// Try 32-bit splatted SIMD immediate with shifted ones.
12912 static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
12913 SelectionDAG &DAG, const APInt &Bits) {
12914 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12915 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12916 EVT VT = Op.getValueType();
12917 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
12918 bool isAdvSIMDModImm = false;
12919 uint64_t Shift;
12920
12921 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
12922 Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
12923 Shift = 264;
12924 }
12925 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
12926 Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
12927 Shift = 272;
12928 }
12929
12930 if (isAdvSIMDModImm) {
12931 SDLoc dl(Op);
12932 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12933 DAG.getConstant(Value, dl, MVT::i32),
12934 DAG.getConstant(Shift, dl, MVT::i32));
12935 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12936 }
12937 }
12938
12939 return SDValue();
12940}
12941
12942// Try 8-bit splatted SIMD immediate.
12943static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12944 const APInt &Bits) {
12945 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12946 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12947 EVT VT = Op.getValueType();
12948 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
12949
12949
12950 if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
12951 Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
12952
12953 SDLoc dl(Op);
12954 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12955 DAG.getConstant(Value, dl, MVT::i32));
12956 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12957 }
12958 }
12959
12960 return SDValue();
12961}
12962
12963// Try FP splatted SIMD immediate.
12964static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
12965 const APInt &Bits) {
12966 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
12967 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
12968 EVT VT = Op.getValueType();
12969 bool isWide = (VT.getSizeInBits() == 128);
12970 MVT MovTy;
12971 bool isAdvSIMDModImm = false;
12972
12973 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
12974 Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
12975 MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
12976 }
12977 else if (isWide &&
12978 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
12979 Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
12980 MovTy = MVT::v2f64;
12981 }
12982
12983 if (isAdvSIMDModImm) {
12984 SDLoc dl(Op);
12985 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
12986 DAG.getConstant(Value, dl, MVT::i32));
12987 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
12988 }
12989 }
12990
12991 return SDValue();
12992}
12993
12994// Specialized code to quickly find if PotentialBVec is a BuildVector that
12995// consists of only the same constant int value, returned in reference arg
12996// ConstVal
12997static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
12998 uint64_t &ConstVal) {
12999 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
13000 if (!Bvec)
13001 return false;
13002 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
13003 if (!FirstElt)
13004 return false;
13005 EVT VT = Bvec->getValueType(0);
13006 unsigned NumElts = VT.getVectorNumElements();
13007 for (unsigned i = 1; i < NumElts; ++i)
13008 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
13009 return false;
13010 ConstVal = FirstElt->getZExtValue();
13011 return true;
13012}
13013
13014 static bool isAllInactivePredicate(SDValue N) {
13015 // Look through cast.
13016 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
13017 N = N.getOperand(0);
13018
13019 return ISD::isConstantSplatVectorAllZeros(N.getNode());
13020}
13021
13022 static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
13023 unsigned NumElts = N.getValueType().getVectorMinNumElements();
13024
13025 // Look through cast.
13026 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
13027 N = N.getOperand(0);
13028 // When reinterpreting from a type with fewer elements the "new" elements
13029 // are not active, so bail if they're likely to be used.
13030 if (N.getValueType().getVectorMinNumElements() < NumElts)
13031 return false;
13032 }
13033
13034 if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
13035 return true;
13036
13037 // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
13038 // or smaller than the implicit element type represented by N.
13039 // NOTE: A larger element count implies a smaller element type.
13040 if (N.getOpcode() == AArch64ISD::PTRUE &&
13041 N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
13042 return N.getValueType().getVectorMinNumElements() >= NumElts;
13043
13044 // If we're compiling for a specific vector-length, we can check if the
13045 // pattern's VL equals that of the scalable vector at runtime.
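// For example, when compiling with -msve-vector-bits=256 both the minimum
// and maximum SVE sizes are 256, so VScale == 2 and a PTRUE whose pattern
// yields 8 elements is all-active for an nxv4i1 predicate (4 * 2 == 8).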
13046 if (N.getOpcode() == AArch64ISD::PTRUE) {
13047 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
13048 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
13049 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
13050 if (MaxSVESize && MinSVESize == MaxSVESize) {
13051 unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
13052 unsigned PatNumElts =
13053 getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0));
13054 return PatNumElts == (NumElts * VScale);
13055 }
13056 }
13057
13058 return false;
13059}
13060
13061// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
13062// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
13063// BUILD_VECTORs with constant element C1, C2 is a constant, and:
13064// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
13065// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
13066// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
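// For example, with v8i8 elements and C2 == 3 the SLI case needs
// C1 == ~(0xff << 3) == 0x07, so (or (and X, 0x07), (shl Y, 3)) becomes
// (SLI X, Y, 3): each lane keeps the low 3 bits of X and takes the rest from
// Y << 3.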
13067 static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
13068 EVT VT = N->getValueType(0);
13069
13070 if (!VT.isVector())
13071 return SDValue();
13072
13073 SDLoc DL(N);
13074
13075 SDValue And;
13076 SDValue Shift;
13077
13078 SDValue FirstOp = N->getOperand(0);
13079 unsigned FirstOpc = FirstOp.getOpcode();
13080 SDValue SecondOp = N->getOperand(1);
13081 unsigned SecondOpc = SecondOp.getOpcode();
13082
13083 // Is one of the operands an AND or a BICi? The AND may have been optimised to
13084 // a BICi in order to use an immediate instead of a register.
13085 // Is the other operand a shl or lshr? This will have been turned into:
13086 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift
13087 // or (AArch64ISD::SHL_PRED || AArch64ISD::SRL_PRED) mask, vector, #shiftVec.
13088 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
13089 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR ||
13090 SecondOpc == AArch64ISD::SHL_PRED ||
13091 SecondOpc == AArch64ISD::SRL_PRED)) {
13092 And = FirstOp;
13093 Shift = SecondOp;
13094
13095 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
13096 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR ||
13097 FirstOpc == AArch64ISD::SHL_PRED ||
13098 FirstOpc == AArch64ISD::SRL_PRED)) {
13099 And = SecondOp;
13100 Shift = FirstOp;
13101 } else
13102 return SDValue();
13103
13104 bool IsAnd = And.getOpcode() == ISD::AND;
13105 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR ||
13106 Shift.getOpcode() == AArch64ISD::SRL_PRED;
13107 bool ShiftHasPredOp = Shift.getOpcode() == AArch64ISD::SHL_PRED ||
13108 Shift.getOpcode() == AArch64ISD::SRL_PRED;
13109
13110 // Is the shift amount constant and are all lanes active?
13111 uint64_t C2;
13112 if (ShiftHasPredOp) {
13113 if (!isAllActivePredicate(DAG, Shift.getOperand(0)))
13114 return SDValue();
13115 APInt C;
13116 if (!ISD::isConstantSplatVector(Shift.getOperand(2).getNode(), C))
13117 return SDValue();
13118 C2 = C.getZExtValue();
13119 } else if (ConstantSDNode *C2node =
13120 dyn_cast<ConstantSDNode>(Shift.getOperand(1)))
13121 C2 = C2node->getZExtValue();
13122 else
13123 return SDValue();
13124
13125 APInt C1AsAPInt;
13126 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
13127 if (IsAnd) {
13128 // Is the and mask vector all constant?
13129 if (!ISD::isConstantSplatVector(And.getOperand(1).getNode(), C1AsAPInt))
13130 return SDValue();
13131 } else {
13132 // Reconstruct the corresponding AND immediate from the two BICi immediates.
13133 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
13134 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
13135 assert(C1nodeImm && C1nodeShift);
13136 C1AsAPInt = ~(C1nodeImm->getAPIntValue() << C1nodeShift->getAPIntValue());
13137 C1AsAPInt = C1AsAPInt.zextOrTrunc(ElemSizeInBits);
13138 }
13139
13140 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
13141 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
13142 // how much one can shift elements of a particular size?
13143 if (C2 > ElemSizeInBits)
13144 return SDValue();
13145
13146 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
13147 : APInt::getLowBitsSet(ElemSizeInBits, C2);
13148 if (C1AsAPInt != RequiredC1)
13149 return SDValue();
13150
13151 SDValue X = And.getOperand(0);
13152 SDValue Y = ShiftHasPredOp ? Shift.getOperand(1) : Shift.getOperand(0);
13153 SDValue Imm = ShiftHasPredOp ? DAG.getTargetConstant(C2, DL, MVT::i32)
13154 : Shift.getOperand(1);
13155
13156 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
13157 SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Imm);
13158
13159 LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
13160 LLVM_DEBUG(N->dump(&DAG));
13161 LLVM_DEBUG(dbgs() << "into: \n");
13162 LLVM_DEBUG(ResultSLI->dump(&DAG));
13163
13164 ++NumShiftInserts;
13165 return ResultSLI;
13166}
13167
13168SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
13169 SelectionDAG &DAG) const {
13170 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
13171 !Subtarget->isNeonAvailable()))
13172 return LowerToScalableOp(Op, DAG);
13173
13174 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
13175 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
13176 return Res;
13177
13178 EVT VT = Op.getValueType();
13179 if (VT.isScalableVector())
13180 return Op;
13181
13182 SDValue LHS = Op.getOperand(0);
13183 BuildVectorSDNode *BVN =
13184 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
13185 if (!BVN) {
13186 // OR commutes, so try swapping the operands.
13187 LHS = Op.getOperand(1);
13188 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
13189 }
13190 if (!BVN)
13191 return Op;
13192
13193 APInt DefBits(VT.getSizeInBits(), 0);
13194 APInt UndefBits(VT.getSizeInBits(), 0);
13195 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
13196 SDValue NewOp;
13197
13198 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
13199 DefBits, &LHS)) ||
13200 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
13201 DefBits, &LHS)))
13202 return NewOp;
13203
13204 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
13205 UndefBits, &LHS)) ||
13206 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
13207 UndefBits, &LHS)))
13208 return NewOp;
13209 }
13210
13211 // We can always fall back to a non-immediate OR.
13212 return Op;
13213}
13214
13215// Normalize the operands of BUILD_VECTOR. The value of constant operands will
13216// be truncated to fit element width.
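// For example, in a v8i8 BUILD_VECTOR an i32 operand holding 0x1ff is
// replaced by the i32 constant 0xff, keeping only the bits that fit in the
// 8-bit element.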
13217 static SDValue NormalizeBuildVector(SDValue Op,
13218 SelectionDAG &DAG) {
13219 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
13220 SDLoc dl(Op);
13221 EVT VT = Op.getValueType();
13222 EVT EltTy= VT.getVectorElementType();
13223
13224 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
13225 return Op;
13226
13227 SmallVector<SDValue, 16> Ops;
13228 for (SDValue Lane : Op->ops()) {
13229 // For integer vectors, type legalization would have promoted the
13230 // operands already. Otherwise, if Op is a floating-point splat
13231 // (with operands cast to integers), then the only possibilities
13232 // are constants and UNDEFs.
13233 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
13234 APInt LowBits(EltTy.getSizeInBits(),
13235 CstLane->getZExtValue());
13236 Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32);
13237 } else if (Lane.getNode()->isUndef()) {
13238 Lane = DAG.getUNDEF(MVT::i32);
13239 } else {
13240 assert(Lane.getValueType() == MVT::i32 &&
13241 "Unexpected BUILD_VECTOR operand type");
13242 }
13243 Ops.push_back(Lane);
13244 }
13245 return DAG.getBuildVector(VT, dl, Ops);
13246}
13247
13248 static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
13249 const AArch64Subtarget *ST) {
13250 EVT VT = Op.getValueType();
13251 assert((VT.getSizeInBits() == 64 || VT.getSizeInBits() == 128) &&
13252 "Expected a legal NEON vector");
13253
13254 APInt DefBits(VT.getSizeInBits(), 0);
13255 APInt UndefBits(VT.getSizeInBits(), 0);
13256 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
13257 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
13258 auto TryMOVIWithBits = [&](APInt DefBits) {
13259 SDValue NewOp;
13260 if ((NewOp =
13261 tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
13262 (NewOp =
13263 tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
13264 (NewOp =
13265 tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
13266 (NewOp =
13267 tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
13268 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
13269 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
13270 return NewOp;
13271
13272 APInt NotDefBits = ~DefBits;
13273 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG,
13274 NotDefBits)) ||
13275 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG,
13276 NotDefBits)) ||
13277 (NewOp =
13278 tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, NotDefBits)))
13279 return NewOp;
13280 return SDValue();
13281 };
13282 if (SDValue R = TryMOVIWithBits(DefBits))
13283 return R;
13284 if (SDValue R = TryMOVIWithBits(UndefBits))
13285 return R;
13286
13287 // See if a fneg of the constant can be materialized with a MOVI, etc
13288 auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
13289 // FNegate each sub-element of the constant
13290 assert(VT.getSizeInBits() % FVT.getScalarSizeInBits() == 0);
13291 APInt Neg = APInt::getHighBitsSet(FVT.getSizeInBits(), 1)
13292 .zext(VT.getSizeInBits());
13293 APInt NegBits(VT.getSizeInBits(), 0);
13294 unsigned NumElts = VT.getSizeInBits() / FVT.getScalarSizeInBits();
13295 for (unsigned i = 0; i < NumElts; i++)
13296 NegBits |= Neg << (FVT.getScalarSizeInBits() * i);
13297 NegBits = DefBits ^ NegBits;
13298
13299 // Try to create the new constants with MOVI, and if so generate a fneg
13300 // for it.
13301 if (SDValue NewOp = TryMOVIWithBits(NegBits)) {
13302 SDLoc DL(Op);
13303 MVT VFVT = NumElts == 1 ? FVT : MVT::getVectorVT(FVT, NumElts);
13304 return DAG.getNode(
13305 AArch64ISD::NVCAST, DL, VT,
13306 DAG.getNode(ISD::FNEG, DL, VFVT,
13307 DAG.getNode(AArch64ISD::NVCAST, DL, VFVT, NewOp)));
13308 }
13309 return SDValue();
13310 };
13311 SDValue R;
13312 if ((R = TryWithFNeg(DefBits, MVT::f32)) ||
13313 (R = TryWithFNeg(DefBits, MVT::f64)) ||
13314 (ST->hasFullFP16() && (R = TryWithFNeg(DefBits, MVT::f16))))
13315 return R;
13316 }
13317
13318 return SDValue();
13319}
13320
13321SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
13322 SelectionDAG &DAG) const {
13323 EVT VT = Op.getValueType();
13324
13325 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
13326 if (auto SeqInfo = cast<BuildVectorSDNode>(Op)->isConstantSequence()) {
13327 SDLoc DL(Op);
13328 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
13329 SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
13330 SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
13331 SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
13332 return convertFromScalableVector(DAG, Op.getValueType(), Seq);
13333 }
13334
13335 // Revert to common legalisation for all other variants.
13336 return SDValue();
13337 }
13338
13339 // Try to build a simple constant vector.
13340 Op = NormalizeBuildVector(Op, DAG);
13341 // Though this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS), if so,
13342 // abort.
13343 if (Op.getOpcode() != ISD::BUILD_VECTOR)
13344 return SDValue();
13345
13346 // Certain vector constants, used to express things like logical NOT and
13347 // arithmetic NEG, are passed through unmodified. This allows special
13348 // patterns for these operations to match, which will lower these constants
13349 // to whatever is proven necessary.
13350 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
13351 if (BVN->isConstant()) {
13352 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
13353 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
13354 APInt Val(BitSize,
13355 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
13356 if (Val.isZero() || (VT.isInteger() && Val.isAllOnes()))
13357 return Op;
13358 }
13359 if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode())
13360 if (Const->isZero() && !Const->isNegative())
13361 return Op;
13362 }
13363
13364 if (SDValue V = ConstantBuildVector(Op, DAG, Subtarget))
13365 return V;
13366
13367 // Scan through the operands to find some interesting properties we can
13368 // exploit:
13369 // 1) If only one value is used, we can use a DUP, or
13370 // 2) if only the low element is not undef, we can just insert that, or
13371 // 3) if only one constant value is used (w/ some non-constant lanes),
13372 // we can splat the constant value into the whole vector then fill
13373 // in the non-constant lanes.
13374 // 4) FIXME: If different constant values are used, but we can intelligently
13375 // select the values we'll be overwriting for the non-constant
13376 // lanes such that we can directly materialize the vector
13377 // some other way (MOVI, e.g.), we can be sneaky.
13378 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
13379 SDLoc dl(Op);
13380 unsigned NumElts = VT.getVectorNumElements();
13381 bool isOnlyLowElement = true;
13382 bool usesOnlyOneValue = true;
13383 bool usesOnlyOneConstantValue = true;
13384 bool isConstant = true;
13385 bool AllLanesExtractElt = true;
13386 unsigned NumConstantLanes = 0;
13387 unsigned NumDifferentLanes = 0;
13388 unsigned NumUndefLanes = 0;
13389 SDValue Value;
13390 SDValue ConstantValue;
13391 SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
13392 unsigned ConsecutiveValCount = 0;
13393 SDValue PrevVal;
13394 for (unsigned i = 0; i < NumElts; ++i) {
13395 SDValue V = Op.getOperand(i);
13396 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13397 AllLanesExtractElt = false;
13398 if (V.isUndef()) {
13399 ++NumUndefLanes;
13400 continue;
13401 }
13402 if (i > 0)
13403 isOnlyLowElement = false;
13404 if (!isIntOrFPConstant(V))
13405 isConstant = false;
13406
13407 if (isIntOrFPConstant(V)) {
13408 ++NumConstantLanes;
13409 if (!ConstantValue.getNode())
13410 ConstantValue = V;
13411 else if (ConstantValue != V)
13412 usesOnlyOneConstantValue = false;
13413 }
13414
13415 if (!Value.getNode())
13416 Value = V;
13417 else if (V != Value) {
13418 usesOnlyOneValue = false;
13419 ++NumDifferentLanes;
13420 }
13421
13422 if (PrevVal != V) {
13423 ConsecutiveValCount = 0;
13424 PrevVal = V;
13425 }
13426
13427 // Keep the different values and their last consecutive counts. For example,
13428 //
13429 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
13430 // t24, t24, t24, t24, t24, t24, t24, t24
13431 // t23 = consecutive count 8
13432 // t24 = consecutive count 8
13433 // ------------------------------------------------------------------
13434 // t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24,
13435 // t24, t24, t24, t24, t24, t24, t24, t24
13436 // t23 = consecutive count 5
13437 // t24 = consecutive count 9
13438 DifferentValueMap[V] = ++ConsecutiveValCount;
13439 }
13440
13441 if (!Value.getNode()) {
13442 LLVM_DEBUG(
13443 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
13444 return DAG.getUNDEF(VT);
13445 }
13446
13447 // Convert BUILD_VECTOR where all elements but the lowest are undef into
13448 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
13449 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
13450 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
13451 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
13452 "SCALAR_TO_VECTOR node\n");
13453 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
13454 }
13455
13456 if (AllLanesExtractElt) {
13457 SDNode *Vector = nullptr;
13458 bool Even = false;
13459 bool Odd = false;
13460 // Check whether the extract elements match the Even pattern <0,2,4,...> or
13461 // the Odd pattern <1,3,5,...>.
13462 for (unsigned i = 0; i < NumElts; ++i) {
13463 SDValue V = Op.getOperand(i);
13464 const SDNode *N = V.getNode();
13465 if (!isa<ConstantSDNode>(N->getOperand(1))) {
13466 Even = false;
13467 Odd = false;
13468 break;
13469 }
13470 SDValue N0 = N->getOperand(0);
13471
13472 // All elements are extracted from the same vector.
13473 if (!Vector) {
13474 Vector = N0.getNode();
13475 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
13476 // BUILD_VECTOR.
13477 if (VT.getVectorElementType() !=
13478 N0.getValueType().getVectorElementType())
13479 break;
13480 } else if (Vector != N0.getNode()) {
13481 Odd = false;
13482 Even = false;
13483 break;
13484 }
13485
13486 // Extracted values are either at Even indices <0,2,4,...> or at Odd
13487 // indices <1,3,5,...>.
13488 uint64_t Val = N->getConstantOperandVal(1);
13489 if (Val == 2 * i) {
13490 Even = true;
13491 continue;
13492 }
13493 if (Val - 1 == 2 * i) {
13494 Odd = true;
13495 continue;
13496 }
13497
13498 // Something does not match: abort.
13499 Odd = false;
13500 Even = false;
13501 break;
13502 }
13503 if (Even || Odd) {
13504 SDValue LHS =
13505 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
13506 DAG.getConstant(0, dl, MVT::i64));
13507 SDValue RHS =
13508 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
13509 DAG.getConstant(NumElts, dl, MVT::i64));
13510
13511 if (Even && !Odd)
13512 return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS,
13513 RHS);
13514 if (Odd && !Even)
13515 return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS,
13516 RHS);
13517 }
13518 }
13519
13520 // Use DUP for non-constant splats. For f32 constant splats, reduce to
13521 // i32 and try again.
13522 if (usesOnlyOneValue) {
13523 if (!isConstant) {
13524 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13525 Value.getValueType() != VT) {
13526 LLVM_DEBUG(
13527 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
13528 return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
13529 }
13530
13531 // This is actually a DUPLANExx operation, which keeps everything vectory.
13532
13533 SDValue Lane = Value.getOperand(1);
13534 Value = Value.getOperand(0);
13535 if (Value.getValueSizeInBits() == 64) {
13536 LLVM_DEBUG(
13537 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
13538 "widening it\n");
13539 Value = WidenVector(Value, DAG);
13540 }
13541
13542 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
13543 return DAG.getNode(Opcode, dl, VT, Value, Lane);
13544 }
13545
13546 if (VT.getVectorElementType().isFloatingPoint()) {
13547 SmallVector<SDValue, 8> Ops;
13548 EVT EltTy = VT.getVectorElementType();
13549 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
13550 EltTy == MVT::f64) && "Unsupported floating-point vector type");
13551 LLVM_DEBUG(
13552 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
13553 "BITCASTS, and try again\n");
13554 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
13555 for (unsigned i = 0; i < NumElts; ++i)
13556 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
13557 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
13558 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
13559 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
13560 Val.dump(););
13561 Val = LowerBUILD_VECTOR(Val, DAG);
13562 if (Val.getNode())
13563 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
13564 }
13565 }
13566
13567 // If we need to insert a small number of different non-constant elements and
13568 // the vector width is sufficiently large, prefer using DUP with the common
13569 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
13570 // skip the constant lane handling below.
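// For example, a v8i16 whose lanes are <x,x,x,x,x,x,y,z> with x, y and z
// non-constant has NumDifferentLanes == 2 < (8 - 0) / 2, so it is built as a
// DUP of x followed by two lane inserts.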
13571 bool PreferDUPAndInsert =
13572 !isConstant && NumDifferentLanes >= 1 &&
13573 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
13574 NumDifferentLanes >= NumConstantLanes;
13575
13576 // If there was only one constant value used and for more than one lane,
13577 // start by splatting that value, then replace the non-constant lanes. This
13578 // is better than the default, which will perform a separate initialization
13579 // for each lane.
13580 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
13581 // Firstly, try to materialize the splat constant.
13582 SDValue Val = DAG.getSplatBuildVector(VT, dl, ConstantValue);
13583 unsigned BitSize = VT.getScalarSizeInBits();
13584 APInt ConstantValueAPInt(1, 0);
13585 if (auto *C = dyn_cast<ConstantSDNode>(ConstantValue))
13586 ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(BitSize);
13587 if (!isNullConstant(ConstantValue) && !isNullFPConstant(ConstantValue) &&
13588 !ConstantValueAPInt.isAllOnes()) {
13589 Val = ConstantBuildVector(Val, DAG, Subtarget);
13590 if (!Val)
13591 // Otherwise, materialize the constant and splat it.
13592 Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
13593 }
13594
13595 // Now insert the non-constant lanes.
13596 for (unsigned i = 0; i < NumElts; ++i) {
13597 SDValue V = Op.getOperand(i);
13598 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
13599 if (!isIntOrFPConstant(V))
13600 // Note that type legalization likely mucked about with the VT of the
13601 // source operand, so we may have to convert it here before inserting.
13602 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
13603 }
13604 return Val;
13605 }
13606
13607 // This will generate a load from the constant pool.
13608 if (isConstant) {
13609 LLVM_DEBUG(
13610 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
13611 "expansion\n");
13612 return SDValue();
13613 }
13614
13615 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
13616 // v4i32s. This is really a truncate, which we can construct out of (legal)
13617 // concats and truncate nodes.
13618 if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG))
13619 return M;
13620
13621 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
13622 if (NumElts >= 4) {
13623 if (SDValue Shuffle = ReconstructShuffle(Op, DAG))
13624 return Shuffle;
13625
13626 if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG))
13627 return Shuffle;
13628 }
13629
13630 if (PreferDUPAndInsert) {
13631 // First, build a constant vector with the common element.
13632 SmallVector<SDValue, 8> Ops(NumElts, Value);
13633 SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops), DAG);
13634 // Next, insert the elements that do not match the common value.
13635 for (unsigned I = 0; I < NumElts; ++I)
13636 if (Op.getOperand(I) != Value)
13637 NewVector =
13638 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVector,
13639 Op.getOperand(I), DAG.getConstant(I, dl, MVT::i64));
13640
13641 return NewVector;
13642 }
13643
13644 // If vector consists of two different values, try to generate two DUPs and
13645 // (CONCAT_VECTORS or VECTOR_SHUFFLE).
13646 if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) {
13647 SmallVector<SDValue, 2> Vals;
13648 // Check that the consecutive count of each value is half the number of vector
13649 // elements. In this case, we can use CONCAT_VECTORS. For example,
13650 //
13651 // canUseVECTOR_CONCAT = true;
13652 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
13653 // t24, t24, t24, t24, t24, t24, t24, t24
13654 //
13655 // canUseVECTOR_CONCAT = false;
13656 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24,
13657 // t24, t24, t24, t24, t24, t24, t24, t24
13658 bool canUseVECTOR_CONCAT = true;
13659 for (auto Pair : DifferentValueMap) {
13660 // Check different values have same length which is NumElts / 2.
13661 if (Pair.second != NumElts / 2)
13662 canUseVECTOR_CONCAT = false;
13663 Vals.push_back(Pair.first);
13664 }
13665
13666 // If canUseVECTOR_CONCAT is true, we can generate two DUPs and
13667 // CONCAT_VECTORs. For example,
13668 //
13669 // t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
13670 // t24, t24, t24, t24, t24, t24, t24, t24
13671 // ==>
13672 // t26: v8i8 = AArch64ISD::DUP t23
13673 // t28: v8i8 = AArch64ISD::DUP t24
13674 // t29: v16i8 = concat_vectors t26, t28
13675 if (canUseVECTOR_CONCAT) {
13676 EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
13677 if (isTypeLegal(SubVT) && SubVT.isVector() &&
13678 SubVT.getVectorNumElements() >= 2) {
13679 SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]);
13680 SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]);
13681 SDValue DUP1 =
13682 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops1), DAG);
13683 SDValue DUP2 =
13684 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops2), DAG);
13685 SDValue CONCAT_VECTORS =
13686 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DUP1, DUP2);
13687 return CONCAT_VECTORS;
13688 }
13689 }
13690
13691 // Let's try to generate VECTOR_SHUFFLE. For example,
13692 //
13693 // t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26
13694 // ==>
13695 // t27: v8i8 = BUILD_VECTOR t26, t26, t26, t26, t26, t26, t26, t26
13696 // t28: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t25, t25, t25, t25
13697 // t29: v8i8 = vector_shuffle<0,1,2,3,12,13,14,15> t27, t28
13698 if (NumElts >= 8) {
13699 SmallVector<int, 16> MaskVec;
13700 // Build the mask for VECTOR_SHUFFLE.
13701 SDValue FirstLaneVal = Op.getOperand(0);
13702 for (unsigned i = 0; i < NumElts; ++i) {
13703 SDValue Val = Op.getOperand(i);
13704 if (FirstLaneVal == Val)
13705 MaskVec.push_back(i);
13706 else
13707 MaskVec.push_back(i + NumElts);
13708 }
13709
13710 SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]);
13711 SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]);
13712 SDValue VEC1 = DAG.getBuildVector(VT, dl, Ops1);
13713 SDValue VEC2 = DAG.getBuildVector(VT, dl, Ops2);
13714 SDValue VECTOR_SHUFFLE =
13715 DAG.getVectorShuffle(VT, dl, VEC1, VEC2, MaskVec);
13716 return VECTOR_SHUFFLE;
13717 }
13718 }
13719
13720 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
13721 // know the default expansion would otherwise fall back on something even
13722 // worse. For a vector with one or two non-undef values, that's
13723 // scalar_to_vector for the elements followed by a shuffle (provided the
13724 // shuffle is valid for the target) and materialization element by element
13725 // on the stack followed by a load for everything else.
13726 if (!isConstant && !usesOnlyOneValue) {
13727 LLVM_DEBUG(
13728 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
13729 "of INSERT_VECTOR_ELT\n");
13730
13731 SDValue Vec = DAG.getUNDEF(VT);
13732 SDValue Op0 = Op.getOperand(0);
13733 unsigned i = 0;
13734
13735 // Use SCALAR_TO_VECTOR for lane zero to
13736 // a) Avoid a RMW dependency on the full vector register, and
13737 // b) Allow the register coalescer to fold away the copy if the
13738 // value is already in an S or D register, and we're forced to emit an
13739 // INSERT_SUBREG that we can't fold anywhere.
13740 //
13741 // We also allow types like i8 and i16 which are illegal scalar but legal
13742 // vector element types. After type-legalization the inserted value is
13743 // extended (i32) and it is safe to cast them to the vector type by ignoring
13744 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
13745 if (!Op0.isUndef()) {
13746 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
13747 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
13748 ++i;
13749 }
13750 LLVM_DEBUG(if (i < NumElts) dbgs()
13751 << "Creating nodes for the other vector elements:\n";);
13752 for (; i < NumElts; ++i) {
13753 SDValue V = Op.getOperand(i);
13754 if (V.isUndef())
13755 continue;
13756 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
13757 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
13758 }
13759 return Vec;
13760 }
13761
13762 LLVM_DEBUG(
13763 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
13764 "better alternative\n");
13765 return SDValue();
13766}
13767
13768SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
13769 SelectionDAG &DAG) const {
13770 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
13771 !Subtarget->isNeonAvailable()))
13772 return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
13773
13774 assert(Op.getValueType().isScalableVector() &&
13775 isTypeLegal(Op.getValueType()) &&
13776 "Expected legal scalable vector type!");
13777
13778 if (isTypeLegal(Op.getOperand(0).getValueType())) {
13779 unsigned NumOperands = Op->getNumOperands();
13780 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
13781 "Unexpected number of operands in CONCAT_VECTORS");
13782
13783 if (NumOperands == 2)
13784 return Op;
13785
13786 // Concat each pair of subvectors and pack into the lower half of the array.
13787 SmallVector<SDValue> ConcatOps(Op->op_begin(), Op->op_end());
13788 while (ConcatOps.size() > 1) {
13789 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
13790 SDValue V1 = ConcatOps[I];
13791 SDValue V2 = ConcatOps[I + 1];
13792 EVT SubVT = V1.getValueType();
13793 EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext());
13794 ConcatOps[I / 2] =
13795 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2);
13796 }
13797 ConcatOps.resize(ConcatOps.size() / 2);
13798 }
13799 return ConcatOps[0];
13800 }
13801
13802 return SDValue();
13803}
13804
13805SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
13806 SelectionDAG &DAG) const {
13807 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
13808
13809 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
13810 !Subtarget->isNeonAvailable()))
13811 return LowerFixedLengthInsertVectorElt(Op, DAG);
13812
13813 EVT VT = Op.getOperand(0).getValueType();
13814
13815 if (VT.getScalarType() == MVT::i1) {
13816 EVT VectorVT = getPromotedVTForPredicate(VT);
13817 SDLoc DL(Op);
13818 SDValue ExtendedVector =
13819 DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
13820 SDValue ExtendedValue =
13821 DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
13822 VectorVT.getScalarType().getSizeInBits() < 32
13823 ? MVT::i32
13824 : VectorVT.getScalarType());
13825 ExtendedVector =
13826 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
13827 ExtendedValue, Op.getOperand(2));
13828 return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
13829 }
13830
13831 // Check for non-constant or out of range lane.
13832 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
13833 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
13834 return SDValue();
13835
13836 return Op;
13837}
13838
13839SDValue
13840AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
13841 SelectionDAG &DAG) const {
13842 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
13843 EVT VT = Op.getOperand(0).getValueType();
13844
13845 if (VT.getScalarType() == MVT::i1) {
13846 // We can't directly extract from an SVE predicate; extend it first.
13847 // (This isn't the only possible lowering, but it's straightforward.)
13848 EVT VectorVT = getPromotedVTForPredicate(VT);
13849 SDLoc DL(Op);
13850 SDValue Extend =
13851 DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
13852 MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
13853 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
13854 Extend, Op.getOperand(1));
13855 return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
13856 }
13857
13858 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
13859 return LowerFixedLengthExtractVectorElt(Op, DAG);
13860
13861 // Check for non-constant or out of range lane.
13862 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
13863 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
13864 return SDValue();
13865
13866 // Insertion/extraction are legal for V128 types.
13867 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
13868 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
13869 VT == MVT::v8f16 || VT == MVT::v8bf16)
13870 return Op;
13871
13872 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
13873 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
13874 VT != MVT::v4bf16)
13875 return SDValue();
13876
13877 // For V64 types, we perform extraction by expanding the value
13878 // to a V128 type and perform the extraction on that.
13879 SDLoc DL(Op);
13880 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
13881 EVT WideTy = WideVec.getValueType();
13882
13883 EVT ExtrTy = WideTy.getVectorElementType();
13884 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
13885 ExtrTy = MVT::i32;
13886
13887 // For extractions, we just return the result directly.
13888 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
13889 Op.getOperand(1));
13890}
13891
13892SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
13893 SelectionDAG &DAG) const {
13894 assert(Op.getValueType().isFixedLengthVector() &&
13895 "Only cases that extract a fixed length vector are supported!");
13896
13897 EVT InVT = Op.getOperand(0).getValueType();
13898 unsigned Idx = Op.getConstantOperandVal(1);
13899 unsigned Size = Op.getValueSizeInBits();
13900
13901 // If we don't have legal types yet, do nothing
13902 if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT))
13903 return SDValue();
13904
13905 if (InVT.isScalableVector()) {
13906 // This will be matched by custom code during ISelDAGToDAG.
13907 if (Idx == 0 && isPackedVectorType(InVT, DAG))
13908 return Op;
13909
13910 return SDValue();
13911 }
13912
13913 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
13914 if (Idx == 0 && InVT.getSizeInBits() <= 128)
13915 return Op;
13916
13917 // If this is extracting the upper 64-bits of a 128-bit vector, we match
13918 // that directly.
13919 if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64 &&
13920 InVT.getSizeInBits() == 128 && Subtarget->isNeonAvailable())
13921 return Op;
13922
13923 if (useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) {
13924 SDLoc DL(Op);
13925
13926 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
13927 SDValue NewInVec =
13928 convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
13929
13930 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ContainerVT, NewInVec,
13931 NewInVec, DAG.getConstant(Idx, DL, MVT::i64));
13932 return convertFromScalableVector(DAG, Op.getValueType(), Splice);
13933 }
13934
13935 return SDValue();
13936}
13937
13938SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
13939 SelectionDAG &DAG) const {
13940 assert(Op.getValueType().isScalableVector() &&
13941 "Only expect to lower inserts into scalable vectors!");
13942
13943 EVT InVT = Op.getOperand(1).getValueType();
13944 unsigned Idx = Op.getConstantOperandVal(2);
13945
13946 SDValue Vec0 = Op.getOperand(0);
13947 SDValue Vec1 = Op.getOperand(1);
13948 SDLoc DL(Op);
13949 EVT VT = Op.getValueType();
13950
13951 if (InVT.isScalableVector()) {
13952 if (!isTypeLegal(VT))
13953 return SDValue();
13954
13955 // Break down insert_subvector into simpler parts.
13956 if (VT.getVectorElementType() == MVT::i1) {
13957 unsigned NumElts = VT.getVectorMinNumElements();
13958 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
13959
13960 SDValue Lo, Hi;
13961 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
13962 DAG.getVectorIdxConstant(0, DL));
13963 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
13964 DAG.getVectorIdxConstant(NumElts / 2, DL));
13965 if (Idx < (NumElts / 2))
13966 Lo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1,
13967 DAG.getVectorIdxConstant(Idx, DL));
13968 else
13969 Hi = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1,
13970 DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL));
13971
13972 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
13973 }
13974
13975 // Ensure the subvector is half the size of the main vector.
13976 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
13977 return SDValue();
13978
13979 // Here narrow and wide refer to the vector element types. After "casting",
13980 // both vectors must have the same bit length, so because the subvector
13981 // has fewer elements, those elements need to be bigger.
13982 EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount());
13983 EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount());
13984
13985 // NOP cast operands to the largest legal vector of the same element count.
13986 if (VT.isFloatingPoint()) {
13987 Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG);
13988 Vec1 = getSVESafeBitCast(WideVT, Vec1, DAG);
13989 } else {
13990 // Legal integer vectors are already their largest so Vec0 is fine as is.
13991 Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
13992 }
13993
13994 // To replace the top/bottom half of vector V with vector SubV we widen the
13995 // preserved half of V, concatenate this to SubV (the order depending on the
13996 // half being replaced) and then narrow the result.
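// Rough example (types chosen only for illustration): replacing the low half
// of an nxv4f32 Vec0 with an nxv2f32 Vec1 casts Vec0 to nxv4i32 and Vec1 to
// nxv2i64, widens the preserved top half of Vec0 with UUNPKHI, and then UZP1
// keeps the low 32 bits of every 64-bit lane of (Vec1, HiVec0), giving
// <sub0, sub1, old2, old3> before the final bitcast back to VT.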
13997 SDValue Narrow;
13998 if (Idx == 0) {
13999 SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
14000 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0);
14001 } else {
14003 "Invalid subvector index!");
14004 SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
14005 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1);
14006 }
14007
14008 return getSVESafeBitCast(VT, Narrow, DAG);
14009 }
14010
14011 if (Idx == 0 && isPackedVectorType(VT, DAG)) {
14012 // This will be matched by custom code during ISelDAGToDAG.
14013 if (Vec0.isUndef())
14014 return Op;
14015
14016 std::optional<unsigned> PredPattern =
14017 getSVEPredPatternFromNumElements(InVT.getVectorNumElements());
14018 auto PredTy = VT.changeVectorElementType(MVT::i1);
14019 SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
14020 SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
14021 return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
14022 }
14023
14024 return SDValue();
14025}
14026
14027static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
14028 if (Op.getOpcode() != AArch64ISD::DUP &&
14029 Op.getOpcode() != ISD::SPLAT_VECTOR &&
14030 Op.getOpcode() != ISD::BUILD_VECTOR)
14031 return false;
14032
14033 if (Op.getOpcode() == ISD::BUILD_VECTOR &&
14034 !isAllConstantBuildVector(Op, SplatVal))
14035 return false;
14036
14037 if (Op.getOpcode() != ISD::BUILD_VECTOR &&
14038 !isa<ConstantSDNode>(Op->getOperand(0)))
14039 return false;
14040
14041 SplatVal = Op->getConstantOperandVal(0);
14042 if (Op.getValueType().getVectorElementType() != MVT::i64)
14043 SplatVal = (int32_t)SplatVal;
14044
14045 Negated = false;
14046 if (isPowerOf2_64(SplatVal))
14047 return true;
14048
14049 Negated = true;
14050 if (isPowerOf2_64(-SplatVal)) {
14051 SplatVal = -SplatVal;
14052 return true;
14053 }
14054
14055 return false;
14056}
14057
14058SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
14059 EVT VT = Op.getValueType();
14060 SDLoc dl(Op);
14061
14062 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
14063 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
14064
14065 assert(VT.isScalableVector() && "Expected a scalable vector.");
14066
14067 bool Signed = Op.getOpcode() == ISD::SDIV;
14068 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
14069
14070 bool Negated;
14071 uint64_t SplatVal;
14072 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
14073 SDValue Pg = getPredicateForScalableVector(DAG, dl, VT);
14074 SDValue Res =
14075 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, VT, Pg, Op->getOperand(0),
14076 DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32));
14077 if (Negated)
14078 Res = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Res);
14079
14080 return Res;
14081 }
14082
14083 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
14084 return LowerToPredicatedOp(Op, DAG, PredOpcode);
14085
14086 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
14087 // operations, and truncate the result.
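// As a sketch: an nxv16i8 sdiv is unpacked with SUNPKLO/SUNPKHI into two
// nxv8i16 halves; those nodes are widened again to nxv4i32 when they are
// lowered in turn, and UZP1 re-packs the (exact) narrow quotients.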
14088 EVT WidenedVT;
14089 if (VT == MVT::nxv16i8)
14090 WidenedVT = MVT::nxv8i16;
14091 else if (VT == MVT::nxv8i16)
14092 WidenedVT = MVT::nxv4i32;
14093 else
14094 llvm_unreachable("Unexpected Custom DIV operation");
14095
14096 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
14097 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
14098 SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0));
14099 SDValue Op1Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(1));
14100 SDValue Op0Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(0));
14101 SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1));
14102 SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo);
14103 SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi);
14104 return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi);
14105}
14106
14107bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
14108 // Currently no fixed length shuffles that require SVE are legal.
14109 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14110 return false;
14111
14112 if (VT.getVectorNumElements() == 4 &&
14113 (VT.is128BitVector() || VT.is64BitVector())) {
14114 unsigned Cost = getPerfectShuffleCost(M);
14115 if (Cost <= 1)
14116 return true;
14117 }
14118
14119 bool DummyBool;
14120 int DummyInt;
14121 unsigned DummyUnsigned;
14122
14123 unsigned EltSize = VT.getScalarSizeInBits();
14124 unsigned NumElts = VT.getVectorNumElements();
14125 return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
14126 isREVMask(M, EltSize, NumElts, 64) ||
14127 isREVMask(M, EltSize, NumElts, 32) ||
14128 isREVMask(M, EltSize, NumElts, 16) ||
14129 isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
14130 isTRNMask(M, NumElts, DummyUnsigned) ||
14131 isUZPMask(M, NumElts, DummyUnsigned) ||
14132 isZIPMask(M, NumElts, DummyUnsigned) ||
14133 isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
14134 isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
14135 isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
14136 isINSMask(M, NumElts, DummyBool, DummyInt) ||
14137 isConcatMask(M, VT, VT.getSizeInBits() == 128));
14138}
14139
14140bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
14141 EVT VT) const {
14142 // Just delegate to the generic legality, clear masks aren't special.
14143 return isShuffleMaskLegal(M, VT);
14144}
14145
14146/// getVShiftImm - Check if this is a valid build_vector for the immediate
14147/// operand of a vector shift operation, where all the elements of the
14148/// build_vector must have the same constant integer value.
14149static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
14150 // Ignore bit_converts.
14151 while (Op.getOpcode() == ISD::BITCAST)
14152 Op = Op.getOperand(0);
14153 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
14154 APInt SplatBits, SplatUndef;
14155 unsigned SplatBitSize;
14156 bool HasAnyUndefs;
14157 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
14158 HasAnyUndefs, ElementBits) ||
14159 SplatBitSize > ElementBits)
14160 return false;
14161 Cnt = SplatBits.getSExtValue();
14162 return true;
14163}
14164
14165/// isVShiftLImm - Check if this is a valid build_vector for the immediate
14166/// operand of a vector shift left operation. That value must be in the range:
14167/// 0 <= Value < ElementBits for a left shift; or
14168/// 0 <= Value <= ElementBits for a long left shift.
14169static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
14170 assert(VT.isVector() && "vector shift count is not a vector type");
14171 int64_t ElementBits = VT.getScalarSizeInBits();
14172 if (!getVShiftImm(Op, ElementBits, Cnt))
14173 return false;
14174 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
14175}
14176
14177/// isVShiftRImm - Check if this is a valid build_vector for the immediate
14178/// operand of a vector shift right operation. The value must be in the range:
14179 /// 1 <= Value <= ElementBits for a right shift, or
14179 /// 1 <= Value <= ElementBits/2 for a narrow right shift.
14180static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
14181 assert(VT.isVector() && "vector shift count is not a vector type");
14182 int64_t ElementBits = VT.getScalarSizeInBits();
14183 if (!getVShiftImm(Op, ElementBits, Cnt))
14184 return false;
14185 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
14186}
14187
14188SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
14189 SelectionDAG &DAG) const {
14190 EVT VT = Op.getValueType();
14191
14192 if (VT.getScalarType() == MVT::i1) {
14193 // Lower i1 truncate to `(x & 1) != 0`.
14194 SDLoc dl(Op);
14195 EVT OpVT = Op.getOperand(0).getValueType();
14196 SDValue Zero = DAG.getConstant(0, dl, OpVT);
14197 SDValue One = DAG.getConstant(1, dl, OpVT);
14198 SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One);
14199 return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE);
14200 }
14201
14202 if (!VT.isVector() || VT.isScalableVector())
14203 return SDValue();
14204
14205 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
14206 !Subtarget->isNeonAvailable()))
14207 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
14208
14209 return SDValue();
14210}
14211
14212// Check if we can lower this SRL to a rounding shift instruction. ResVT is
14213// possibly a truncated type; it tells how many bits of the value are to be
14214// used.
14215static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT,
14216 SelectionDAG &DAG,
14217 unsigned &ShiftValue,
14218 SDValue &RShOperand) {
14219 if (Shift->getOpcode() != ISD::SRL)
14220 return false;
14221
14222 EVT VT = Shift.getValueType();
14223 assert(VT.isScalableVT());
14224
14225 auto ShiftOp1 =
14226 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Shift->getOperand(1)));
14227 if (!ShiftOp1)
14228 return false;
14229
14230 ShiftValue = ShiftOp1->getZExtValue();
14231 if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits())
14232 return false;
14233
14234 SDValue Add = Shift->getOperand(0);
14235 if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
14236 return false;
14237
14239 "ResVT must be truncated or same type as the shift.");
14240 // Check if an overflow can lead to incorrect results.
14241 uint64_t ExtraBits = VT.getScalarSizeInBits() - ResVT.getScalarSizeInBits();
14242 if (ShiftValue > ExtraBits && !Add->getFlags().hasNoUnsignedWrap())
14243 return false;
14244
14245 auto AddOp1 =
14246 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Add->getOperand(1)));
14247 if (!AddOp1)
14248 return false;
14249 uint64_t AddValue = AddOp1->getZExtValue();
14250 if (AddValue != 1ULL << (ShiftValue - 1))
14251 return false;
14252
14253 RShOperand = Add->getOperand(0);
14254 return true;
14255}
14256
14257SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
14258 SelectionDAG &DAG) const {
14259 EVT VT = Op.getValueType();
14260 SDLoc DL(Op);
14261 int64_t Cnt;
14262
14263 if (!Op.getOperand(1).getValueType().isVector())
14264 return Op;
14265 unsigned EltSize = VT.getScalarSizeInBits();
14266
14267 switch (Op.getOpcode()) {
14268 case ISD::SHL:
14269 if (VT.isScalableVector() ||
14270 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14271 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
14272
14273 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
14274 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
14275 DAG.getConstant(Cnt, DL, MVT::i32));
14276 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
14277 DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
14278 MVT::i32),
14279 Op.getOperand(0), Op.getOperand(1));
14280 case ISD::SRA:
14281 case ISD::SRL:
14282 if (VT.isScalableVector() && Subtarget->hasSVE2orSME()) {
14283 SDValue RShOperand;
14284 unsigned ShiftValue;
14285 if (canLowerSRLToRoundingShiftForVT(Op, VT, DAG, ShiftValue, RShOperand))
14286 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, VT,
14287 getPredicateForVector(DAG, DL, VT), RShOperand,
14288 DAG.getTargetConstant(ShiftValue, DL, MVT::i32));
14289 }
14290
14291 if (VT.isScalableVector() ||
14292 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
14293 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
14294 : AArch64ISD::SRL_PRED;
14295 return LowerToPredicatedOp(Op, DAG, Opc);
14296 }
14297
14298 // Right shift immediate
14299 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
14300 unsigned Opc =
14301 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
14302 return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
14303 DAG.getConstant(Cnt, DL, MVT::i32));
14304 }
14305
14306 // Right shift register. Note that there is no right-shift-by-register
14307 // instruction, but the left-shift-by-register instruction takes a signed
14308 // value, where negative amounts specify a right shift.
14309 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
14310 : Intrinsic::aarch64_neon_ushl;
14311 // negate the shift amount
14312 SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
14313 Op.getOperand(1));
14314 SDValue NegShiftLeft =
14315 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
14316 DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
14317 NegShift);
14318 return NegShiftLeft;
14319 }
14320
14321 llvm_unreachable("unexpected shift opcode");
14322}
14323
14324static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
14325 AArch64CC::CondCode CC, bool NoNans, EVT VT,
14326 const SDLoc &dl, SelectionDAG &DAG) {
14327 EVT SrcVT = LHS.getValueType();
14328 assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
14329 "function only supposed to emit natural comparisons");
14330
14331 APInt SplatValue;
14332 APInt SplatUndef;
14333 unsigned SplatBitSize = 0;
14334 bool HasAnyUndefs;
14335
14336 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
14337 bool IsCnst = BVN && BVN->isConstantSplat(SplatValue, SplatUndef,
14338 SplatBitSize, HasAnyUndefs);
14339
14340 bool IsZero = IsCnst && SplatValue == 0;
14341 bool IsOne =
14342 IsCnst && SrcVT.getScalarSizeInBits() == SplatBitSize && SplatValue == 1;
14343 bool IsMinusOne = IsCnst && SplatValue.isAllOnes();
14344
14345 if (SrcVT.getVectorElementType().isFloatingPoint()) {
14346 switch (CC) {
14347 default:
14348 return SDValue();
14349 case AArch64CC::NE: {
14350 SDValue Fcmeq;
14351 if (IsZero)
14352 Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
14353 else
14354 Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
14355 return DAG.getNOT(dl, Fcmeq, VT);
14356 }
14357 case AArch64CC::EQ:
14358 if (IsZero)
14359 return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
14360 return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
14361 case AArch64CC::GE:
14362 if (IsZero)
14363 return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
14364 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
14365 case AArch64CC::GT:
14366 if (IsZero)
14367 return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
14368 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
14369 case AArch64CC::LE:
14370 if (!NoNans)
14371 return SDValue();
14372 // If we ignore NaNs then we can use the LS implementation.
14373 [[fallthrough]];
14374 case AArch64CC::LS:
14375 if (IsZero)
14376 return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
14377 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
14378 case AArch64CC::LT:
14379 if (!NoNans)
14380 return SDValue();
14381 // If we ignore NaNs then we can use the MI implementation.
14382 [[fallthrough]];
14383 case AArch64CC::MI:
14384 if (IsZero)
14385 return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
14386 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
14387 }
14388 }
14389
14390 switch (CC) {
14391 default:
14392 return SDValue();
14393 case AArch64CC::NE: {
14394 SDValue Cmeq;
14395 if (IsZero)
14396 Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
14397 else
14398 Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
14399 return DAG.getNOT(dl, Cmeq, VT);
14400 }
14401 case AArch64CC::EQ:
14402 if (IsZero)
14403 return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
14404 return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
14405 case AArch64CC::GE:
14406 if (IsZero)
14407 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
14408 return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
14409 case AArch64CC::GT:
14410 if (IsZero)
14411 return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
14412 if (IsMinusOne)
14413 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
14414 return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
14415 case AArch64CC::LE:
14416 if (IsZero)
14417 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
14418 return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
14419 case AArch64CC::LS:
14420 return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
14421 case AArch64CC::LO:
14422 return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
14423 case AArch64CC::LT:
14424 if (IsZero)
14425 return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
14426 if (IsOne)
14427 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
14428 return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
14429 case AArch64CC::HI:
14430 return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
14431 case AArch64CC::HS:
14432 return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
14433 }
14434}
14435
14436SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
14437 SelectionDAG &DAG) const {
14438 if (Op.getValueType().isScalableVector())
14439 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
14440
14441 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
14442 !Subtarget->isNeonAvailable()))
14443 return LowerFixedLengthVectorSetccToSVE(Op, DAG);
14444
14445 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
14446 SDValue LHS = Op.getOperand(0);
14447 SDValue RHS = Op.getOperand(1);
14448 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
14449 SDLoc dl(Op);
14450
14451 if (LHS.getValueType().getVectorElementType().isInteger()) {
14452 assert(LHS.getValueType() == RHS.getValueType());
14453 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
14454 SDValue Cmp =
14455 EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
14456 return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
14457 }
14458
14459 // Lower isnan(x) | isnan(never-nan) to x != x.
14460 // Lower !isnan(x) & !isnan(never-nan) to x == x.
14461 if (CC == ISD::SETUO || CC == ISD::SETO) {
14462 bool OneNaN = false;
14463 if (LHS == RHS) {
14464 OneNaN = true;
14465 } else if (DAG.isKnownNeverNaN(RHS)) {
14466 OneNaN = true;
14467 RHS = LHS;
14468 } else if (DAG.isKnownNeverNaN(LHS)) {
14469 OneNaN = true;
14470 LHS = RHS;
14471 }
14472 if (OneNaN) {
14473 CC = CC == ISD::SETUO ? ISD::SETUNE : ISD::SETOEQ;
14474 }
14475 }
14476
14477 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
14478
14479 // Make v4f16 (only) fcmp operations utilise vector instructions
14480 // v8f16 support will be a little more complicated
14481 if ((!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) ||
14482 LHS.getValueType().getVectorElementType() == MVT::bf16) {
14483 if (LHS.getValueType().getVectorNumElements() == 4) {
14484 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
14485 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
14486 SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
14487 DAG.ReplaceAllUsesWith(Op, NewSetcc);
14488 CmpVT = MVT::v4i32;
14489 } else
14490 return SDValue();
14491 }
14492
14493 assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
14494 LHS.getValueType().getVectorElementType() != MVT::bf16 ||
14495 LHS.getValueType().getVectorElementType() != MVT::f128);
14496
14497 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
14498 // clean. Some of them require two branches to implement.
14499 AArch64CC::CondCode CC1, CC2;
14500 bool ShouldInvert;
14501 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
14502
14503 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
14504 SDValue Cmp =
14505 EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
14506 if (!Cmp.getNode())
14507 return SDValue();
14508
14509 if (CC2 != AArch64CC::AL) {
14510 SDValue Cmp2 =
14511 EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
14512 if (!Cmp2.getNode())
14513 return SDValue();
14514
14515 Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
14516 }
14517
14518 Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
14519
14520 if (ShouldInvert)
14521 Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
14522
14523 return Cmp;
14524}
14525
14526static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
14527 SelectionDAG &DAG) {
14528 SDValue VecOp = ScalarOp.getOperand(0);
14529 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
14530 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
14531 DAG.getConstant(0, DL, MVT::i64));
14532}
14533
14534static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
14535 SDLoc DL, SelectionDAG &DAG) {
14536 unsigned ScalarOpcode;
14537 switch (Opcode) {
14538 case ISD::VECREDUCE_AND:
14539 ScalarOpcode = ISD::AND;
14540 break;
14541 case ISD::VECREDUCE_OR:
14542 ScalarOpcode = ISD::OR;
14543 break;
14544 case ISD::VECREDUCE_XOR:
14545 ScalarOpcode = ISD::XOR;
14546 break;
14547 default:
14548 llvm_unreachable("Expected bitwise vector reduction");
14549 return SDValue();
14550 }
14551
14552 EVT VecVT = Vec.getValueType();
14553 assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
14554 "Expected power-of-2 length vector");
14555
14556 EVT ElemVT = VecVT.getVectorElementType();
14557
14558 SDValue Result;
14559 unsigned NumElems = VecVT.getVectorNumElements();
14560
14561 // Special case for boolean reductions
14562 if (ElemVT == MVT::i1) {
14563 // Split large vectors into smaller ones
14564 if (NumElems > 16) {
14565 SDValue Lo, Hi;
14566 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
14567 EVT HalfVT = Lo.getValueType();
14568 SDValue HalfVec = DAG.getNode(ScalarOpcode, DL, HalfVT, Lo, Hi);
14569 return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
14570 }
14571
14572 // Vectors that are less than 64 bits get widened to neatly fit a 64 bit
14573 // register, so e.g. <4 x i1> gets lowered to <4 x i16>. Sign extending to
14574 // this element size leads to the best codegen, since e.g. setcc results
14575 // might need to be truncated otherwise.
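// For example (informal): a v16i1 AND reduction is sign-extended to v16i8
// (true -> 0xff) and lowered as UMINV, so any false lane pulls the minimum
// down to zero; OR uses UMAXV, and XOR adds the lanes and keeps the low bit.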
14576 EVT ExtendedVT = MVT::getIntegerVT(std::max(64u / NumElems, 8u));
14577
14578 // any_ext doesn't work with umin/umax, so only use it for uadd.
14579 unsigned ExtendOp =
14580 ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
14581 SDValue Extended = DAG.getNode(
14582 ExtendOp, DL, VecVT.changeVectorElementType(ExtendedVT), Vec);
14583 switch (ScalarOpcode) {
14584 case ISD::AND:
14585 Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
14586 break;
14587 case ISD::OR:
14588 Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
14589 break;
14590 case ISD::XOR:
14591 Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, ExtendedVT, Extended);
14592 break;
14593 default:
14594 llvm_unreachable("Unexpected Opcode");
14595 }
14596
14597 Result = DAG.getAnyExtOrTrunc(Result, DL, MVT::i1);
14598 } else {
14599 // Iteratively split the vector in half and combine using the bitwise
14600 // operation until it fits in a 64 bit register.
14601 while (VecVT.getSizeInBits() > 64) {
14602 SDValue Lo, Hi;
14603 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
14604 VecVT = Lo.getValueType();
14605 NumElems = VecVT.getVectorNumElements();
14606 Vec = DAG.getNode(ScalarOpcode, DL, VecVT, Lo, Hi);
14607 }
14608
14609 EVT ScalarVT = EVT::getIntegerVT(*DAG.getContext(), VecVT.getSizeInBits());
14610
14611 // Do the remaining work on a scalar since it allows the code generator to
14612 // combine the shift and bitwise operation into one instruction and since
14613 // integer instructions can have higher throughput than vector instructions.
14614 SDValue Scalar = DAG.getBitcast(ScalarVT, Vec);
14615
14616 // Iteratively combine the lower and upper halves of the scalar using the
14617 // bitwise operation, halving the relevant region of the scalar in each
14618 // iteration, until the relevant region is just one element of the original
14619 // vector.
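// E.g. (illustration only) a v8i8 xor-reduction becomes an i64 bitcast that
// is folded with shifts of 32, 16 and then 8 bits, leaving the result in the
// lowest byte.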
14620 for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) {
14621 SDValue ShiftAmount =
14622 DAG.getConstant(Shift * ElemVT.getSizeInBits(), DL, MVT::i64);
14623 SDValue Shifted =
14624 DAG.getNode(ISD::SRL, DL, ScalarVT, Scalar, ShiftAmount);
14625 Scalar = DAG.getNode(ScalarOpcode, DL, ScalarVT, Scalar, Shifted);
14626 }
14627
14628 Result = DAG.getAnyExtOrTrunc(Scalar, DL, ElemVT);
14629 }
14630
14631 return DAG.getAnyExtOrTrunc(Result, DL, VT);
14632}
14633
14634SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
14635 SelectionDAG &DAG) const {
14636 SDValue Src = Op.getOperand(0);
14637
14638 // Try to lower fixed length reductions to SVE.
14639 EVT SrcVT = Src.getValueType();
14640 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
14641 Op.getOpcode() == ISD::VECREDUCE_AND ||
14642 Op.getOpcode() == ISD::VECREDUCE_OR ||
14643 Op.getOpcode() == ISD::VECREDUCE_XOR ||
14644 Op.getOpcode() == ISD::VECREDUCE_FADD ||
14645 (Op.getOpcode() != ISD::VECREDUCE_ADD &&
14646 SrcVT.getVectorElementType() == MVT::i64);
14647 if (SrcVT.isScalableVector() ||
14648 useSVEForFixedLengthVectorVT(
14649 SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
14650
14651 if (SrcVT.getVectorElementType() == MVT::i1)
14652 return LowerPredReductionToSVE(Op, DAG);
14653
14654 switch (Op.getOpcode()) {
14655 case ISD::VECREDUCE_ADD:
14656 return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
14657 case ISD::VECREDUCE_AND:
14658 return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
14659 case ISD::VECREDUCE_OR:
14660 return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
14661 case ISD::VECREDUCE_SMAX:
14662 return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
14663 case ISD::VECREDUCE_SMIN:
14664 return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
14665 case ISD::VECREDUCE_UMAX:
14666 return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
14667 case ISD::VECREDUCE_UMIN:
14668 return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
14669 case ISD::VECREDUCE_XOR:
14670 return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
14671 case ISD::VECREDUCE_FADD:
14672 return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
14673 case ISD::VECREDUCE_FMAX:
14674 return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
14675 case ISD::VECREDUCE_FMIN:
14676 return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
14677 case ISD::VECREDUCE_FMAXIMUM:
14678 return LowerReductionToSVE(AArch64ISD::FMAXV_PRED, Op, DAG);
14679 case ISD::VECREDUCE_FMINIMUM:
14680 return LowerReductionToSVE(AArch64ISD::FMINV_PRED, Op, DAG);
14681 default:
14682 llvm_unreachable("Unhandled fixed length reduction");
14683 }
14684 }
14685
14686 // Lower NEON reductions.
14687 SDLoc dl(Op);
14688 switch (Op.getOpcode()) {
14689 case ISD::VECREDUCE_AND:
14690 case ISD::VECREDUCE_OR:
14691 case ISD::VECREDUCE_XOR:
14692 return getVectorBitwiseReduce(Op.getOpcode(), Op.getOperand(0),
14693 Op.getValueType(), dl, DAG);
14694 case ISD::VECREDUCE_ADD:
14695 return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
14696 case ISD::VECREDUCE_SMAX:
14697 return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
14698 case ISD::VECREDUCE_SMIN:
14699 return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
14700 case ISD::VECREDUCE_UMAX:
14701 return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
14702 case ISD::VECREDUCE_UMIN:
14703 return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
14704 default:
14705 llvm_unreachable("Unhandled reduction");
14706 }
14707}
14708
14709SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
14710 SelectionDAG &DAG) const {
14711 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
14712 // No point replacing if we don't have the relevant instruction/libcall anyway
14713 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
14714 return SDValue();
14715
14716 // LSE has an atomic load-clear instruction, but not a load-and.
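// In other words (sketch): LDCLR computes Mem & ~Rs, so an atomic "and" with
// mask m is rewritten below as an ATOMIC_LOAD_CLR with ~m (the XOR against
// all-ones).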
14717 SDLoc dl(Op);
14718 MVT VT = Op.getSimpleValueType();
14719 assert(VT != MVT::i128 && "Handled elsewhere, code replicated.");
14720 SDValue RHS = Op.getOperand(2);
14721 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
14722 RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS);
14723 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(),
14724 Op.getOperand(0), Op.getOperand(1), RHS,
14725 AN->getMemOperand());
14726}
14727
14728SDValue
14729AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
14730 SelectionDAG &DAG) const {
14731
14732 SDLoc dl(Op);
14733 // Get the inputs.
14734 SDNode *Node = Op.getNode();
14735 SDValue Chain = Op.getOperand(0);
14736 SDValue Size = Op.getOperand(1);
14737 MaybeAlign Align =
14738 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
14739 EVT VT = Node->getValueType(0);
14740
14742 "no-stack-arg-probe")) {
14743 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
14744 Chain = SP.getValue(1);
14745 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
14746 if (Align)
14747 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
14748 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
14749 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
14750 SDValue Ops[2] = {SP, Chain};
14751 return DAG.getMergeValues(Ops, dl);
14752 }
14753
14754 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
14755
14756 EVT PtrVT = getPointerTy(DAG.getDataLayout());
14757 SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(),
14758 PtrVT, 0);
14759
14760 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
14761 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
14762 if (Subtarget->hasCustomCallingConv())
14763 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
14764
14765 Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
14766 DAG.getConstant(4, dl, MVT::i64));
14767 Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());
14768 Chain =
14769 DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue),
14770 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
14771 DAG.getRegisterMask(Mask), Chain.getValue(1));
14772 // To match the actual intent better, we should read the output from X15 here
14773 // again (instead of potentially spilling it to the stack), but rereading Size
14774 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
14775 // here.
14776
14777 Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
14778 DAG.getConstant(4, dl, MVT::i64));
14779
14780 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
14781 Chain = SP.getValue(1);
14782 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
14783 if (Align)
14784 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
14785 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
14786 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
14787
14788 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
14789
14790 SDValue Ops[2] = {SP, Chain};
14791 return DAG.getMergeValues(Ops, dl);
14792}
14793
14794SDValue
14795AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
14796 SelectionDAG &DAG) const {
14797 // Get the inputs.
14798 SDNode *Node = Op.getNode();
14799 SDValue Chain = Op.getOperand(0);
14800 SDValue Size = Op.getOperand(1);
14801
14802 MaybeAlign Align =
14803 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
14804 SDLoc dl(Op);
14805 EVT VT = Node->getValueType(0);
14806
14807 // Construct the new SP value in a GPR.
14808 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
14809 Chain = SP.getValue(1);
14810 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
14811 if (Align)
14812 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
14813 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
14814
14815 // Set the real SP to the new value with a probing loop.
14816 Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP);
14817 SDValue Ops[2] = {SP, Chain};
14818 return DAG.getMergeValues(Ops, dl);
14819}
14820
14821SDValue
14822AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
14823 SelectionDAG &DAG) const {
14824 MachineFunction &MF = DAG.getMachineFunction();
14825
14826 if (Subtarget->isTargetWindows())
14827 return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
14828 else if (hasInlineStackProbe(MF))
14829 return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
14830 else
14831 return SDValue();
14832}
14833
14834// When x and y are extended, lower:
14835// avgfloor(x, y) -> (x + y) >> 1
14836// avgceil(x, y) -> (x + y + 1) >> 1
14837
14838// Otherwise, lower to:
14839// avgfloor(x, y) -> (x >> 1) + (y >> 1) + (x & y & 1)
14840// avgceil(x, y)  -> (x >> 1) + (y >> 1) + ((x | y) & 1)
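// Worked example with arbitrary unsigned i8 values x = 7, y = 4:
//   avgfloor: (7 >> 1) + (4 >> 1) + (7 & 4 & 1)   = 3 + 2 + 0 = 5 = floor(11/2)
//   avgceil:  (7 >> 1) + (4 >> 1) + ((7 | 4) & 1) = 3 + 2 + 1 = 6 = ceil(11/2)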
14841SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
14842 unsigned NewOp) const {
14843 if (Subtarget->hasSVE2())
14844 return LowerToPredicatedOp(Op, DAG, NewOp);
14845
14846 SDLoc dl(Op);
14847 SDValue OpA = Op->getOperand(0);
14848 SDValue OpB = Op->getOperand(1);
14849 EVT VT = Op.getValueType();
14850 bool IsCeil =
14851 (Op->getOpcode() == ISD::AVGCEILS || Op->getOpcode() == ISD::AVGCEILU);
14852 bool IsSigned =
14853 (Op->getOpcode() == ISD::AVGFLOORS || Op->getOpcode() == ISD::AVGCEILS);
14854 unsigned ShiftOpc = IsSigned ? ISD::SRA : ISD::SRL;
14855
14856 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
14857
14858 auto IsZeroExtended = [&DAG](SDValue &Node) {
14859 KnownBits Known = DAG.computeKnownBits(Node, 0);
14860 return Known.Zero.isSignBitSet();
14861 };
14862
14863 auto IsSignExtended = [&DAG](SDValue &Node) {
14864 return (DAG.ComputeNumSignBits(Node, 0) > 1);
14865 };
14866
14867 SDValue ConstantOne = DAG.getConstant(1, dl, VT);
14868 if ((!IsSigned && IsZeroExtended(OpA) && IsZeroExtended(OpB)) ||
14869 (IsSigned && IsSignExtended(OpA) && IsSignExtended(OpB))) {
14870 SDValue Add = DAG.getNode(ISD::ADD, dl, VT, OpA, OpB);
14871 if (IsCeil)
14872 Add = DAG.getNode(ISD::ADD, dl, VT, Add, ConstantOne);
14873 return DAG.getNode(ShiftOpc, dl, VT, Add, ConstantOne);
14874 }
14875
14876 SDValue ShiftOpA = DAG.getNode(ShiftOpc, dl, VT, OpA, ConstantOne);
14877 SDValue ShiftOpB = DAG.getNode(ShiftOpc, dl, VT, OpB, ConstantOne);
14878
14879 SDValue tmp = DAG.getNode(IsCeil ? ISD::OR : ISD::AND, dl, VT, OpA, OpB);
14880 tmp = DAG.getNode(ISD::AND, dl, VT, tmp, ConstantOne);
14881 SDValue Add = DAG.getNode(ISD::ADD, dl, VT, ShiftOpA, ShiftOpB);
14882 return DAG.getNode(ISD::ADD, dl, VT, Add, tmp);
14883}
14884
14885SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
14886 SelectionDAG &DAG) const {
14887 EVT VT = Op.getValueType();
14888 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
14889
14890 SDLoc DL(Op);
14891 APInt MulImm = Op.getConstantOperandAPInt(0);
14892 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
14893 VT);
14894}
14895
14896/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
14897template <unsigned NumVecs>
14898static bool
14899setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
14900 AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) {
14901 Info.opc = ISD::INTRINSIC_VOID;
14902 // Retrieve EC from first vector argument.
14903 const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
14904 ElementCount EC = VT.getVectorElementCount();
14905#ifndef NDEBUG
14906 // Check the assumption that all input vectors are the same type.
14907 for (unsigned I = 0; I < NumVecs; ++I)
14908 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
14909 "Invalid type.");
14910#endif
14911 // memVT is `NumVecs * VT`.
14912 Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
14913 EC * NumVecs);
14914 Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1);
14915 Info.offset = 0;
14916 Info.align.reset();
14917 Info.flags = MachineMemOperand::MOStore;
14918 return true;
14919}
14920
14921/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
14922/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
14923/// specified in the intrinsic calls.
14924bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
14925 const CallInst &I,
14926 MachineFunction &MF,
14927 unsigned Intrinsic) const {
14928 auto &DL = I.getModule()->getDataLayout();
14929 switch (Intrinsic) {
14930 case Intrinsic::aarch64_sve_st2:
14931 return setInfoSVEStN<2>(*this, DL, Info, I);
14932 case Intrinsic::aarch64_sve_st3:
14933 return setInfoSVEStN<3>(*this, DL, Info, I);
14934 case Intrinsic::aarch64_sve_st4:
14935 return setInfoSVEStN<4>(*this, DL, Info, I);
14936 case Intrinsic::aarch64_neon_ld2:
14937 case Intrinsic::aarch64_neon_ld3:
14938 case Intrinsic::aarch64_neon_ld4:
14939 case Intrinsic::aarch64_neon_ld1x2:
14940 case Intrinsic::aarch64_neon_ld1x3:
14941 case Intrinsic::aarch64_neon_ld1x4: {
14942 Info.opc = ISD::INTRINSIC_W_CHAIN;
14943 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
14944 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
14945 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
14946 Info.offset = 0;
14947 Info.align.reset();
14948 // volatile loads with NEON intrinsics not supported
14949 Info.flags = MachineMemOperand::MOLoad;
14950 return true;
14951 }
14952 case Intrinsic::aarch64_neon_ld2lane:
14953 case Intrinsic::aarch64_neon_ld3lane:
14954 case Intrinsic::aarch64_neon_ld4lane:
14955 case Intrinsic::aarch64_neon_ld2r:
14956 case Intrinsic::aarch64_neon_ld3r:
14957 case Intrinsic::aarch64_neon_ld4r: {
14958 Info.opc = ISD::INTRINSIC_W_CHAIN;
14959 // The ldN intrinsics return a struct whose members all have the same vec type.
14960 Type *RetTy = I.getType();
14961 auto *StructTy = cast<StructType>(RetTy);
14962 unsigned NumElts = StructTy->getNumElements();
14963 Type *VecTy = StructTy->getElementType(0);
14964 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
14965 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
14966 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
14967 Info.offset = 0;
14968 Info.align.reset();
14969 // volatile loads with NEON intrinsics not supported
14970 Info.flags = MachineMemOperand::MOLoad;
14971 return true;
14972 }
14973 case Intrinsic::aarch64_neon_st2:
14974 case Intrinsic::aarch64_neon_st3:
14975 case Intrinsic::aarch64_neon_st4:
14976 case Intrinsic::aarch64_neon_st1x2:
14977 case Intrinsic::aarch64_neon_st1x3:
14978 case Intrinsic::aarch64_neon_st1x4: {
14979 Info.opc = ISD::INTRINSIC_VOID;
14980 unsigned NumElts = 0;
14981 for (const Value *Arg : I.args()) {
14982 Type *ArgTy = Arg->getType();
14983 if (!ArgTy->isVectorTy())
14984 break;
14985 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
14986 }
14987 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
14988 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
14989 Info.offset = 0;
14990 Info.align.reset();
14991 // volatile stores with NEON intrinsics not supported
14992 Info.flags = MachineMemOperand::MOStore;
14993 return true;
14994 }
14995 case Intrinsic::aarch64_neon_st2lane:
14996 case Intrinsic::aarch64_neon_st3lane:
14997 case Intrinsic::aarch64_neon_st4lane: {
14998 Info.opc = ISD::INTRINSIC_VOID;
14999 unsigned NumElts = 0;
15000 // All the vector arguments have the same type.
15001 Type *VecTy = I.getArgOperand(0)->getType();
15002 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
15003
15004 for (const Value *Arg : I.args()) {
15005 Type *ArgTy = Arg->getType();
15006 if (!ArgTy->isVectorTy())
15007 break;
15008 NumElts += 1;
15009 }
15010
15011 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
15012 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
15013 Info.offset = 0;
15014 Info.align.reset();
15015 // volatile stores with NEON intrinsics not supported
15016 Info.flags = MachineMemOperand::MOStore;
15017 return true;
15018 }
15019 case Intrinsic::aarch64_ldaxr:
15020 case Intrinsic::aarch64_ldxr: {
15021 Type *ValTy = I.getParamElementType(0);
15022 Info.opc = ISD::INTRINSIC_W_CHAIN;
15023 Info.memVT = MVT::getVT(ValTy);
15024 Info.ptrVal = I.getArgOperand(0);
15025 Info.offset = 0;
15026 Info.align = DL.getABITypeAlign(ValTy);
15027 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
15028 return true;
15029 }
15030 case Intrinsic::aarch64_stlxr:
15031 case Intrinsic::aarch64_stxr: {
15032 Type *ValTy = I.getParamElementType(1);
15033 Info.opc = ISD::INTRINSIC_W_CHAIN;
15034 Info.memVT = MVT::getVT(ValTy);
15035 Info.ptrVal = I.getArgOperand(1);
15036 Info.offset = 0;
15037 Info.align = DL.getABITypeAlign(ValTy);
15038 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
15039 return true;
15040 }
15041 case Intrinsic::aarch64_ldaxp:
15042 case Intrinsic::aarch64_ldxp:
15043 Info.opc = ISD::INTRINSIC_W_CHAIN;
15044 Info.memVT = MVT::i128;
15045 Info.ptrVal = I.getArgOperand(0);
15046 Info.offset = 0;
15047 Info.align = Align(16);
15048 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
15049 return true;
15050 case Intrinsic::aarch64_stlxp:
15051 case Intrinsic::aarch64_stxp:
15052 Info.opc = ISD::INTRINSIC_W_CHAIN;
15053 Info.memVT = MVT::i128;
15054 Info.ptrVal = I.getArgOperand(2);
15055 Info.offset = 0;
15056 Info.align = Align(16);
15057 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
15058 return true;
15059 case Intrinsic::aarch64_sve_ldnt1: {
15060 Type *ElTy = cast<VectorType>(I.getType())->getElementType();
15061 Info.opc = ISD::INTRINSIC_W_CHAIN;
15062 Info.memVT = MVT::getVT(I.getType());
15063 Info.ptrVal = I.getArgOperand(1);
15064 Info.offset = 0;
15065 Info.align = DL.getABITypeAlign(ElTy);
15066 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
15067 return true;
15068 }
15069 case Intrinsic::aarch64_sve_stnt1: {
15070 Type *ElTy =
15071 cast<VectorType>(I.getArgOperand(0)->getType())->getElementType();
15072 Info.opc = ISD::INTRINSIC_W_CHAIN;
15073 Info.memVT = MVT::getVT(I.getOperand(0)->getType());
15074 Info.ptrVal = I.getArgOperand(2);
15075 Info.offset = 0;
15076 Info.align = DL.getABITypeAlign(ElTy);
15077 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
15078 return true;
15079 }
15080 case Intrinsic::aarch64_mops_memset_tag: {
15081 Value *Dst = I.getArgOperand(0);
15082 Value *Val = I.getArgOperand(1);
15083 Info.opc = ISD::INTRINSIC_W_CHAIN;
15084 Info.memVT = MVT::getVT(Val->getType());
15085 Info.ptrVal = Dst;
15086 Info.offset = 0;
15087 Info.align = I.getParamAlign(0).valueOrOne();
15088 Info.flags = MachineMemOperand::MOStore;
15089 // The size of the memory being operated on is unknown at this point
15090 Info.size = MemoryLocation::UnknownSize;
15091 return true;
15092 }
15093 default:
15094 break;
15095 }
15096
15097 return false;
15098}
15099
15100bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
15101 ISD::LoadExtType ExtTy,
15102 EVT NewVT) const {
15103 // TODO: This may be worth removing. Check regression tests for diffs.
15104 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
15105 return false;
15106
15107 // If we're reducing the load width in order to avoid having to use an extra
15108 // instruction to do extension then it's probably a good idea.
15109 if (ExtTy != ISD::NON_EXTLOAD)
15110 return true;
15111 // Don't reduce load width if it would prevent us from combining a shift into
15112 // the offset.
15113 MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
15114 assert(Mem);
15115 const SDValue &Base = Mem->getBasePtr();
15116 if (Base.getOpcode() == ISD::ADD &&
15117 Base.getOperand(1).getOpcode() == ISD::SHL &&
15118 Base.getOperand(1).hasOneUse() &&
15119 Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
15120 // It's unknown whether a scalable vector has a power-of-2 bitwidth.
15121 if (Mem->getMemoryVT().isScalableVector())
15122 return false;
15123 // The shift can be combined if it matches the size of the value being
15124 // loaded (and so reducing the width would make it not match).
15125 uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
15126 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
15127 if (ShiftAmount == Log2_32(LoadBytes))
15128 return false;
15129 }
15130 // We have no reason to disallow reducing the load width, so allow it.
15131 return true;
15132}
15133
15134// Treat a sext_inreg(extract(..)) as free if it has multiple uses.
15135bool AArch64TargetLowering::shouldRemoveRedundantExtend(SDValue Extend) const {
15136 EVT VT = Extend.getValueType();
15137 if ((VT == MVT::i64 || VT == MVT::i32) && Extend->use_size()) {
15138 SDValue Extract = Extend.getOperand(0);
15139 if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse())
15140 Extract = Extract.getOperand(0);
15141 if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) {
15142 EVT VecVT = Extract.getOperand(0).getValueType();
15143 if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16)
15144 return false;
15145 }
15146 }
15147 return true;
15148}
15149
15150// Truncation from a 64-bit GPR to a 32-bit GPR is free.
15151bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
15152 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
15153 return false;
15154 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue();
15155 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue();
15156 return NumBits1 > NumBits2;
15157}
15158bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
15159 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
15160 return false;
15161 uint64_t NumBits1 = VT1.getFixedSizeInBits();
15162 uint64_t NumBits2 = VT2.getFixedSizeInBits();
15163 return NumBits1 > NumBits2;
15164}
15165
15166/// Check if it is profitable to hoist instruction in then/else to if.
15167/// Not profitable if I and its user can form an FMA instruction
15168/// because we prefer FMSUB/FMADD.
15169bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
15170 if (I->getOpcode() != Instruction::FMul)
15171 return true;
15172
15173 if (!I->hasOneUse())
15174 return true;
15175
15176 Instruction *User = I->user_back();
15177
15178 if (!(User->getOpcode() == Instruction::FSub ||
15179 User->getOpcode() == Instruction::FAdd))
15180 return true;
15181
15182 const TargetOptions &Options = getTargetMachine().Options;
15183 const Function *F = I->getFunction();
15184 const DataLayout &DL = F->getParent()->getDataLayout();
15185 Type *Ty = User->getOperand(0)->getType();
15186
15187 return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
15188 isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
15189 (Options.AllowFPOpFusion == FPOpFusion::Fast ||
15190 Options.UnsafeFPMath));
15191}
15192
15193// All 32-bit GPR operations implicitly zero the high-half of the corresponding
15194// 64-bit GPR.
15195bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
15196 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
15197 return false;
15198 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
15199 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
15200 return NumBits1 == 32 && NumBits2 == 64;
15201}
15202bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
15203 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
15204 return false;
15205 unsigned NumBits1 = VT1.getSizeInBits();
15206 unsigned NumBits2 = VT2.getSizeInBits();
15207 return NumBits1 == 32 && NumBits2 == 64;
15208}
15209
15210bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
15211 EVT VT1 = Val.getValueType();
15212 if (isZExtFree(VT1, VT2)) {
15213 return true;
15214 }
15215
15216 if (Val.getOpcode() != ISD::LOAD)
15217 return false;
15218
15219 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
15220 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
15221 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
15222 VT1.getSizeInBits() <= 32);
15223}
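// Illustrative sketch (not part of the original file): the two sources of "free"
// zero extension captured by the overloads above, reduced to bit widths. The
// helper name is hypothetical and IsLoad stands in for Val being an ISD::LOAD.
static bool zextIsFoldedAway(unsigned SrcBits, unsigned DstBits, bool IsLoad) {
  if (SrcBits == 32 && DstBits == 64)
    return true;                  // writes to a W register clear bits 63:32
  return IsLoad && SrcBits <= 32; // ldrb/ldrh/ldr already zero-extend
}
// zext i32 -> i64 of any value : free
// zext i8  -> i64 of a load    : free (the load zero-extends)
// zext i16 -> i32 of an add    : not free (needs an explicit uxth/and)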
15224
15225bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
15226 if (isa<FPExtInst>(Ext))
15227 return false;
15228
15229 // Vector types are not free.
15230 if (Ext->getType()->isVectorTy())
15231 return false;
15232
15233 for (const Use &U : Ext->uses()) {
15234 // The extension is free if we can fold it with a left shift in an
15235 // addressing mode or an arithmetic operation: add, sub, and cmp.
15236
15237 // Is there a shift?
15238 const Instruction *Instr = cast<Instruction>(U.getUser());
15239
15240 // Is this a constant shift?
15241 switch (Instr->getOpcode()) {
15242 case Instruction::Shl:
15243 if (!isa<ConstantInt>(Instr->getOperand(1)))
15244 return false;
15245 break;
15246 case Instruction::GetElementPtr: {
15247 gep_type_iterator GTI = gep_type_begin(Instr);
15248 auto &DL = Ext->getModule()->getDataLayout();
15249 std::advance(GTI, U.getOperandNo()-1);
15250 Type *IdxTy = GTI.getIndexedType();
15251 // This extension will end up with a shift because of the scaling factor.
15252 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
15253 // Get the shift amount based on the scaling factor:
15254 // log2(sizeof(IdxTy)) - log2(8).
15255 if (IdxTy->isScalableTy())
15256 return false;
15257 uint64_t ShiftAmt =
15258 llvm::countr_zero(DL.getTypeStoreSizeInBits(IdxTy).getFixedValue()) -
15259 3;
15260 // Is the constant foldable in the shift of the addressing mode?
15261 // I.e., shift amount is between 1 and 4 inclusive.
15262 if (ShiftAmt == 0 || ShiftAmt > 4)
15263 return false;
15264 break;
15265 }
15266 case Instruction::Trunc:
15267 // Check if this is a noop.
15268 // trunc(sext ty1 to ty2) to ty1.
15269 if (Instr->getType() == Ext->getOperand(0)->getType())
15270 continue;
15271 [[fallthrough]];
15272 default:
15273 return false;
15274 }
15275
15276 // At this point we can use the bfm family, so this extension is free
15277 // for that use.
15278 }
15279 return true;
15280}
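// Illustrative sketch (not part of the original file): the GEP scale-to-shift
// arithmetic used above. A scaled-register load/store can absorb a left shift of
// 1..4, i.e. index element sizes of 2..16 bytes; the helper name is hypothetical.
static uint64_t gepIndexShiftAmount(uint64_t IdxStoreSizeInBits) {
  // log2(sizeof(IdxTy)) - log2(8), e.g. an i32 index type gives 5 - 3 == 2.
  return llvm::countr_zero(IdxStoreSizeInBits) - 3;
}
// gepIndexShiftAmount(32)  -> 2  (folds as, e.g., ldr w0, [x0, w1, sxtw #2])
// gepIndexShiftAmount(8)   -> 0  (no shift needed; rejected by the check above)
// gepIndexShiftAmount(256) -> 5  (too large for the addressing mode)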
15281
15282static bool isSplatShuffle(Value *V) {
15283 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
15284 return all_equal(Shuf->getShuffleMask());
15285 return false;
15286}
15287
15288/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
15289/// or upper half of the vector elements.
15290static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
15291 bool AllowSplat = false) {
15292 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
15293 auto *FullTy = FullV->getType();
15294 auto *HalfTy = HalfV->getType();
15295 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
15296 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
15297 };
15298
15299 auto extractHalf = [](Value *FullV, Value *HalfV) {
15300 auto *FullVT = cast<FixedVectorType>(FullV->getType());
15301 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
15302 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
15303 };
15304
15305 ArrayRef<int> M1, M2;
15306 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
15307 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
15308 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
15309 return false;
15310
15311 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
15312 // it is not checked as an extract below.
15313 if (AllowSplat && isSplatShuffle(Op1))
15314 S1Op1 = nullptr;
15315 if (AllowSplat && isSplatShuffle(Op2))
15316 S2Op1 = nullptr;
15317
15318 // Check that the operands are half as wide as the result and we extract
15319 // half of the elements of the input vectors.
15320 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
15321 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
15322 return false;
15323
15324 // Check the mask extracts either the lower or upper half of vector
15325 // elements.
15326 int M1Start = 0;
15327 int M2Start = 0;
15328 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
15329 if ((S1Op1 &&
15330 !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
15331 (S2Op1 &&
15332 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
15333 return false;
15334
15335 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
15336 (M2Start != 0 && M2Start != (NumElements / 2)))
15337 return false;
15338 if (S1Op1 && S2Op1 && M1Start != M2Start)
15339 return false;
15340
15341 return true;
15342}
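// Illustrative sketch (not part of the original file): which extract masks the
// function above accepts, using a v8i16 source (NumElements = 8) as an example.
// The accepted starts correspond to the halves read by the low and high ("2")
// forms of instructions such as smull/smull2. The helper name is hypothetical.
static bool isLowOrHighHalfStart(int Start, int NumElements) {
  return Start == 0 || Start == NumElements / 2;
}
// mask <0,1,2,3> of a v8i16 -> lower half, accepted (Start == 0)
// mask <4,5,6,7> of a v8i16 -> upper half, accepted (Start == 4)
// mask <2,3,4,5> of a v8i16 -> rejected, not a half extract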
15343
15344/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
15345/// of the vector elements.
15346static bool areExtractExts(Value *Ext1, Value *Ext2) {
15347 auto areExtDoubled = [](Instruction *Ext) {
15348 return Ext->getType()->getScalarSizeInBits() ==
15349 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
15350 };
15351
15352 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
15353 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
15354 !areExtDoubled(cast<Instruction>(Ext1)) ||
15355 !areExtDoubled(cast<Instruction>(Ext2)))
15356 return false;
15357
15358 return true;
15359}
15360
15361/// Check if Op could be used with vmull_high_p64 intrinsic.
15362static bool isOperandOfVmullHighP64(Value *Op) {
15363 Value *VectorOperand = nullptr;
15364 ConstantInt *ElementIndex = nullptr;
15365 return match(Op, m_ExtractElt(m_Value(VectorOperand),
15366 m_ConstantInt(ElementIndex))) &&
15367 ElementIndex->getValue() == 1 &&
15368 isa<FixedVectorType>(VectorOperand->getType()) &&
15369 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
15370}
15371
15372/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
15373static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
15374 return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
15375}
15376
15377static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
15378 // Restrict ourselves to the form CodeGenPrepare typically constructs.
15379 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
15380 if (!GEP || GEP->getNumOperands() != 2)
15381 return false;
15382
15383 Value *Base = GEP->getOperand(0);
15384 Value *Offsets = GEP->getOperand(1);
15385
15386 // We only care about scalar_base+vector_offsets.
15387 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
15388 return false;
15389
15390 // Sink extends that would allow us to use 32-bit offset vectors.
15391 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
15392 auto *OffsetsInst = cast<Instruction>(Offsets);
15393 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
15394 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
15395 Ops.push_back(&GEP->getOperandUse(1));
15396 }
15397
15398 // Sink the GEP.
15399 return true;
15400}
15401
15402/// We want to sink the following cases:
15403/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale
15404static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
15405 if (match(Op, m_VScale()))
15406 return true;
15407 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
15408 match(Op, m_Mul(m_VScale(), m_ConstantInt()))) {
15409 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
15410 return true;
15411 }
15412 return false;
15413}
15414
15415/// Check if sinking \p I's operands to I's basic block is profitable, because
15416/// the operands can be folded into a target instruction, e.g.
15417/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
15418bool AArch64TargetLowering::shouldSinkOperands(
15419 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
15420 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
15421 switch (II->getIntrinsicID()) {
15422 case Intrinsic::aarch64_neon_smull:
15423 case Intrinsic::aarch64_neon_umull:
15424 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
15425 /*AllowSplat=*/true)) {
15426 Ops.push_back(&II->getOperandUse(0));
15427 Ops.push_back(&II->getOperandUse(1));
15428 return true;
15429 }
15430 [[fallthrough]];
15431
15432 case Intrinsic::fma:
15433 if (isa<VectorType>(I->getType()) &&
15434 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
15435 !Subtarget->hasFullFP16())
15436 return false;
15437 [[fallthrough]];
15438 case Intrinsic::aarch64_neon_sqdmull:
15439 case Intrinsic::aarch64_neon_sqdmulh:
15440 case Intrinsic::aarch64_neon_sqrdmulh:
15441 // Sink splats for index lane variants
15442 if (isSplatShuffle(II->getOperand(0)))
15443 Ops.push_back(&II->getOperandUse(0));
15444 if (isSplatShuffle(II->getOperand(1)))
15445 Ops.push_back(&II->getOperandUse(1));
15446 return !Ops.empty();
15447 case Intrinsic::aarch64_neon_fmlal:
15448 case Intrinsic::aarch64_neon_fmlal2:
15449 case Intrinsic::aarch64_neon_fmlsl:
15450 case Intrinsic::aarch64_neon_fmlsl2:
15451 // Sink splats for index lane variants
15452 if (isSplatShuffle(II->getOperand(1)))
15453 Ops.push_back(&II->getOperandUse(1));
15454 if (isSplatShuffle(II->getOperand(2)))
15455 Ops.push_back(&II->getOperandUse(2));
15456 return !Ops.empty();
15457 case Intrinsic::aarch64_sve_ptest_first:
15458 case Intrinsic::aarch64_sve_ptest_last:
15459 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
15460 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
15461 Ops.push_back(&II->getOperandUse(0));
15462 return !Ops.empty();
15463 case Intrinsic::aarch64_sme_write_horiz:
15464 case Intrinsic::aarch64_sme_write_vert:
15465 case Intrinsic::aarch64_sme_writeq_horiz:
15466 case Intrinsic::aarch64_sme_writeq_vert: {
15467 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
15468 if (!Idx || Idx->getOpcode() != Instruction::Add)
15469 return false;
15470 Ops.push_back(&II->getOperandUse(1));
15471 return true;
15472 }
15473 case Intrinsic::aarch64_sme_read_horiz:
15474 case Intrinsic::aarch64_sme_read_vert:
15475 case Intrinsic::aarch64_sme_readq_horiz:
15476 case Intrinsic::aarch64_sme_readq_vert:
15477 case Intrinsic::aarch64_sme_ld1b_vert:
15478 case Intrinsic::aarch64_sme_ld1h_vert:
15479 case Intrinsic::aarch64_sme_ld1w_vert:
15480 case Intrinsic::aarch64_sme_ld1d_vert:
15481 case Intrinsic::aarch64_sme_ld1q_vert:
15482 case Intrinsic::aarch64_sme_st1b_vert:
15483 case Intrinsic::aarch64_sme_st1h_vert:
15484 case Intrinsic::aarch64_sme_st1w_vert:
15485 case Intrinsic::aarch64_sme_st1d_vert:
15486 case Intrinsic::aarch64_sme_st1q_vert:
15487 case Intrinsic::aarch64_sme_ld1b_horiz:
15488 case Intrinsic::aarch64_sme_ld1h_horiz:
15489 case Intrinsic::aarch64_sme_ld1w_horiz:
15490 case Intrinsic::aarch64_sme_ld1d_horiz:
15491 case Intrinsic::aarch64_sme_ld1q_horiz:
15492 case Intrinsic::aarch64_sme_st1b_horiz:
15493 case Intrinsic::aarch64_sme_st1h_horiz:
15494 case Intrinsic::aarch64_sme_st1w_horiz:
15495 case Intrinsic::aarch64_sme_st1d_horiz:
15496 case Intrinsic::aarch64_sme_st1q_horiz: {
15497 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
15498 if (!Idx || Idx->getOpcode() != Instruction::Add)
15499 return false;
15500 Ops.push_back(&II->getOperandUse(3));
15501 return true;
15502 }
15503 case Intrinsic::aarch64_neon_pmull:
15504 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
15505 return false;
15506 Ops.push_back(&II->getOperandUse(0));
15507 Ops.push_back(&II->getOperandUse(1));
15508 return true;
15509 case Intrinsic::aarch64_neon_pmull64:
15510 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
15511 II->getArgOperand(1)))
15512 return false;
15513 Ops.push_back(&II->getArgOperandUse(0));
15514 Ops.push_back(&II->getArgOperandUse(1));
15515 return true;
15516 case Intrinsic::masked_gather:
15517 if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
15518 return false;
15519 Ops.push_back(&II->getArgOperandUse(0));
15520 return true;
15521 case Intrinsic::masked_scatter:
15522 if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
15523 return false;
15524 Ops.push_back(&II->getArgOperandUse(1));
15525 return true;
15526 default:
15527 return false;
15528 }
15529 }
15530
15531 // Sink vscales closer to uses for better isel
15532 switch (I->getOpcode()) {
15533 case Instruction::GetElementPtr:
15534 case Instruction::Add:
15535 case Instruction::Sub:
15536 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
15537 if (shouldSinkVScale(I->getOperand(Op), Ops)) {
15538 Ops.push_back(&I->getOperandUse(Op));
15539 return true;
15540 }
15541 }
15542 break;
15543 default:
15544 break;
15545 }
15546
15547 if (!I->getType()->isVectorTy())
15548 return false;
15549
15550 switch (I->getOpcode()) {
15551 case Instruction::Sub:
15552 case Instruction::Add: {
15553 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
15554 return false;
15555
15556 // If the exts' operands extract either the lower or upper elements, we
15557 // can sink them too.
15558 auto Ext1 = cast<Instruction>(I->getOperand(0));
15559 auto Ext2 = cast<Instruction>(I->getOperand(1));
15560 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
15561 Ops.push_back(&Ext1->getOperandUse(0));
15562 Ops.push_back(&Ext2->getOperandUse(0));
15563 }
15564
15565 Ops.push_back(&I->getOperandUse(0));
15566 Ops.push_back(&I->getOperandUse(1));
15567
15568 return true;
15569 }
15570 case Instruction::Or: {
15571 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
15572 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
15573 if (Subtarget->hasNEON()) {
15574 Instruction *OtherAnd, *IA, *IB;
15575 Value *MaskValue;
15576 // MainAnd refers to And instruction that has 'Not' as one of its operands
15577 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
15578 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
15579 m_Instruction(IA)))))) {
15580 if (match(OtherAnd,
15581 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
15582 Instruction *MainAnd = I->getOperand(0) == OtherAnd
15583 ? cast<Instruction>(I->getOperand(1))
15584 : cast<Instruction>(I->getOperand(0));
15585
15586 // Both Ands should be in same basic block as Or
15587 if (I->getParent() != MainAnd->getParent() ||
15588 I->getParent() != OtherAnd->getParent())
15589 return false;
15590
15591 // Non-mask operands of both Ands should also be in same basic block
15592 if (I->getParent() != IA->getParent() ||
15593 I->getParent() != IB->getParent())
15594 return false;
15595
15596 Ops.push_back(&MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
15597 Ops.push_back(&I->getOperandUse(0));
15598 Ops.push_back(&I->getOperandUse(1));
15599
15600 return true;
15601 }
15602 }
15603 }
15604
15605 return false;
15606 }
15607 case Instruction::Mul: {
15608 int NumZExts = 0, NumSExts = 0;
15609 for (auto &Op : I->operands()) {
15610 // Make sure we are not already sinking this operand
15611 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
15612 continue;
15613
15614 if (match(&Op, m_SExt(m_Value()))) {
15615 NumSExts++;
15616 continue;
15617 } else if (match(&Op, m_ZExt(m_Value()))) {
15618 NumZExts++;
15619 continue;
15620 }
15621
15622 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
15623
15624 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
15625 // operand and the s/zext can help create indexed s/umull. This is
15626 // especially useful to prevent i64 mul being scalarized.
15627 if (Shuffle && isSplatShuffle(Shuffle) &&
15628 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
15629 Ops.push_back(&Shuffle->getOperandUse(0));
15630 Ops.push_back(&Op);
15631 if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
15632 NumSExts++;
15633 else
15634 NumZExts++;
15635 continue;
15636 }
15637
15638 if (!Shuffle)
15639 continue;
15640
15641 Value *ShuffleOperand = Shuffle->getOperand(0);
15642 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
15643 if (!Insert)
15644 continue;
15645
15646 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
15647 if (!OperandInstr)
15648 continue;
15649
15650 ConstantInt *ElementConstant =
15651 dyn_cast<ConstantInt>(Insert->getOperand(2));
15652 // Check that the insertelement is inserting into element 0
15653 if (!ElementConstant || !ElementConstant->isZero())
15654 continue;
15655
15656 unsigned Opcode = OperandInstr->getOpcode();
15657 if (Opcode == Instruction::SExt)
15658 NumSExts++;
15659 else if (Opcode == Instruction::ZExt)
15660 NumZExts++;
15661 else {
15662 // If we find that the top bits are known 0, then we can sink and allow
15663 // the backend to generate a umull.
15664 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
15665 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
15666 const DataLayout &DL = I->getFunction()->getParent()->getDataLayout();
15667 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
15668 continue;
15669 NumZExts++;
15670 }
15671
15672 Ops.push_back(&Shuffle->getOperandUse(0));
15673 Ops.push_back(&Op);
15674 }
15675
15676 // It is profitable to sink only if we found two extends of the same type.
15677 return !Ops.empty() && (NumSExts == 2 || NumZExts == 2);
15678 }
15679 default:
15680 return false;
15681 }
15682 return false;
15683}
15684
15685static bool createTblShuffleForZExt(ZExtInst *ZExt, FixedVectorType *DstTy,
15686 bool IsLittleEndian) {
15687 Value *Op = ZExt->getOperand(0);
15688 auto *SrcTy = cast<FixedVectorType>(Op->getType());
15689 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
15690 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
15691 if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth >= 64)
15692 return false;
15693
15694 assert(DstWidth % SrcWidth == 0 &&
15695 "TBL lowering is not supported for a ZExt instruction with this "
15696 "source & destination element type.");
15697 unsigned ZExtFactor = DstWidth / SrcWidth;
15698 unsigned NumElts = SrcTy->getNumElements();
15699 IRBuilder<> Builder(ZExt);
15700 SmallVector<int> Mask;
15701 // Create a mask that selects <0,...,Op[i]> for each lane of the destination
15702 // vector to replace the original ZExt. This can later be lowered to a set of
15703 // tbl instructions.
15704 for (unsigned i = 0; i < NumElts * ZExtFactor; i++) {
15705 if (IsLittleEndian) {
15706 if (i % ZExtFactor == 0)
15707 Mask.push_back(i / ZExtFactor);
15708 else
15709 Mask.push_back(NumElts);
15710 } else {
15711 if ((i + 1) % ZExtFactor == 0)
15712 Mask.push_back((i - ZExtFactor + 1) / ZExtFactor);
15713 else
15714 Mask.push_back(NumElts);
15715 }
15716 }
15717
15718 auto *FirstEltZero = Builder.CreateInsertElement(
15719 PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0));
15720 Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
15721 Result = Builder.CreateBitCast(Result, DstTy);
15722 if (DstTy != ZExt->getType())
15723 Result = Builder.CreateZExt(Result, ZExt->getType());
15724 ZExt->replaceAllUsesWith(Result);
15725 ZExt->eraseFromParent();
15726 return true;
15727}
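// Illustrative sketch (not part of the original file): the little-endian mask
// built above, for the concrete case zext <8 x i8> to <8 x i32> (NumElts = 8,
// ZExtFactor = 4). Index 8 addresses lane 0 of the second shuffle operand, which
// holds the zero inserted via FirstEltZero. The helper name is hypothetical.
static std::vector<int> buildZExtTblMask(unsigned NumElts, unsigned ZExtFactor) {
  std::vector<int> Mask;
  for (unsigned i = 0; i < NumElts * ZExtFactor; ++i)
    Mask.push_back(i % ZExtFactor == 0 ? int(i / ZExtFactor) : int(NumElts));
  return Mask;
}
// buildZExtTblMask(8, 4) ==
//   {0,8,8,8, 1,8,8,8, 2,8,8,8, 3,8,8,8, 4,8,8,8, 5,8,8,8, 6,8,8,8, 7,8,8,8}
// i.e. each destination i32 lane is <src byte, 0, 0, 0> in memory order.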
15728
15729static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
15730 IRBuilder<> Builder(TI);
15731 SmallVector<Value *> Parts;
15732 int NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
15733 auto *SrcTy = cast<FixedVectorType>(TI->getOperand(0)->getType());
15734 auto *DstTy = cast<FixedVectorType>(TI->getType());
15735 assert(SrcTy->getElementType()->isIntegerTy() &&
15736 "Non-integer type source vector element is not supported");
15737 assert(DstTy->getElementType()->isIntegerTy(8) &&
15738 "Unsupported destination vector element type");
15739 unsigned SrcElemTySz =
15740 cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
15741 unsigned DstElemTySz =
15742 cast<IntegerType>(DstTy->getElementType())->getBitWidth();
15743 assert((SrcElemTySz % DstElemTySz == 0) &&
15744 "Cannot lower truncate to tbl instructions for a source element size "
15745 "that is not divisible by the destination element size");
15746 unsigned TruncFactor = SrcElemTySz / DstElemTySz;
15747 assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) &&
15748 "Unsupported source vector element type size");
15749 Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
15750
15751 // Create a mask to choose every nth byte from the source vector table of
15752 // bytes to create the truncated destination vector, where 'n' is the truncate
15753 // ratio. For example, for a truncate from Yxi64 to Yxi8, choose
15754 // bytes 0, 8, 16, ..., (Y-1)*8 for the little-endian format
15755 SmallVector<Constant *, 16> MaskConst;
15756 for (int Itr = 0; Itr < 16; Itr++) {
15757 if (Itr < NumElements)
15758 MaskConst.push_back(Builder.getInt8(
15759 IsLittleEndian ? Itr * TruncFactor
15760 : Itr * TruncFactor + (TruncFactor - 1)));
15761 else
15762 MaskConst.push_back(Builder.getInt8(255));
15763 }
15764
15765 int MaxTblSz = 128 * 4;
15766 int MaxSrcSz = SrcElemTySz * NumElements;
15767 int ElemsPerTbl =
15768 (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz);
15769 assert(ElemsPerTbl <= 16 &&
15770 "Maximum elements selected using TBL instruction cannot exceed 16!");
15771
15772 int ShuffleCount = 128 / SrcElemTySz;
15773 SmallVector<int> ShuffleLanes;
15774 for (int i = 0; i < ShuffleCount; ++i)
15775 ShuffleLanes.push_back(i);
15776
15777 // Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles
15778 // over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated,
15779 // call TBL & save the result in a vector of TBL results for combining later.
15780 SmallVector<Value *> Results;
15781 while (ShuffleLanes.back() < NumElements) {
15782 Parts.push_back(Builder.CreateBitCast(
15783 Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy));
15784
15785 if (Parts.size() == 4) {
15786 auto *F = Intrinsic::getDeclaration(TI->getModule(),
15787 Intrinsic::aarch64_neon_tbl4, VecTy);
15788 Parts.push_back(ConstantVector::get(MaskConst));
15789 Results.push_back(Builder.CreateCall(F, Parts));
15790 Parts.clear();
15791 }
15792
15793 for (int i = 0; i < ShuffleCount; ++i)
15794 ShuffleLanes[i] += ShuffleCount;
15795 }
15796
15797 assert((Parts.empty() || Results.empty()) &&
15798 "Lowering trunc for vectors requiring different TBL instructions is "
15799 "not supported!");
15800 // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD
15801 // registers
15802 if (!Parts.empty()) {
15803 Intrinsic::ID TblID;
15804 switch (Parts.size()) {
15805 case 1:
15806 TblID = Intrinsic::aarch64_neon_tbl1;
15807 break;
15808 case 2:
15809 TblID = Intrinsic::aarch64_neon_tbl2;
15810 break;
15811 case 3:
15812 TblID = Intrinsic::aarch64_neon_tbl3;
15813 break;
15814 }
15815
15816 auto *F = Intrinsic::getDeclaration(TI->getModule(), TblID, VecTy);
15817 Parts.push_back(ConstantVector::get(MaskConst));
15818 Results.push_back(Builder.CreateCall(F, Parts));
15819 }
15820
15821 // Extract the destination vector from TBL result(s) after combining them
15822 // where applicable. Currently, at most two TBLs are supported.
15823 assert(Results.size() <= 2 && "Trunc lowering does not support generation of "
15824 "more than 2 tbl instructions!");
15825 Value *FinalResult = Results[0];
15826 if (Results.size() == 1) {
15827 if (ElemsPerTbl < 16) {
15828 SmallVector<int> FinalMask(ElemsPerTbl);
15829 std::iota(FinalMask.begin(), FinalMask.end(), 0);
15830 FinalResult = Builder.CreateShuffleVector(Results[0], FinalMask);
15831 }
15832 } else {
15833 SmallVector<int> FinalMask(ElemsPerTbl * Results.size());
15834 if (ElemsPerTbl < 16) {
15835 std::iota(FinalMask.begin(), FinalMask.begin() + ElemsPerTbl, 0);
15836 std::iota(FinalMask.begin() + ElemsPerTbl, FinalMask.end(), 16);
15837 } else {
15838 std::iota(FinalMask.begin(), FinalMask.end(), 0);
15839 }
15840 FinalResult =
15841 Builder.CreateShuffleVector(Results[0], Results[1], FinalMask);
15842 }
15843
15844 TI->replaceAllUsesWith(FinalResult);
15845 TI->eraseFromParent();
15846}
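// Illustrative sketch (not part of the original file): the byte-selection mask
// built above, for the concrete case trunc <8 x i32> to <8 x i8> (NumElements = 8,
// TruncFactor = 4). 255 is an out-of-range TBL index and therefore yields zero
// for the unused tail bytes. The helper name is hypothetical.
static std::vector<uint8_t> buildTruncTblMask(int NumElements, int TruncFactor,
                                              bool IsLittleEndian) {
  std::vector<uint8_t> Mask;
  for (int Itr = 0; Itr < 16; ++Itr)
    Mask.push_back(Itr < NumElements
                       ? uint8_t(IsLittleEndian
                                     ? Itr * TruncFactor
                                     : Itr * TruncFactor + (TruncFactor - 1))
                       : uint8_t(255));
  return Mask;
}
// buildTruncTblMask(8, 4, /*IsLittleEndian=*/true)
//   == {0,4,8,12,16,20,24,28, 255,255,255,255,255,255,255,255}
// i.e. keep the low byte of each i32 source lane and zero-fill the rest.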
15847
15848bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
15849 Instruction *I, Loop *L, const TargetTransformInfo &TTI) const {
15850 // shuffle_vector instructions are serialized when targeting SVE,
15851 // see LowerSPLAT_VECTOR. This peephole is not beneficial.
15852 if (!EnableExtToTBL || Subtarget->useSVEForFixedLengthVectors())
15853 return false;
15854
15855 // Try to optimize conversions using tbl. This requires materializing constant
15856 // index vectors, which can increase code size and add loads. Skip the
15857 // transform unless the conversion is in a loop block guaranteed to execute
15858 // and we are not optimizing for size.
15859 Function *F = I->getParent()->getParent();
15860 if (!L || L->getHeader() != I->getParent() || F->hasMinSize() ||
15861 F->hasOptSize())
15862 return false;
15863
15864 auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
15865 auto *DstTy = dyn_cast<FixedVectorType>(I->getType());
15866 if (!SrcTy || !DstTy)
15867 return false;
15868
15869 // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be
15870 // lowered to tbl instructions to insert the original i8 elements
15871 // into i8x lanes. This is enabled for cases where it is beneficial.
15872 auto *ZExt = dyn_cast<ZExtInst>(I);
15873 if (ZExt && SrcTy->getElementType()->isIntegerTy(8)) {
15874 auto DstWidth = DstTy->getElementType()->getScalarSizeInBits();
15875 if (DstWidth % 8 != 0)
15876 return false;
15877
15878 auto *TruncDstType =
15879 cast<FixedVectorType>(VectorType::getTruncatedElementVectorType(DstTy));
15880 // If the ZExt can be lowered to a single ZExt to the next power-of-2 and
15881 // the remaining ZExt folded into the user, don't use tbl lowering.
15882 auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits();
15883 if (TTI.getCastInstrCost(I->getOpcode(), DstTy, TruncDstType,
15886 if (SrcWidth * 2 >= TruncDstType->getElementType()->getScalarSizeInBits())
15887 return false;
15888
15889 DstTy = TruncDstType;
15890 }
15891
15892 return createTblShuffleForZExt(ZExt, DstTy, Subtarget->isLittleEndian());
15893 }
15894
15895 auto *UIToFP = dyn_cast<UIToFPInst>(I);
15896 if (UIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
15897 DstTy->getElementType()->isFloatTy()) {
15898 IRBuilder<> Builder(I);
15899 auto *ZExt = cast<ZExtInst>(
15900 Builder.CreateZExt(I->getOperand(0), VectorType::getInteger(DstTy)));
15901 auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
15902 I->replaceAllUsesWith(UI);
15903 I->eraseFromParent();
15904 return createTblShuffleForZExt(ZExt, cast<FixedVectorType>(ZExt->getType()),
15905 Subtarget->isLittleEndian());
15906 }
15907
15908 // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
15909 // followed by a truncate lowered to using tbl.4.
15910 auto *FPToUI = dyn_cast<FPToUIInst>(I);
15911 if (FPToUI &&
15912 (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
15913 SrcTy->getElementType()->isFloatTy() &&
15914 DstTy->getElementType()->isIntegerTy(8)) {
15915 IRBuilder<> Builder(I);
15916 auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0),
15917 VectorType::getInteger(SrcTy));
15918 auto *TruncI = Builder.CreateTrunc(WideConv, DstTy);
15919 I->replaceAllUsesWith(TruncI);
15920 I->eraseFromParent();
15921 createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian());
15922 return true;
15923 }
15924
15925 // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate
15926 // tbl instruction selecting the lowest/highest (little/big endian) 8 bits
15927 // per lane of the input that is represented using 1,2,3 or 4 128-bit table
15928 // registers
15929 auto *TI = dyn_cast<TruncInst>(I);
15930 if (TI && DstTy->getElementType()->isIntegerTy(8) &&
15931 ((SrcTy->getElementType()->isIntegerTy(32) ||
15932 SrcTy->getElementType()->isIntegerTy(64)) &&
15933 (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) {
15934 createTblForTrunc(TI, Subtarget->isLittleEndian());
15935 return true;
15936 }
15937
15938 return false;
15939}
15940
15941bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
15942 Align &RequiredAligment) const {
15943 if (!LoadedType.isSimple() ||
15944 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
15945 return false;
15946 // Cyclone supports unaligned accesses.
15947 RequiredAligment = Align(1);
15948 unsigned NumBits = LoadedType.getSizeInBits();
15949 return NumBits == 32 || NumBits == 64;
15950}
15951
15952/// A helper function for determining the number of interleaved accesses we
15953/// will generate when lowering accesses of the given type.
15954unsigned AArch64TargetLowering::getNumInterleavedAccesses(
15955 VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
15956 unsigned VecSize = 128;
15957 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
15958 unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
15959 if (UseScalable && isa<FixedVectorType>(VecTy))
15960 VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
15961 return std::max<unsigned>(1, (MinElts * ElSize + 127) / VecSize);
15962}
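// Illustrative sketch (not part of the original file): the ceiling division
// performed above for the common fixed-length NEON case (VecSize == 128). The
// helper name is hypothetical.
static unsigned numNeonInterleavedAccesses(unsigned MinElts, unsigned ElSizeBits) {
  return std::max<unsigned>(1, (MinElts * ElSizeBits + 127) / 128);
}
// numNeonInterleavedAccesses(16, 32) -> 4  (<16 x i32> needs four 128-bit ldN/stN)
// numNeonInterleavedAccesses(8, 16)  -> 1  (<8 x i16> fits in one access)
// numNeonInterleavedAccesses(4, 16)  -> 1  (a 64-bit vector still takes one)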
15963
15964MachineMemOperand::Flags
15965AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
15966 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
15967 I.hasMetadata(FALKOR_STRIDED_ACCESS_MD))
15968 return MOStridedAccess;
15969 return MachineMemOperand::MONone;
15970}
15971
15972bool AArch64TargetLowering::isLegalInterleavedAccessType(
15973 VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
15974 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
15975 auto EC = VecTy->getElementCount();
15976 unsigned MinElts = EC.getKnownMinValue();
15977
15978 UseScalable = false;
15979
15980 if (!VecTy->isScalableTy() && !Subtarget->isNeonAvailable() &&
15981 !Subtarget->useSVEForFixedLengthVectors())
15982 return false;
15983
15984 if (VecTy->isScalableTy() && !Subtarget->hasSVEorSME())
15985 return false;
15986
15987 // Ensure that the predicate for this number of elements is available.
15988 if (Subtarget->hasSVE() && !getSVEPredPatternFromNumElements(MinElts))
15989 return false;
15990
15991 // Ensure the number of vector elements is greater than 1.
15992 if (MinElts < 2)
15993 return false;
15994
15995 // Ensure the element type is legal.
15996 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
15997 return false;
15998
15999 if (EC.isScalable()) {
16000 UseScalable = true;
16001 return isPowerOf2_32(MinElts) && (MinElts * ElSize) % 128 == 0;
16002 }
16003
16004 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
16005 if (Subtarget->useSVEForFixedLengthVectors()) {
16006 unsigned MinSVEVectorSize =
16007 std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
16008 if (VecSize % MinSVEVectorSize == 0 ||
16009 (VecSize < MinSVEVectorSize && isPowerOf2_32(MinElts) &&
16010 (!Subtarget->isNeonAvailable() || VecSize > 128))) {
16011 UseScalable = true;
16012 return true;
16013 }
16014 }
16015
16016 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
16017 // 128 will be split into multiple interleaved accesses.
16018 return Subtarget->isNeonAvailable() && (VecSize == 64 || VecSize % 128 == 0);
16019}
16020
16021static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) {
16022 if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext()))
16023 return ScalableVectorType::get(VTy->getElementType(), 2);
16024
16025 if (VTy->getElementType() == Type::getFloatTy(VTy->getContext()))
16026 return ScalableVectorType::get(VTy->getElementType(), 4);
16027
16028 if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext()))
16029 return ScalableVectorType::get(VTy->getElementType(), 8);
16030
16031 if (VTy->getElementType() == Type::getHalfTy(VTy->getContext()))
16032 return ScalableVectorType::get(VTy->getElementType(), 8);
16033
16034 if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
16035 return ScalableVectorType::get(VTy->getElementType(), 2);
16036
16037 if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
16038 return ScalableVectorType::get(VTy->getElementType(), 4);
16039
16040 if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
16041 return ScalableVectorType::get(VTy->getElementType(), 8);
16042
16043 if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
16044 return ScalableVectorType::get(VTy->getElementType(), 16);
16045
16046 llvm_unreachable("Cannot handle input vector type");
16047}
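// Illustrative sketch (not part of the original file): every case above picks the
// SVE container whose known-minimum size is 128 bits, so the lane count is simply
// 128 divided by the element width. The helper name is hypothetical.
static unsigned sveContainerLaneCount(unsigned EltSizeInBits) {
  return 128 / EltSizeInBits;
}
// sveContainerLaneCount(64) -> 2   (double/i64      -> nxv2)
// sveContainerLaneCount(32) -> 4   (float/i32       -> nxv4)
// sveContainerLaneCount(16) -> 8   (half/bfloat/i16 -> nxv8)
// sveContainerLaneCount(8)  -> 16  (i8              -> nxv16)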
16048
16049static Function *getStructuredLoadFunction(Module *M, unsigned Factor,
16050 bool Scalable, Type *LDVTy,
16051 Type *PtrTy) {
16052 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
16053 static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
16054 Intrinsic::aarch64_sve_ld3_sret,
16055 Intrinsic::aarch64_sve_ld4_sret};
16056 static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
16057 Intrinsic::aarch64_neon_ld3,
16058 Intrinsic::aarch64_neon_ld4};
16059 if (Scalable)
16060 return Intrinsic::getDeclaration(M, SVELoads[Factor - 2], {LDVTy});
16061
16062 return Intrinsic::getDeclaration(M, NEONLoads[Factor - 2], {LDVTy, PtrTy});
16063}
16064
16065static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
16066 bool Scalable, Type *STVTy,
16067 Type *PtrTy) {
16068 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
16069 static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
16070 Intrinsic::aarch64_sve_st3,
16071 Intrinsic::aarch64_sve_st4};
16072 static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
16073 Intrinsic::aarch64_neon_st3,
16074 Intrinsic::aarch64_neon_st4};
16075 if (Scalable)
16076 return Intrinsic::getDeclaration(M, SVEStores[Factor - 2], {STVTy});
16077
16078 return Intrinsic::getDeclaration(M, NEONStores[Factor - 2], {STVTy, PtrTy});
16079}
16080
16081/// Lower an interleaved load into a ldN intrinsic.
16082///
16083/// E.g. Lower an interleaved load (Factor = 2):
16084/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
16085/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
16086/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
16087///
16088/// Into:
16089/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
16090/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
16091/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
16092bool AArch64TargetLowering::lowerInterleavedLoad(
16093 LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
16094 ArrayRef<unsigned> Indices, unsigned Factor) const {
16095 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
16096 "Invalid interleave factor");
16097 assert(!Shuffles.empty() && "Empty shufflevector input");
16098 assert(Shuffles.size() == Indices.size() &&
16099 "Unmatched number of shufflevectors and indices");
16100
16101 const DataLayout &DL = LI->getModule()->getDataLayout();
16102
16103 VectorType *VTy = Shuffles[0]->getType();
16104
16105 // Skip if we do not have NEON and skip illegal vector types. We can
16106 // "legalize" wide vector types into multiple interleaved accesses as long as
16107 // the vector types are divisible by 128.
16108 bool UseScalable;
16109 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
16110 return false;
16111
16112 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
16113
16114 auto *FVTy = cast<FixedVectorType>(VTy);
16115
16116 // A pointer vector can not be the return type of the ldN intrinsics. Need to
16117 // load integer vectors first and then convert to pointer vectors.
16118 Type *EltTy = FVTy->getElementType();
16119 if (EltTy->isPointerTy())
16120 FVTy =
16121 FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
16122
16123 // If we're going to generate more than one load, reset the sub-vector type
16124 // to something legal.
16125 FVTy = FixedVectorType::get(FVTy->getElementType(),
16126 FVTy->getNumElements() / NumLoads);
16127
16128 auto *LDVTy =
16129 UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;
16130
16131 IRBuilder<> Builder(LI);
16132
16133 // The base address of the load.
16134 Value *BaseAddr = LI->getPointerOperand();
16135
16136 Type *PtrTy = LI->getPointerOperandType();
16137 Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
16138 LDVTy->getElementCount());
16139
16140 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
16141 UseScalable, LDVTy, PtrTy);
16142
16143 // Holds sub-vectors extracted from the load intrinsic return values. The
16144 // sub-vectors are associated with the shufflevector instructions they will
16145 // replace.
16146 DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
16147
16148 Value *PTrue = nullptr;
16149 if (UseScalable) {
16150 std::optional<unsigned> PgPattern =
16151 getSVEPredPatternFromNumElements(FVTy->getNumElements());
16152 if (Subtarget->getMinSVEVectorSizeInBits() ==
16153 Subtarget->getMaxSVEVectorSizeInBits() &&
16154 Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
16155 PgPattern = AArch64SVEPredPattern::all;
16156
16157 auto *PTruePat =
16158 ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
16159 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
16160 {PTruePat});
16161 }
16162
16163 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
16164
16165 // If we're generating more than one load, compute the base address of
16166 // subsequent loads as an offset from the previous.
16167 if (LoadCount > 0)
16168 BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
16169 FVTy->getNumElements() * Factor);
16170
16171 CallInst *LdN;
16172 if (UseScalable)
16173 LdN = Builder.CreateCall(LdNFunc, {PTrue, BaseAddr}, "ldN");
16174 else
16175 LdN = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
16176
16177 // Extract and store the sub-vectors returned by the load intrinsic.
16178 for (unsigned i = 0; i < Shuffles.size(); i++) {
16179 ShuffleVectorInst *SVI = Shuffles[i];
16180 unsigned Index = Indices[i];
16181
16182 Value *SubVec = Builder.CreateExtractValue(LdN, Index);
16183
16184 if (UseScalable)
16185 SubVec = Builder.CreateExtractVector(
16186 FVTy, SubVec,
16187 ConstantInt::get(Type::getInt64Ty(VTy->getContext()), 0));
16188
16189 // Convert the integer vector to pointer vector if the element is pointer.
16190 if (EltTy->isPointerTy())
16191 SubVec = Builder.CreateIntToPtr(
16192 SubVec, FixedVectorType::get(SVI->getType()->getElementType(),
16193 FVTy->getNumElements()));
16194
16195 SubVecs[SVI].push_back(SubVec);
16196 }
16197 }
16198
16199 // Replace uses of the shufflevector instructions with the sub-vectors
16200 // returned by the load intrinsic. If a shufflevector instruction is
16201 // associated with more than one sub-vector, those sub-vectors will be
16202 // concatenated into a single wide vector.
16203 for (ShuffleVectorInst *SVI : Shuffles) {
16204 auto &SubVec = SubVecs[SVI];
16205 auto *WideVec =
16206 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
16207 SVI->replaceAllUsesWith(WideVec);
16208 }
16209
16210 return true;
16211}
16212
16213template <typename Iter>
16214bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
16215 int MaxLookupDist = 20;
16216 unsigned IdxWidth = DL.getIndexSizeInBits(0);
16217 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
16218 const Value *PtrA1 =
16219 Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
16220
16221 while (++It != End) {
16222 if (It->isDebugOrPseudoInst())
16223 continue;
16224 if (MaxLookupDist-- == 0)
16225 break;
16226 if (const auto *SI = dyn_cast<StoreInst>(&*It)) {
16227 const Value *PtrB1 =
16228 SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
16229 DL, OffsetB);
16230 if (PtrA1 == PtrB1 &&
16231 (OffsetA.sextOrTrunc(IdxWidth) - OffsetB.sextOrTrunc(IdxWidth))
16232 .abs() == 16)
16233 return true;
16234 }
16235 }
16236
16237 return false;
16238}
16239
16240/// Lower an interleaved store into a stN intrinsic.
16241///
16242/// E.g. Lower an interleaved store (Factor = 3):
16243/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
16244/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
16245/// store <12 x i32> %i.vec, <12 x i32>* %ptr
16246///
16247/// Into:
16248/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
16249/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
16250/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
16251/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
16252///
16253/// Note that the new shufflevectors will be removed and we'll only generate one
16254/// st3 instruction in CodeGen.
16255///
16256/// Example for a more general valid mask (Factor 3). Lower:
16257/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
16258/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
16259/// store <12 x i32> %i.vec, <12 x i32>* %ptr
16260///
16261/// Into:
16262/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
16263/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
16264/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
16265/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
16266bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
16267 ShuffleVectorInst *SVI,
16268 unsigned Factor) const {
16269
16270 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
16271 "Invalid interleave factor");
16272
16273 auto *VecTy = cast<FixedVectorType>(SVI->getType());
16274 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
16275
16276 unsigned LaneLen = VecTy->getNumElements() / Factor;
16277 Type *EltTy = VecTy->getElementType();
16278 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
16279
16280 const DataLayout &DL = SI->getModule()->getDataLayout();
16281 bool UseScalable;
16282
16283 // Skip if we do not have NEON and skip illegal vector types. We can
16284 // "legalize" wide vector types into multiple interleaved accesses as long as
16285 // the vector types are divisible by 128.
16286 if (!isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
16287 return false;
16288
16289 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
16290
16291 Value *Op0 = SVI->getOperand(0);
16292 Value *Op1 = SVI->getOperand(1);
16293 IRBuilder<> Builder(SI);
16294
16295 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
16296 // vectors to integer vectors.
16297 if (EltTy->isPointerTy()) {
16298 Type *IntTy = DL.getIntPtrType(EltTy);
16299 unsigned NumOpElts =
16300 cast<FixedVectorType>(Op0->getType())->getNumElements();
16301
16302 // Convert to the corresponding integer vector.
16303 auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
16304 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
16305 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
16306
16307 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
16308 }
16309
16310 // If we're going to generate more than one store, reset the lane length
16311 // and sub-vector type to something legal.
16312 LaneLen /= NumStores;
16313 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
16314
16315 auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
16316 : SubVecTy;
16317
16318 // The base address of the store.
16319 Value *BaseAddr = SI->getPointerOperand();
16320
16321 auto Mask = SVI->getShuffleMask();
16322
16323 // Sanity check: bail out if all of the indices are out of range.
16324 // If mask is `poison`, `Mask` may be a vector of -1s.
16325 // If all of them are `poison`, OOB read will happen later.
16326 if (llvm::all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
16327 return false;
16328 }
16329 // A 64bit st2 which does not start at element 0 will involve adding extra
16330 // ext elements making the st2 unprofitable, and if there is a nearby store
16331 // that points to BaseAddr+16 or BaseAddr-16 then it can be better left as a
16332 // zip;ldp pair which has higher throughput.
16333 if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
16334 (Mask[0] != 0 ||
16335 hasNearbyPairedStore(SI->getIterator(), SI->getParent()->end(), BaseAddr,
16336 DL) ||
16337 hasNearbyPairedStore(SI->getReverseIterator(), SI->getParent()->rend(),
16338 BaseAddr, DL)))
16339 return false;
16340
16341 Type *PtrTy = SI->getPointerOperandType();
16342 Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
16343 STVTy->getElementCount());
16344
16345 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
16346 UseScalable, STVTy, PtrTy);
16347
16348 Value *PTrue = nullptr;
16349 if (UseScalable) {
16350 std::optional<unsigned> PgPattern =
16351 getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
16352 if (Subtarget->getMinSVEVectorSizeInBits() ==
16353 Subtarget->getMaxSVEVectorSizeInBits() &&
16354 Subtarget->getMinSVEVectorSizeInBits() ==
16355 DL.getTypeSizeInBits(SubVecTy))
16356 PgPattern = AArch64SVEPredPattern::all;
16357
16358 auto *PTruePat =
16359 ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
16360 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
16361 {PTruePat});
16362 }
16363
16364 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
16365
16367
16368 // Split the shufflevector operands into sub vectors for the new stN call.
16369 for (unsigned i = 0; i < Factor; i++) {
16370 Value *Shuffle;
16371 unsigned IdxI = StoreCount * LaneLen * Factor + i;
16372 if (Mask[IdxI] >= 0) {
16373 Shuffle = Builder.CreateShuffleVector(
16374 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
16375 } else {
16376 unsigned StartMask = 0;
16377 for (unsigned j = 1; j < LaneLen; j++) {
16378 unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
16379 if (Mask[IdxJ] >= 0) {
16380 StartMask = Mask[IdxJ] - j;
16381 break;
16382 }
16383 }
16384 // Note: Filling undef gaps with random elements is ok, since
16385 // those elements were being written anyway (with undefs).
16386 // In the case of all undefs we're defaulting to using elems from 0
16387 // Note: StartMask cannot be negative, it's checked in
16388 // isReInterleaveMask
16389 Shuffle = Builder.CreateShuffleVector(
16390 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
16391 }
16392
16393 if (UseScalable)
16394 Shuffle = Builder.CreateInsertVector(
16395 STVTy, UndefValue::get(STVTy), Shuffle,
16396 ConstantInt::get(Type::getInt64Ty(STVTy->getContext()), 0));
16397
16398 Ops.push_back(Shuffle);
16399 }
16400
16401 if (UseScalable)
16402 Ops.push_back(PTrue);
16403
16404 // If we're generating more than one store, compute the base address of
16405 // subsequent stores as an offset from the previous.
16406 if (StoreCount > 0)
16407 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
16408 BaseAddr, LaneLen * Factor);
16409
16410 Ops.push_back(BaseAddr);
16411 Builder.CreateCall(StNFunc, Ops);
16412 }
16413 return true;
16414}
16415
16416bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
16417 IntrinsicInst *DI, LoadInst *LI) const {
16418 // Only deinterleave2 supported at present.
16419 if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2)
16420 return false;
16421
16422 // Only a factor of 2 supported at present.
16423 const unsigned Factor = 2;
16424
16425 VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
16426 const DataLayout &DL = DI->getModule()->getDataLayout();
16427 bool UseScalable;
16428 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
16429 return false;
16430
16431 // TODO: Add support for using SVE instructions with fixed types later, using
16432 // the code from lowerInterleavedLoad to obtain the correct container type.
16433 if (UseScalable && !VTy->isScalableTy())
16434 return false;
16435
16436 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
16437
16438 VectorType *LdTy =
16439 VectorType::get(VTy->getElementType(),
16440 VTy->getElementCount().divideCoefficientBy(NumLoads));
16441
16442 Type *PtrTy = LI->getPointerOperandType();
16443 Function *LdNFunc = getStructuredLoadFunction(DI->getModule(), Factor,
16444 UseScalable, LdTy, PtrTy);
16445
16446 IRBuilder<> Builder(LI);
16447
16448 Value *Pred = nullptr;
16449 if (UseScalable)
16450 Pred =
16451 Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
16452
16453 Value *BaseAddr = LI->getPointerOperand();
16454 Value *Result;
16455 if (NumLoads > 1) {
16456 Value *Left = PoisonValue::get(VTy);
16457 Value *Right = PoisonValue::get(VTy);
16458
16459 for (unsigned I = 0; I < NumLoads; ++I) {
16460 Value *Offset = Builder.getInt64(I * Factor);
16461
16462 Value *Address = Builder.CreateGEP(LdTy, BaseAddr, {Offset});
16463 Value *LdN = nullptr;
16464 if (UseScalable)
16465 LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
16466 else
16467 LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
16468
16469 Value *Idx =
16470 Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
16471 Left = Builder.CreateInsertVector(
16472 VTy, Left, Builder.CreateExtractValue(LdN, 0), Idx);
16473 Right = Builder.CreateInsertVector(
16474 VTy, Right, Builder.CreateExtractValue(LdN, 1), Idx);
16475 }
16476
16477 Result = PoisonValue::get(DI->getType());
16478 Result = Builder.CreateInsertValue(Result, Left, 0);
16479 Result = Builder.CreateInsertValue(Result, Right, 1);
16480 } else {
16481 if (UseScalable)
16482 Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
16483 else
16484 Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
16485 }
16486
16487 DI->replaceAllUsesWith(Result);
16488 return true;
16489}
16490
16491bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
16492 IntrinsicInst *II, StoreInst *SI) const {
16493 // Only interleave2 supported at present.
16494 if (II->getIntrinsicID() != Intrinsic::vector_interleave2)
16495 return false;
16496
16497 // Only a factor of 2 supported at present.
16498 const unsigned Factor = 2;
16499
16500 VectorType *VTy = cast<VectorType>(II->getOperand(0)->getType());
16501 const DataLayout &DL = II->getModule()->getDataLayout();
16502 bool UseScalable;
16503 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
16504 return false;
16505
16506 // TODO: Add support for using SVE instructions with fixed types later, using
16507 // the code from lowerInterleavedStore to obtain the correct container type.
16508 if (UseScalable && !VTy->isScalableTy())
16509 return false;
16510
16511 unsigned NumStores = getNumInterleavedAccesses(VTy, DL, UseScalable);
16512
16513 VectorType *StTy =
16514 VectorType::get(VTy->getElementType(),
16515 VTy->getElementCount().divideCoefficientBy(NumStores));
16516
16517 Type *PtrTy = SI->getPointerOperandType();
16518 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
16519 UseScalable, StTy, PtrTy);
16520
16521 IRBuilder<> Builder(SI);
16522
16523 Value *BaseAddr = SI->getPointerOperand();
16524 Value *Pred = nullptr;
16525
16526 if (UseScalable)
16527 Pred =
16528 Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
16529
16530 Value *L = II->getOperand(0);
16531 Value *R = II->getOperand(1);
16532
16533 for (unsigned I = 0; I < NumStores; ++I) {
16534 Value *Address = BaseAddr;
16535 if (NumStores > 1) {
16536 Value *Offset = Builder.getInt64(I * Factor);
16537 Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
16538
16539 Value *Idx =
16540 Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
16541 L = Builder.CreateExtractVector(StTy, II->getOperand(0), Idx);
16542 R = Builder.CreateExtractVector(StTy, II->getOperand(1), Idx);
16543 }
16544
16545 if (UseScalable)
16546 Builder.CreateCall(StNFunc, {L, R, Pred, Address});
16547 else
16548 Builder.CreateCall(StNFunc, {L, R, Address});
16549 }
16550
16551 return true;
16552}
16553
16554EVT AArch64TargetLowering::getOptimalMemOpType(
16555 const MemOp &Op, const AttributeList &FuncAttributes) const {
16556 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
16557 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
16558 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
16559 // Only use AdvSIMD to implement memset of 32-byte and above. It would have
16560 // taken one instruction to materialize the v2i64 zero and one store (with
16561 // restrictive addressing mode). Just do i64 stores.
16562 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
16563 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
16564 if (Op.isAligned(AlignCheck))
16565 return true;
16566 unsigned Fast;
16567 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
16568 MachineMemOperand::MONone, &Fast) &&
16569 Fast;
16570 };
16571
16572 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
16573 AlignmentIsAcceptable(MVT::v16i8, Align(16)))
16574 return MVT::v16i8;
16575 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
16576 return MVT::f128;
16577 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
16578 return MVT::i64;
16579 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
16580 return MVT::i32;
16581 return MVT::Other;
16582}
16583
16584LLT AArch64TargetLowering::getOptimalMemOpLLT(
16585 const MemOp &Op, const AttributeList &FuncAttributes) const {
16586 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
16587 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
16588 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
16589 // Only use AdvSIMD to implement memset of 32-byte and above. It would have
16590 // taken one instruction to materialize the v2i64 zero and one store (with
16591 // restrictive addressing mode). Just do i64 stores.
16592 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
16593 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
16594 if (Op.isAligned(AlignCheck))
16595 return true;
16596 unsigned Fast;
16597 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
16598 MachineMemOperand::MONone, &Fast) &&
16599 Fast;
16600 };
16601
16602 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
16603 AlignmentIsAcceptable(MVT::v2i64, Align(16)))
16604 return LLT::fixed_vector(2, 64);
16605 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
16606 return LLT::scalar(128);
16607 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
16608 return LLT::scalar(64);
16609 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
16610 return LLT::scalar(32);
16611 return LLT();
16612}
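// Illustrative sketch (not part of the original file): a simplified version of
// the chunk-type choice made by the two functions above, keyed only on size and
// static alignment. The real code can additionally accept a misaligned access
// when allowsMisalignedMemoryAccesses() reports it as fast. All names here are
// hypothetical.
enum class MemOpChunk { V16i8, F128, I64, I32, Other };
static MemOpChunk pickMemOpChunk(uint64_t Size, uint64_t Alignment,
                                 bool IsMemset, bool HasNEON, bool HasFP) {
  bool SmallMemset = IsMemset && Size < 32;
  if (HasNEON && IsMemset && !SmallMemset && Alignment >= 16)
    return MemOpChunk::V16i8; // one register materialization + 16-byte stores
  if (HasFP && !SmallMemset && Alignment >= 16)
    return MemOpChunk::F128;  // 16-byte FP/SIMD copies
  if (Size >= 8 && Alignment >= 8)
    return MemOpChunk::I64;
  if (Size >= 4 && Alignment >= 4)
    return MemOpChunk::I32;
  return MemOpChunk::Other;   // fall back to the generic lowering
}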
16613
16614// 12-bit optionally shifted immediates are legal for adds.
16615bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
16616 if (Immed == std::numeric_limits<int64_t>::min()) {
16617 LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
16618 << ": avoid UB for INT64_MIN\n");
16619 return false;
16620 }
16621 // Same encoding for add/sub, just flip the sign.
16622 Immed = std::abs(Immed);
16623 bool IsLegal = ((Immed >> 12) == 0 ||
16624 ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
16625 LLVM_DEBUG(dbgs() << "Is " << Immed
16626 << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
16627 return IsLegal;
16628}
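// Illustrative sketch (not part of the original file): the add/sub immediate
// encoding rule above as a standalone predicate, with a few worked values. The
// helper name is hypothetical.
static bool isAddSubImmEncodable(int64_t Immed) {
  if (Immed == std::numeric_limits<int64_t>::min())
    return false;                                      // std::abs would overflow
  Immed = std::abs(Immed);
  return (Immed >> 12) == 0 ||                         // #imm12
         ((Immed & 0xfff) == 0 && (Immed >> 24) == 0); // #imm12, lsl #12
}
// isAddSubImmEncodable(4095)  -> true  (add x0, x1, #4095)
// isAddSubImmEncodable(4096)  -> true  (add x0, x1, #1, lsl #12)
// isAddSubImmEncodable(4097)  -> false (would need a separate mov)
// isAddSubImmEncodable(-4095) -> true  (same range, emitted as sub)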
16629
16630bool AArch64TargetLowering::isLegalAddScalableImmediate(int64_t Imm) const {
16631 // We will only emit addvl/inc* instructions for SVE2
16632 if (!Subtarget->hasSVE2())
16633 return false;
16634
16635 // addvl's immediates are in terms of the number of bytes in a register.
16636 // Since there are 16 in the base supported size (128bits), we need to
16637 // divide the immediate by that much to give us a useful immediate to
16638 // multiply by vscale. We can't have a remainder as a result of this.
16639 if (Imm % 16 == 0)
16640 return isInt<6>(Imm / 16);
16641
16642 // Inc[b|h|w|d] instructions take a pattern and a positive immediate
16643 // multiplier. For now, assume a pattern of 'all'. Incb would be a subset
16644 // of addvl as a result, so only take h|w|d into account.
16645 // Dec[h|w|d] will cover subtractions.
16646 // Immediates are in the range [1,16], so we can't do a 2's complement check.
16647 // FIXME: Can we make use of other patterns to cover other immediates?
16648
16649 // inch|dech
16650 if (Imm % 8 == 0)
16651 return std::abs(Imm / 8) <= 16;
16652 // incw|decw
16653 if (Imm % 4 == 0)
16654 return std::abs(Imm / 4) <= 16;
16655 // incd|decd
16656 if (Imm % 2 == 0)
16657 return std::abs(Imm / 2) <= 16;
16658
16659 return false;
16660}
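// Illustrative sketch (not part of the original file): the same immediate buckets
// as above, written as a standalone helper with worked values; the SVE2 subtarget
// check is deliberately omitted here. 16 is the byte width of the unscaled 128-bit
// SVE granule, and the first range corresponds to isInt<6>, i.e. addvl #-32..#31.
// The helper name is hypothetical.
static bool scalableAddImmHasSingleInstruction(int64_t Imm) {
  if (Imm % 16 == 0)
    return Imm / 16 >= -32 && Imm / 16 <= 31; // addvl #-32 .. addvl #31
  if (Imm % 8 == 0)
    return std::abs(Imm / 8) <= 16;           // inch/dech, all, mul #1..#16
  if (Imm % 4 == 0)
    return std::abs(Imm / 4) <= 16;           // incw/decw
  if (Imm % 2 == 0)
    return std::abs(Imm / 2) <= 16;           // incd/decd
  return false;
}
// scalableAddImmHasSingleInstruction(32)  -> true  (addvl x0, x0, #2)
// scalableAddImmHasSingleInstruction(24)  -> true  (inch x0, all, mul #3)
// scalableAddImmHasSingleInstruction(-12) -> true  (decw x0, all, mul #3)
// scalableAddImmHasSingleInstruction(640) -> false (outside the addvl range)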
16661
16662// Return false to prevent folding
16663// (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
16664// if the folding leads to worse code.
16665bool AArch64TargetLowering::isMulAddWithConstProfitable(
16666 SDValue AddNode, SDValue ConstNode) const {
16667 // Let the DAGCombiner decide for vector types and large types.
16668 const EVT VT = AddNode.getValueType();
16669 if (VT.isVector() || VT.getScalarSizeInBits() > 64)
16670 return true;
16671
16672 // It is worse if c1 is legal add immediate, while c1*c2 is not
16673 // and has to be composed by at least two instructions.
16674 const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
16675 const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
16676 const int64_t C1 = C1Node->getSExtValue();
16677 const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
16678 if (!isLegalAddImmediate(C1) || isLegalAddImmediate(C1C2.getSExtValue()))
16679 return true;
16680 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
16681 // Adapt to the width of a register.
16682 unsigned BitSize = VT.getSizeInBits() <= 32 ? 32 : 64;
16683 AArch64_IMM::expandMOVImm(C1C2.getZExtValue(), BitSize, Insn);
16684 if (Insn.size() > 1)
16685 return false;
16686
16687 // Default to true and let the DAGCombiner decide.
16688 return true;
16689}
16690
16691// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
16692// immediates is the same as for an add or a sub.
16693bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
16694 return isLegalAddImmediate(Immed);
16695}
16696
16697/// isLegalAddressingMode - Return true if the addressing mode represented
16698/// by AM is legal for this target, for a load/store of the specified type.
16699bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
16700 const AddrMode &AMode, Type *Ty,
16701 unsigned AS, Instruction *I) const {
16702 // AArch64 has five basic addressing modes:
16703 // reg
16704 // reg + 9-bit signed offset
16705 // reg + SIZE_IN_BYTES * 12-bit unsigned offset
16706 // reg1 + reg2
16707 // reg + SIZE_IN_BYTES * reg
16708
16709 // No global is ever allowed as a base.
16710 if (AMode.BaseGV)
16711 return false;
16712
16713 // No reg+reg+imm addressing.
16714 if (AMode.HasBaseReg && AMode.BaseOffs && AMode.Scale)
16715 return false;
16716
16717 // Canonicalise `1*ScaledReg + imm` into `BaseReg + imm` and
16718 // `2*ScaledReg` into `BaseReg + ScaledReg`
16719 AddrMode AM = AMode;
16720 if (AM.Scale && !AM.HasBaseReg) {
16721 if (AM.Scale == 1) {
16722 AM.HasBaseReg = true;
16723 AM.Scale = 0;
16724 } else if (AM.Scale == 2) {
16725 AM.HasBaseReg = true;
16726 AM.Scale = 1;
16727 } else {
16728 return false;
16729 }
16730 }
16731
16732 // A base register is required in all addressing modes.
16733 if (!AM.HasBaseReg)
16734 return false;
16735
16736 if (Ty->isScalableTy()) {
16737 if (isa<ScalableVectorType>(Ty)) {
16738 // See if we have a foldable vscale-based offset, for vector types which
16739 // are either legal or smaller than the minimum; more work will be
16740 // required if we need to consider addressing for types which need
16741 // legalization by splitting.
16742 uint64_t VecNumBytes = DL.getTypeSizeInBits(Ty).getKnownMinValue() / 8;
16743 if (AM.HasBaseReg && !AM.BaseOffs && AM.ScalableOffset && !AM.Scale &&
16744 (AM.ScalableOffset % VecNumBytes == 0) && VecNumBytes <= 16 &&
16745 isPowerOf2_64(VecNumBytes))
16746 return isInt<4>(AM.ScalableOffset / (int64_t)VecNumBytes);
16747
16748 uint64_t VecElemNumBytes =
16749 DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
16750 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset &&
16751 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
16752 }
16753
16754 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset && !AM.Scale;
16755 }
16756
16757 // No scalable offsets allowed for non-scalable types.
16758 if (AM.ScalableOffset)
16759 return false;
16760
16761 // check reg + imm case:
16762 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
16763 uint64_t NumBytes = 0;
16764 if (Ty->isSized()) {
16765 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
16766 NumBytes = NumBits / 8;
16767 if (!isPowerOf2_64(NumBits))
16768 NumBytes = 0;
16769 }
16770
16771 return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, AM.BaseOffs,
16772 AM.Scale);
16773}
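// Illustrative examples (not from the source) of the five forms above for an
// i64 access: [x0]; [x0, #-256] (9-bit signed offset); [x0, #32760]
// (8 * uimm12); [x0, x1]; and [x0, x1, lsl #3] (scale equal to the access
// size in bytes).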
16774
16775// Check whether the two offsets belong to the same imm24 range and their high
16776// 12 bits are the same; if so, the high part can be encoded in the offset of an add.
16777int64_t
16778AArch64TargetLowering::getPreferredLargeGEPBaseOffset(int64_t MinOffset,
16779 int64_t MaxOffset) const {
16780 int64_t HighPart = MinOffset & ~0xfffULL;
16781 if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(HighPart)) {
16782 // Rebase the value to an integer multiple of imm12.
16783 return HighPart;
16784 }
16785
16786 return 0;
16787}
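// Worked example (illustrative): MinOffset = 0x1234 and MaxOffset = 0x1ff0
// share their high 12 bits (both >> 12 == 1), so HighPart = 0x1000 is returned
// as the rebased value and the remaining low parts fit in an imm12 addressing
// offset.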
16788
16789bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
16790 // Consider splitting large offset of struct or array.
16791 return true;
16792}
16793
16794bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
16795 const MachineFunction &MF, EVT VT) const {
16796 VT = VT.getScalarType();
16797
16798 if (!VT.isSimple())
16799 return false;
16800
16801 switch (VT.getSimpleVT().SimpleTy) {
16802 case MVT::f16:
16803 return Subtarget->hasFullFP16();
16804 case MVT::f32:
16805 case MVT::f64:
16806 return true;
16807 default:
16808 break;
16809 }
16810
16811 return false;
16812}
16813
16814bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
16815 Type *Ty) const {
16816 switch (Ty->getScalarType()->getTypeID()) {
16817 case Type::FloatTyID:
16818 case Type::DoubleTyID:
16819 return true;
16820 default:
16821 return false;
16822 }
16823}
16824
16825bool AArch64TargetLowering::generateFMAsInMachineCombiner(
16826 EVT VT, CodeGenOptLevel OptLevel) const {
16827 return (OptLevel >= CodeGenOptLevel::Aggressive) && !VT.isScalableVector() &&
16828 !useSVEForFixedLengthVectorVT(VT);
16829}
16830
16831const MCPhysReg *
16832AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
16833 // LR is a callee-save register, but we must treat it as clobbered by any call
16834 // site. Hence we include LR in the scratch registers, which are in turn added
16835 // as implicit-defs for stackmaps and patchpoints.
16836 static const MCPhysReg ScratchRegs[] = {
16837 AArch64::X16, AArch64::X17, AArch64::LR, 0
16838 };
16839 return ScratchRegs;
16840}
16841
16842ArrayRef<MCPhysReg> AArch64TargetLowering::getRoundingControlRegisters() const {
16843 static const MCPhysReg RCRegs[] = {AArch64::FPCR};
16844 return RCRegs;
16845}
16846
16847bool
16848AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
16849 CombineLevel Level) const {
16850 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
16851 N->getOpcode() == ISD::SRL) &&
16852 "Expected shift op");
16853
16854 SDValue ShiftLHS = N->getOperand(0);
16855 EVT VT = N->getValueType(0);
16856
16857 // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not
16858 // combine it with shift 'N' to let it be lowered to UBFX except:
16859 // ((x >> C) & mask) << C.
16860 if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
16861 isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
16862 uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
16863 if (isMask_64(TruncMask)) {
16864 SDValue AndLHS = ShiftLHS.getOperand(0);
16865 if (AndLHS.getOpcode() == ISD::SRL) {
16866 if (auto *SRLC = dyn_cast<ConstantSDNode>(AndLHS.getOperand(1))) {
16867 if (N->getOpcode() == ISD::SHL)
16868 if (auto *SHLC = dyn_cast<ConstantSDNode>(N->getOperand(1)))
16869 return SRLC->getZExtValue() == SHLC->getZExtValue();
16870 return false;
16871 }
16872 }
16873 }
16874 }
16875 return true;
16876}
16877
16878bool AArch64TargetLowering::isDesirableToCommuteXorWithShift(
16879 const SDNode *N) const {
16880 assert(N->getOpcode() == ISD::XOR &&
16881 (N->getOperand(0).getOpcode() == ISD::SHL ||
16882 N->getOperand(0).getOpcode() == ISD::SRL) &&
16883 "Expected XOR(SHIFT) pattern");
16884
16885 // Only commute if the entire NOT mask is a hidden shifted mask.
16886 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
16887 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
16888 if (XorC && ShiftC) {
16889 unsigned MaskIdx, MaskLen;
16890 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
16891 unsigned ShiftAmt = ShiftC->getZExtValue();
16892 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
16893 if (N->getOperand(0).getOpcode() == ISD::SHL)
16894 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
16895 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
16896 }
16897 }
16898
16899 return false;
16900}
16901
16902bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
16903 const SDNode *N, CombineLevel Level) const {
16904 assert(((N->getOpcode() == ISD::SHL &&
16905 N->getOperand(0).getOpcode() == ISD::SRL) ||
16906 (N->getOpcode() == ISD::SRL &&
16907 N->getOperand(0).getOpcode() == ISD::SHL)) &&
16908 "Expected shift-shift mask");
16909 // Don't allow multiuse shift folding with the same shift amount.
16910 if (!N->getOperand(0)->hasOneUse())
16911 return false;
16912
16913 // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
16914 EVT VT = N->getValueType(0);
16915 if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
16916 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
16917 auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
16918 return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
16919 }
16920
16921 return true;
16922}
16923
16924bool AArch64TargetLowering::shouldFoldSelectWithIdentityConstant(
16925 unsigned BinOpcode, EVT VT) const {
16926 return VT.isScalableVector() && isTypeLegal(VT);
16927}
16928
16929bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
16930 Type *Ty) const {
16931 assert(Ty->isIntegerTy());
16932
16933 unsigned BitSize = Ty->getPrimitiveSizeInBits();
16934 if (BitSize == 0)
16935 return false;
16936
16937 int64_t Val = Imm.getSExtValue();
16938 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
16939 return true;
16940
16941 if ((int64_t)Val < 0)
16942 Val = ~Val;
16943 if (BitSize == 32)
16944 Val &= (1LL << 32) - 1;
16945
16946 unsigned Shift = llvm::Log2_64((uint64_t)Val) / 16;
16947 // MOVZ is free so return true for one or fewer MOVK.
16948 return Shift < 3;
16949}
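// Worked example (illustrative): Imm = 0x12345678 has its highest set bit at
// position 28, so Shift = 28 / 16 = 1 < 3 and the constant is materialized
// with MOVZ plus one MOVK; a constant with bits set above bit 47 gives
// Shift = 3 and is better left as a load.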
16950
16951bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
16952 unsigned Index) const {
16953 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
16954 return false;
16955
16956 return (Index == 0 || Index == ResVT.getVectorMinNumElements());
16957}
16958
16959/// Turn vector tests of the signbit in the form of:
16960/// xor (sra X, elt_size(X)-1), -1
16961/// into:
16962/// cmge X, X, #0
16963static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
16964 const AArch64Subtarget *Subtarget) {
16965 EVT VT = N->getValueType(0);
16966 if (!Subtarget->hasNEON() || !VT.isVector())
16967 return SDValue();
16968
16969 // There must be a shift right algebraic before the xor, and the xor must be a
16970 // 'not' operation.
16971 SDValue Shift = N->getOperand(0);
16972 SDValue Ones = N->getOperand(1);
16973 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
16974 !ISD::isConstantSplatVectorAllOnes(Ones.getNode()))
16975 return SDValue();
16976
16977 // The shift should be smearing the sign bit across each vector element.
16978 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
16979 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
16980 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
16981 return SDValue();
16982
16983 return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
16984}
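// Illustrative example (not from the source): for v4i32, xor(AArch64ISD::VASHR
// x, 31), all-ones (i.e. "not(x >> 31)") computes x >= 0 per lane and is
// replaced by CMGEz x, which lowers to a cmge-against-zero instruction.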
16985
16986// Given a vecreduce_add node, detect the below pattern and convert it to the
16987// node sequence with UABDL, [S|U]ABD and UADDLP.
16988//
16989// i32 vecreduce_add(
16990// v16i32 abs(
16991// v16i32 sub(
16992// v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
16993// =================>
16994// i32 vecreduce_add(
16995// v4i32 UADDLP(
16996// v8i16 add(
16997// v8i16 zext(
16998// v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
16999// v8i16 zext(
17000// v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
17001static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
17002 SelectionDAG &DAG) {
17003 // Assumed i32 vecreduce_add
17004 if (N->getValueType(0) != MVT::i32)
17005 return SDValue();
17006
17007 SDValue VecReduceOp0 = N->getOperand(0);
17008 unsigned Opcode = VecReduceOp0.getOpcode();
17009 // Assumed v16i32 abs
17010 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != MVT::v16i32)
17011 return SDValue();
17012
17013 SDValue ABS = VecReduceOp0;
17014 // Assumed v16i32 sub
17015 if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
17016 ABS->getOperand(0)->getValueType(0) != MVT::v16i32)
17017 return SDValue();
17018
17019 SDValue SUB = ABS->getOperand(0);
17020 unsigned Opcode0 = SUB->getOperand(0).getOpcode();
17021 unsigned Opcode1 = SUB->getOperand(1).getOpcode();
17022 // Assumed v16i32 type
17023 if (SUB->getOperand(0)->getValueType(0) != MVT::v16i32 ||
17024 SUB->getOperand(1)->getValueType(0) != MVT::v16i32)
17025 return SDValue();
17026
17027 // Assumed zext or sext
17028 bool IsZExt = false;
17029 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
17030 IsZExt = true;
17031 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
17032 IsZExt = false;
17033 } else
17034 return SDValue();
17035
17036 SDValue EXT0 = SUB->getOperand(0);
17037 SDValue EXT1 = SUB->getOperand(1);
17038 // Assumed zext's operand has v16i8 type
17039 if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
17040 EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
17041 return SDValue();
17042
17043 // Pattern is detected. Let's convert it to a sequence of nodes.
17044 SDLoc DL(N);
17045
17046 // First, create the node pattern of UABD/SABD.
17047 SDValue UABDHigh8Op0 =
17048 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
17049 DAG.getConstant(8, DL, MVT::i64));
17050 SDValue UABDHigh8Op1 =
17051 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
17052 DAG.getConstant(8, DL, MVT::i64));
17053 SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
17054 UABDHigh8Op0, UABDHigh8Op1);
17055 SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
17056
17057 // Second, create the node pattern of UABAL.
17058 SDValue UABDLo8Op0 =
17059 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
17060 DAG.getConstant(0, DL, MVT::i64));
17061 SDValue UABDLo8Op1 =
17062 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
17063 DAG.getConstant(0, DL, MVT::i64));
17064 SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
17065 UABDLo8Op0, UABDLo8Op1);
17066 SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
17067 SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
17068
17069 // Third, create the node of UADDLP.
17070 SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
17071
17072 // Fourth, create the node of VECREDUCE_ADD.
17073 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
17074}
17075
17076// Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
17077// vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
17078// vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
17079// If we have vectors larger than v16i8 we extract v16i8 vectors, follow the
17080// same steps above to get DOT instructions, concatenate them,
17081// and generate vecreduce.add(concat_vector(DOT, DOT2, ..)).
17082static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
17083 const AArch64Subtarget *ST) {
17084 if (!ST->hasDotProd())
17085 return performVecReduceAddCombineWithUADDLP(N, DAG);
17086
17087 SDValue Op0 = N->getOperand(0);
17088 if (N->getValueType(0) != MVT::i32 || Op0.getValueType().isScalableVT() ||
17089 Op0.getValueType().getVectorElementType() != MVT::i32)
17090 return SDValue();
17091
17092 unsigned ExtOpcode = Op0.getOpcode();
17093 SDValue A = Op0;
17094 SDValue B;
17095 if (ExtOpcode == ISD::MUL) {
17096 A = Op0.getOperand(0);
17097 B = Op0.getOperand(1);
17098 if (A.getOpcode() != B.getOpcode() ||
17099 A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
17100 return SDValue();
17101 ExtOpcode = A.getOpcode();
17102 }
17103 if (ExtOpcode != ISD::ZERO_EXTEND && ExtOpcode != ISD::SIGN_EXTEND)
17104 return SDValue();
17105
17106 EVT Op0VT = A.getOperand(0).getValueType();
17107 bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0;
17108 bool IsValidSize = Op0VT.getScalarSizeInBits() == 8;
17109 if (!IsValidElementCount || !IsValidSize)
17110 return SDValue();
17111
17112 SDLoc DL(Op0);
17113 // For non-mla reductions B can be set to 1. For MLA we take the operand of
17114 // the extend B.
17115 if (!B)
17116 B = DAG.getConstant(1, DL, Op0VT);
17117 else
17118 B = B.getOperand(0);
17119
17120 unsigned IsMultipleOf16 = Op0VT.getVectorNumElements() % 16 == 0;
17121 unsigned NumOfVecReduce;
17122 EVT TargetType;
17123 if (IsMultipleOf16) {
17124 NumOfVecReduce = Op0VT.getVectorNumElements() / 16;
17125 TargetType = MVT::v4i32;
17126 } else {
17127 NumOfVecReduce = Op0VT.getVectorNumElements() / 8;
17128 TargetType = MVT::v2i32;
17129 }
17130 auto DotOpcode =
17131 (ExtOpcode == ISD::ZERO_EXTEND) ? AArch64ISD::UDOT : AArch64ISD::SDOT;
17132 // Handle the case where we need to generate only one Dot operation.
17133 if (NumOfVecReduce == 1) {
17134 SDValue Zeros = DAG.getConstant(0, DL, TargetType);
17135 SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
17136 A.getOperand(0), B);
17137 return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
17138 }
17139 // Generate Dot instructions for the chunks that are multiples of 16.
17140 unsigned VecReduce16Num = Op0VT.getVectorNumElements() / 16;
17141 SmallVector<SDValue, 4> SDotVec16;
17142 unsigned I = 0;
17143 for (; I < VecReduce16Num; I += 1) {
17144 SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32);
17145 SDValue Op0 =
17146 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, A.getOperand(0),
17147 DAG.getConstant(I * 16, DL, MVT::i64));
17148 SDValue Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, B,
17149 DAG.getConstant(I * 16, DL, MVT::i64));
17150 SDValue Dot =
17151 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Op0, Op1);
17152 SDotVec16.push_back(Dot);
17153 }
17154 // Concatenate dot operations.
17155 EVT SDot16EVT =
17156 EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4 * VecReduce16Num);
17157 SDValue ConcatSDot16 =
17158 DAG.getNode(ISD::CONCAT_VECTORS, DL, SDot16EVT, SDotVec16);
17159 SDValue VecReduceAdd16 =
17160 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), ConcatSDot16);
17161 unsigned VecReduce8Num = (Op0VT.getVectorNumElements() % 16) / 8;
17162 if (VecReduce8Num == 0)
17163 return VecReduceAdd16;
17164
17165 // Generate the remaining Dot operation for the leftover group of 8.
17166 SmallVector<SDValue, 4> SDotVec8;
17167 SDValue Zeros = DAG.getConstant(0, DL, MVT::v2i32);
17168 SDValue Vec8Op0 =
17169 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, A.getOperand(0),
17170 DAG.getConstant(I * 16, DL, MVT::i64));
17171 SDValue Vec8Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, B,
17172 DAG.getConstant(I * 16, DL, MVT::i64));
17173 SDValue Dot =
17174 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Vec8Op0, Vec8Op1);
17175 SDValue VecReduceAdd8 =
17176 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
17177 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), VecReduceAdd16,
17178 VecReduceAdd8);
17179}
17180
17181// Given an (integer) vecreduce, we know the order of the inputs does not
17182// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
17183// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
17184// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
17185static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG) {
17186 auto DetectAddExtract = [&](SDValue A) {
17187 // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
17188 // UADDLP(x) if found.
17189 assert(A.getOpcode() == ISD::ADD);
17190 EVT VT = A.getValueType();
17191 SDValue Op0 = A.getOperand(0);
17192 SDValue Op1 = A.getOperand(1);
17193 if (Op0.getOpcode() != Op1.getOpcode() ||
17194 (Op0.getOpcode() != ISD::ZERO_EXTEND &&
17195 Op0.getOpcode() != ISD::SIGN_EXTEND))
17196 return SDValue();
17197 SDValue Ext0 = Op0.getOperand(0);
17198 SDValue Ext1 = Op1.getOperand(0);
17199 if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
17200 Ext1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
17201 Ext0.getOperand(0) != Ext1.getOperand(0))
17202 return SDValue();
17203 // Check that the type is twice the add types, and the extract are from
17204 // upper/lower parts of the same source.
17205 if (Ext0.getOperand(0).getValueType().getVectorNumElements() !=
17206 VT.getVectorNumElements() * 2)
17207 return SDValue();
17208 if ((Ext0.getConstantOperandVal(1) != 0 ||
17209 Ext1.getConstantOperandVal(1) != VT.getVectorNumElements()) &&
17210 (Ext1.getConstantOperandVal(1) != 0 ||
17211 Ext0.getConstantOperandVal(1) != VT.getVectorNumElements()))
17212 return SDValue();
17213 unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
17214 : AArch64ISD::SADDLP;
17215 return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
17216 };
17217
17218 if (SDValue R = DetectAddExtract(A))
17219 return R;
17220
17221 if (A.getOperand(0).getOpcode() == ISD::ADD && A.getOperand(0).hasOneUse())
17222 if (SDValue R = performUADDVAddCombine(A.getOperand(0), DAG))
17223 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
17224 A.getOperand(1));
17225 if (A.getOperand(1).getOpcode() == ISD::ADD && A.getOperand(1).hasOneUse())
17226 if (SDValue R = performUADDVAddCombine(A.getOperand(1), DAG))
17227 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
17228 A.getOperand(0));
17229 return SDValue();
17230}
17231
17232// We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into
17233// UADDLV(concat), where the concat represents the 64-bit zext sources.
17234static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG) {
17235 // Look for add(zext(64-bit source), zext(64-bit source)), returning
17236 // UADDLV(concat(zext, zext)) if found.
17237 assert(A.getOpcode() == ISD::ADD);
17238 EVT VT = A.getValueType();
17239 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
17240 return SDValue();
17241 SDValue Op0 = A.getOperand(0);
17242 SDValue Op1 = A.getOperand(1);
17243 if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOpcode() != Op1.getOpcode())
17244 return SDValue();
17245 SDValue Ext0 = Op0.getOperand(0);
17246 SDValue Ext1 = Op1.getOperand(0);
17247 EVT ExtVT0 = Ext0.getValueType();
17248 EVT ExtVT1 = Ext1.getValueType();
17249 // Check zext VTs are the same and 64-bit length.
17250 if (ExtVT0 != ExtVT1 ||
17251 VT.getScalarSizeInBits() != (2 * ExtVT0.getScalarSizeInBits()))
17252 return SDValue();
17253 // Get VT for concat of zext sources.
17254 EVT PairVT = ExtVT0.getDoubleNumVectorElementsVT(*DAG.getContext());
17255 SDValue Concat =
17256 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(A), PairVT, Ext0, Ext1);
17257
17258 switch (VT.getSimpleVT().SimpleTy) {
17259 case MVT::v2i64:
17260 case MVT::v4i32:
17261 return DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), VT, Concat);
17262 case MVT::v8i16: {
17263 SDValue Uaddlv =
17264 DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), MVT::v4i32, Concat);
17265 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(A), MVT::v8i16, Uaddlv);
17266 }
17267 default:
17268 llvm_unreachable("Unhandled vector type");
17269 }
17270}
17271
17272static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
17273 SDValue A = N->getOperand(0);
17274 if (A.getOpcode() == ISD::ADD) {
17275 if (SDValue R = performUADDVAddCombine(A, DAG))
17276 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
17277 else if (SDValue R = performUADDVZextCombine(A, DAG))
17278 return R;
17279 }
17280 return SDValue();
17281}
17282
17283static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
17284 TargetLowering::DAGCombinerInfo &DCI,
17285 const AArch64Subtarget *Subtarget) {
17286 if (DCI.isBeforeLegalizeOps())
17287 return SDValue();
17288
17289 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
17290}
17291
17292SDValue
17293AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
17294 SelectionDAG &DAG,
17295 SmallVectorImpl<SDNode *> &Created) const {
17296 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
17297 if (isIntDivCheap(N->getValueType(0), Attr))
17298 return SDValue(N,0); // Lower SDIV as SDIV
17299
17300 EVT VT = N->getValueType(0);
17301
17302 // For scalable and fixed types, mark them as cheap so we can handle it much
17303 // later. This allows us to handle larger than legal types.
17304 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
17305 return SDValue(N, 0);
17306
17307 // fold (sdiv X, pow2)
17308 if ((VT != MVT::i32 && VT != MVT::i64) ||
17309 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
17310 return SDValue();
17311
17312 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
17313}
17314
17315SDValue
17316AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
17317 SelectionDAG &DAG,
17318 SmallVectorImpl<SDNode *> &Created) const {
17319 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
17320 if (isIntDivCheap(N->getValueType(0), Attr))
17321 return SDValue(N, 0); // Lower SREM as SREM
17322
17323 EVT VT = N->getValueType(0);
17324
17325 // For scalable and fixed types, mark them as cheap so we can handle it much
17326 // later. This allows us to handle larger than legal types.
17327 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
17328 return SDValue(N, 0);
17329
17330 // fold (srem X, pow2)
17331 if ((VT != MVT::i32 && VT != MVT::i64) ||
17332 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
17333 return SDValue();
17334
17335 unsigned Lg2 = Divisor.countr_zero();
17336 if (Lg2 == 0)
17337 return SDValue();
17338
17339 SDLoc DL(N);
17340 SDValue N0 = N->getOperand(0);
17341 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
17342 SDValue Zero = DAG.getConstant(0, DL, VT);
17343 SDValue CCVal, CSNeg;
17344 if (Lg2 == 1) {
17345 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL);
17346 SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
17347 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);
17348
17349 Created.push_back(Cmp.getNode());
17350 Created.push_back(And.getNode());
17351 } else {
17352 SDValue CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC);
17353 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
17354
17355 SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
17356 SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
17357 SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
17358 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
17359 Negs.getValue(1));
17360
17361 Created.push_back(Negs.getNode());
17362 Created.push_back(AndPos.getNode());
17363 Created.push_back(AndNeg.getNode());
17364 }
17365
17366 return CSNeg;
17367}
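// Worked example (illustrative sketch of the DAG built above, not the exact
// emitted code): for srem x, 8 (Lg2 = 3) it emits SUBS(0, x) to set the flags,
// ANDs both x and -x with 7, and selects with CSNEG on the MI condition, so a
// positive x yields x & 7 and a negative x yields -((-x) & 7).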
17368
17369static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
17370 switch(getIntrinsicID(S.getNode())) {
17371 default:
17372 break;
17373 case Intrinsic::aarch64_sve_cntb:
17374 return 8;
17375 case Intrinsic::aarch64_sve_cnth:
17376 return 16;
17377 case Intrinsic::aarch64_sve_cntw:
17378 return 32;
17379 case Intrinsic::aarch64_sve_cntd:
17380 return 64;
17381 }
17382 return {};
17383}
17384
17385/// Calculates what the pre-extend type is, based on the extension
17386/// operation node provided by \p Extend.
17387///
17388/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
17389/// pre-extend type is pulled directly from the operand, while other extend
17390/// operations need a bit more inspection to get this information.
17391///
17392/// \param Extend The SDNode from the DAG that represents the extend operation
17393///
17394/// \returns The type representing the \p Extend source type, or \p MVT::Other
17395/// if no valid type can be determined
17396static EVT calculatePreExtendType(SDValue Extend) {
17397 switch (Extend.getOpcode()) {
17398 case ISD::SIGN_EXTEND:
17399 case ISD::ZERO_EXTEND:
17400 return Extend.getOperand(0).getValueType();
17401 case ISD::AssertSext:
17402 case ISD::AssertZext:
17403 case ISD::SIGN_EXTEND_INREG: {
17404 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
17405 if (!TypeNode)
17406 return MVT::Other;
17407 return TypeNode->getVT();
17408 }
17409 case ISD::AND: {
17410 ConstantSDNode *Constant =
17411 dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode());
17412 if (!Constant)
17413 return MVT::Other;
17414
17415 uint32_t Mask = Constant->getZExtValue();
17416
17417 if (Mask == UCHAR_MAX)
17418 return MVT::i8;
17419 else if (Mask == USHRT_MAX)
17420 return MVT::i16;
17421 else if (Mask == UINT_MAX)
17422 return MVT::i32;
17423
17424 return MVT::Other;
17425 }
17426 default:
17427 return MVT::Other;
17428 }
17429}
17430
17431/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
17432/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
17433/// SExt/ZExt rather than the scalar SExt/ZExt
17434static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) {
17435 EVT VT = BV.getValueType();
17436 if (BV.getOpcode() != ISD::BUILD_VECTOR &&
17437 BV.getOpcode() != ISD::VECTOR_SHUFFLE)
17438 return SDValue();
17439
17440 // Use the first item in the buildvector/shuffle to get the size of the
17441 // extend, and make sure it looks valid.
17442 SDValue Extend = BV->getOperand(0);
17443 unsigned ExtendOpcode = Extend.getOpcode();
17444 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
17445 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
17446 ExtendOpcode == ISD::AssertSext;
17447 if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
17448 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
17449 return SDValue();
17450 // Shuffle inputs are vector, limit to SIGN_EXTEND and ZERO_EXTEND to ensure
17451 // calculatePreExtendType will work without issue.
17452 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
17453 ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
17454 return SDValue();
17455
17456 // Restrict valid pre-extend data type
17457 EVT PreExtendType = calculatePreExtendType(Extend);
17458 if (PreExtendType == MVT::Other ||
17459 PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
17460 return SDValue();
17461
17462 // Make sure all other operands are equally extended
17463 for (SDValue Op : drop_begin(BV->ops())) {
17464 if (Op.isUndef())
17465 continue;
17466 unsigned Opc = Op.getOpcode();
17467 bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
17468 Opc == ISD::AssertSext;
17469 if (OpcIsSExt != IsSExt || calculatePreExtendType(Op) != PreExtendType)
17470 return SDValue();
17471 }
17472
17473 SDValue NBV;
17474 SDLoc DL(BV);
17475 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
17476 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType);
17477 EVT PreExtendLegalType =
17478 PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
17479 SmallVector<SDValue, 8> NewOps;
17480 for (SDValue Op : BV->ops())
17481 NewOps.push_back(Op.isUndef() ? DAG.getUNDEF(PreExtendLegalType)
17482 : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL,
17483 PreExtendLegalType));
17484 NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps);
17485 } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
17486 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType.getScalarType());
17487 NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0),
17488 BV.getOperand(1).isUndef()
17489 ? DAG.getUNDEF(PreExtendVT)
17490 : BV.getOperand(1).getOperand(0),
17491 cast<ShuffleVectorSDNode>(BV)->getMask());
17492 }
17493 return DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, VT, NBV);
17494}
17495
17496/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
17497/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
17498static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {
17499 // If the value type isn't a vector, none of the operands are going to be dups
17500 EVT VT = Mul->getValueType(0);
17501 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
17502 return SDValue();
17503
17504 SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG);
17505 SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG);
17506
17507 // Neither operands have been changed, don't make any further changes
17508 if (!Op0 && !Op1)
17509 return SDValue();
17510
17511 SDLoc DL(Mul);
17512 return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0),
17513 Op1 ? Op1 : Mul->getOperand(1));
17514}
17515
17516// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
17517// Same for other types with equivalent constants.
17518static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) {
17519 EVT VT = N->getValueType(0);
17520 if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
17521 VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
17522 return SDValue();
17523 if (N->getOperand(0).getOpcode() != ISD::AND ||
17524 N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
17525 return SDValue();
17526
17527 SDValue And = N->getOperand(0);
17528 SDValue Srl = And.getOperand(0);
17529
17530 APInt V1, V2, V3;
17531 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
17532 !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
17533 !ISD::isConstantSplatVector(Srl.getOperand(1).getNode(), V3))
17534 return SDValue();
17535
17536 unsigned HalfSize = VT.getScalarSizeInBits() / 2;
17537 if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
17538 V3 != (HalfSize - 1))
17539 return SDValue();
17540
17541 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
17542 EVT::getIntegerVT(*DAG.getContext(), HalfSize),
17543 VT.getVectorElementCount() * 2);
17544
17545 SDLoc DL(N);
17546 SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0));
17547 SDValue CM = DAG.getNode(AArch64ISD::CMLTz, DL, HalfVT, In);
17548 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
17549}
17550
17553 const AArch64Subtarget *Subtarget) {
17554
17555 if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
17556 return Ext;
17557 if (SDValue Ext = performMulVectorCmpZeroCombine(N, DAG))
17558 return Ext;
17559
17560 if (DCI.isBeforeLegalizeOps())
17561 return SDValue();
17562
17563 // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
17564 // and in MachineCombiner pass, add+mul will be combined into madd.
17565 // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
17566 SDLoc DL(N);
17567 EVT VT = N->getValueType(0);
17568 SDValue N0 = N->getOperand(0);
17569 SDValue N1 = N->getOperand(1);
17570 SDValue MulOper;
17571 unsigned AddSubOpc;
17572
17573 auto IsAddSubWith1 = [&](SDValue V) -> bool {
17574 AddSubOpc = V->getOpcode();
17575 if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
17576 SDValue Opnd = V->getOperand(1);
17577 MulOper = V->getOperand(0);
17578 if (AddSubOpc == ISD::SUB)
17579 std::swap(Opnd, MulOper);
17580 if (auto C = dyn_cast<ConstantSDNode>(Opnd))
17581 return C->isOne();
17582 }
17583 return false;
17584 };
17585
17586 if (IsAddSubWith1(N0)) {
17587 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper);
17588 return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal);
17589 }
17590
17591 if (IsAddSubWith1(N1)) {
17592 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper);
17593 return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal);
17594 }
17595
17596 // The below optimizations require a constant RHS.
17597 if (!isa<ConstantSDNode>(N1))
17598 return SDValue();
17599
17600 ConstantSDNode *C = cast<ConstantSDNode>(N1);
17601 const APInt &ConstValue = C->getAPIntValue();
17602
17603 // Allow the scaling to be folded into the `cnt` instruction by preventing
17604 // the scaling to be obscured here. This makes it easier to pattern match.
17605 if (IsSVECntIntrinsic(N0) ||
17606 (N0->getOpcode() == ISD::TRUNCATE &&
17607 (IsSVECntIntrinsic(N0->getOperand(0)))))
17608 if (ConstValue.sge(1) && ConstValue.sle(16))
17609 return SDValue();
17610
17611 // Multiplication of a power of two plus/minus one can be done more
17612 // cheaply as shift+add/sub. For now, this is true unilaterally. If
17613 // future CPUs have a cheaper MADD instruction, this may need to be
17614 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
17615 // 64-bit is 5 cycles, so this is always a win.
17616 // More aggressively, some multiplications N0 * C can be lowered to
17617 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
17618 // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
17619 // TODO: lower more cases.
17620
17621 // TrailingZeroes is used to test if the mul can be lowered to
17622 // shift+add+shift.
17623 unsigned TrailingZeroes = ConstValue.countr_zero();
17624 if (TrailingZeroes) {
17625 // Conservatively do not lower to shift+add+shift if the mul might be
17626 // folded into smul or umul.
17627 if (N0->hasOneUse() && (isSignExtended(N0, DAG) ||
17628 isZeroExtended(N0, DAG)))
17629 return SDValue();
17630 // Conservatively do not lower to shift+add+shift if the mul might be
17631 // folded into madd or msub.
17632 if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
17633 N->use_begin()->getOpcode() == ISD::SUB))
17634 return SDValue();
17635 }
17636 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
17637 // and shift+add+shift.
17638 APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
17639 unsigned ShiftAmt;
17640
17641 auto Shl = [&](SDValue N0, unsigned N1) {
17642 SDValue RHS = DAG.getConstant(N1, DL, MVT::i64);
17643 return DAG.getNode(ISD::SHL, DL, VT, N0, RHS);
17644 };
17645 auto Add = [&](SDValue N0, SDValue N1) {
17646 return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
17647 };
17648 auto Sub = [&](SDValue N0, SDValue N1) {
17649 return DAG.getNode(ISD::SUB, DL, VT, N0, N1);
17650 };
17651 auto Negate = [&](SDValue N) {
17652 SDValue Zero = DAG.getConstant(0, DL, VT);
17653 return DAG.getNode(ISD::SUB, DL, VT, Zero, N);
17654 };
17655
17656 // Can the const C be decomposed into (1+2^M1)*(1+2^N1), eg:
17657 // C = 45 is equal to (1+4)*(1+8); we don't decompose it into (1+2)*(16-1) as
17658 // the (2^N - 1) can't be executed with a single instruction.
17659 auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
17660 unsigned BitWidth = C.getBitWidth();
17661 for (unsigned i = 1; i < BitWidth / 2; i++) {
17662 APInt Rem;
17663 APInt X(BitWidth, (1 << i) + 1);
17664 APInt::sdivrem(C, X, N, Rem);
17665 APInt NVMinus1 = N - 1;
17666 if (Rem == 0 && NVMinus1.isPowerOf2()) {
17667 M = X;
17668 return true;
17669 }
17670 }
17671 return false;
17672 };
17673
17674 // Can the const C be decomposed into ((2^M + 1) * 2^N + 1), eg:
17675 // C = 11 is equal to (1+4)*2+1; we don't decompose it into (1+2)*4-1 as
17676 // the (2^N - 1) can't be executed with a single instruction.
17677 auto isPowPlusPlusOneConst = [](APInt C, APInt &M, APInt &N) {
17678 APInt CVMinus1 = C - 1;
17679 if (CVMinus1.isNegative())
17680 return false;
17681 unsigned TrailingZeroes = CVMinus1.countr_zero();
17682 APInt SCVMinus1 = CVMinus1.ashr(TrailingZeroes) - 1;
17683 if (SCVMinus1.isPowerOf2()) {
17684 unsigned BitWidth = SCVMinus1.getBitWidth();
17685 M = APInt(BitWidth, SCVMinus1.logBase2());
17686 N = APInt(BitWidth, TrailingZeroes);
17687 return true;
17688 }
17689 return false;
17690 };
17691
17692 // Can the const C be decomposed into (1 - (1 - 2^M) * 2^N), eg:
17693 // C = 29 is equal to 1 - (1 - 2^3) * 2^2.
17694 auto isPowMinusMinusOneConst = [](APInt C, APInt &M, APInt &N) {
17695 APInt CVMinus1 = C - 1;
17696 if (CVMinus1.isNegative())
17697 return false;
17698 unsigned TrailingZeroes = CVMinus1.countr_zero();
17699 APInt CVPlus1 = CVMinus1.ashr(TrailingZeroes) + 1;
17700 if (CVPlus1.isPowerOf2()) {
17701 unsigned BitWidth = CVPlus1.getBitWidth();
17702 M = APInt(BitWidth, CVPlus1.logBase2());
17703 N = APInt(BitWidth, TrailingZeroes);
17704 return true;
17705 }
17706 return false;
17707 };
17708
17709 if (ConstValue.isNonNegative()) {
17710 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
17711 // (mul x, 2^N - 1) => (sub (shl x, N), x)
17712 // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
17713 // (mul x, (2^M + 1) * (2^N + 1))
17714 // => MV = (add (shl x, M), x); (add (shl MV, N), MV)
17715 // (mul x, (2^M + 1) * 2^N + 1))
17716 // => MV = add (shl x, M), x); add (shl MV, N), x)
17717 // (mul x, 1 - (1 - 2^M) * 2^N))
17718 // => MV = sub (x - (shl x, M)); sub (x - (shl MV, N))
17719 APInt SCVMinus1 = ShiftedConstValue - 1;
17720 APInt SCVPlus1 = ShiftedConstValue + 1;
17721 APInt CVPlus1 = ConstValue + 1;
17722 APInt CVM, CVN;
17723 if (SCVMinus1.isPowerOf2()) {
17724 ShiftAmt = SCVMinus1.logBase2();
17725 return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
17726 } else if (CVPlus1.isPowerOf2()) {
17727 ShiftAmt = CVPlus1.logBase2();
17728 return Sub(Shl(N0, ShiftAmt), N0);
17729 } else if (SCVPlus1.isPowerOf2()) {
17730 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
17731 return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
17732 }
17733 if (Subtarget->hasALULSLFast() &&
17734 isPowPlusPlusConst(ConstValue, CVM, CVN)) {
17735 APInt CVMMinus1 = CVM - 1;
17736 APInt CVNMinus1 = CVN - 1;
17737 unsigned ShiftM1 = CVMMinus1.logBase2();
17738 unsigned ShiftN1 = CVNMinus1.logBase2();
17739 // ALULSLFast implies that shifts of up to 4 places are fast
17740 if (ShiftM1 <= 4 && ShiftN1 <= 4) {
17741 SDValue MVal = Add(Shl(N0, ShiftM1), N0);
17742 return Add(Shl(MVal, ShiftN1), MVal);
17743 }
17744 }
17745 if (Subtarget->hasALULSLFast() &&
17746 isPowPlusPlusOneConst(ConstValue, CVM, CVN)) {
17747 unsigned ShiftM = CVM.getZExtValue();
17748 unsigned ShiftN = CVN.getZExtValue();
17749 // ALULSLFast implies that shifts of up to 4 places are fast
17750 if (ShiftM <= 4 && ShiftN <= 4) {
17751 SDValue MVal = Add(Shl(N0, CVM.getZExtValue()), N0);
17752 return Add(Shl(MVal, CVN.getZExtValue()), N0);
17753 }
17754 }
17755
17756 if (Subtarget->hasALULSLFast() &&
17757 isPowMinusMinusOneConst(ConstValue, CVM, CVN)) {
17758 unsigned ShiftM = CVM.getZExtValue();
17759 unsigned ShiftN = CVN.getZExtValue();
17760 // ALULSLFast implies that shifts of up to 4 places are fast
17761 if (ShiftM <= 4 && ShiftN <= 4) {
17762 SDValue MVal = Sub(N0, Shl(N0, CVM.getZExtValue()));
17763 return Sub(N0, Shl(MVal, CVN.getZExtValue()));
17764 }
17765 }
17766 } else {
17767 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
17768 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
17769 // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
17770 APInt SCVPlus1 = -ShiftedConstValue + 1;
17771 APInt CVNegPlus1 = -ConstValue + 1;
17772 APInt CVNegMinus1 = -ConstValue - 1;
17773 if (CVNegPlus1.isPowerOf2()) {
17774 ShiftAmt = CVNegPlus1.logBase2();
17775 return Sub(N0, Shl(N0, ShiftAmt));
17776 } else if (CVNegMinus1.isPowerOf2()) {
17777 ShiftAmt = CVNegMinus1.logBase2();
17778 return Negate(Add(Shl(N0, ShiftAmt), N0));
17779 } else if (SCVPlus1.isPowerOf2()) {
17780 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
17781 return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
17782 }
17783 }
17784
17785 return SDValue();
17786}
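// Worked examples (illustrative) of the decompositions above: x * 7 becomes
// (x << 3) - x; x * 6 becomes ((x << 1) + x) << 1; and x * 45, using
// 45 = (1 + 4) * (1 + 8), becomes MV = (x << 2) + x followed by
// (MV << 3) + MV, which is only attempted when ALULSLFast is available.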
17787
17788static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
17789 SelectionDAG &DAG) {
17790 // Take advantage of vector comparisons producing 0 or -1 in each lane to
17791 // optimize away operation when it's from a constant.
17792 //
17793 // The general transformation is:
17794 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
17795 // AND(VECTOR_CMP(x,y), constant2)
17796 // constant2 = UNARYOP(constant)
17797
17798 // Early exit if this isn't a vector operation, the operand of the
17799 // unary operation isn't a bitwise AND, or if the sizes of the operations
17800 // aren't the same.
17801 EVT VT = N->getValueType(0);
17802 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
17803 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
17804 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
17805 return SDValue();
17806
17807 // Now check that the other operand of the AND is a constant. We could
17808 // make the transformation for non-constant splats as well, but it's unclear
17809 // that would be a benefit as it would not eliminate any operations, just
17810 // perform one more step in scalar code before moving to the vector unit.
17811 if (BuildVectorSDNode *BV =
17812 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
17813 // Bail out if the vector isn't a constant.
17814 if (!BV->isConstant())
17815 return SDValue();
17816
17817 // Everything checks out. Build up the new and improved node.
17818 SDLoc DL(N);
17819 EVT IntVT = BV->getValueType(0);
17820 // Create a new constant of the appropriate type for the transformed
17821 // DAG.
17822 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
17823 // The AND node needs bitcasts to/from an integer vector type around it.
17824 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
17825 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
17826 N->getOperand(0)->getOperand(0), MaskConst);
17827 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
17828 return Res;
17829 }
17830
17831 return SDValue();
17832}
17833
17834static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
17835 const AArch64Subtarget *Subtarget) {
17836 // First try to optimize away the conversion when it's conditionally from
17837 // a constant. Vectors only.
17838 if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
17839 return Res;
17840
17841 EVT VT = N->getValueType(0);
17842 if (VT != MVT::f32 && VT != MVT::f64)
17843 return SDValue();
17844
17845 // Only optimize when the source and destination types have the same width.
17846 if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
17847 return SDValue();
17848
17849 // If the result of an integer load is only used by an integer-to-float
17850 // conversion, use an fp load and an AdvSIMD scalar {S|U}CVTF instead.
17851 // This eliminates an "integer-to-vector-move" UOP and improves throughput.
17852 SDValue N0 = N->getOperand(0);
17853 if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N0.getNode()) &&
17854 N0.hasOneUse() &&
17855 // Do not change the width of a volatile load.
17856 !cast<LoadSDNode>(N0)->isVolatile()) {
17857 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
17858 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
17859 LN0->getPointerInfo(), LN0->getAlign(),
17860 LN0->getMemOperand()->getFlags());
17861
17862 // Make sure successors of the original load stay after it by updating them
17863 // to use the new Chain.
17864 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
17865
17866 unsigned Opcode =
17867 N0.getOpcode() == ISD::SINT_TO_FP ? AArch64ISD::SITOF : AArch64ISD::UITOF;
17868 return DAG.getNode(Opcode, SDLoc(N), VT, Load);
17869 }
17870
17871 return SDValue();
17872}
17873
17874/// Fold a floating-point multiply by power of two into floating-point to
17875/// fixed-point conversion.
17876static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
17877 TargetLowering::DAGCombinerInfo &DCI,
17878 const AArch64Subtarget *Subtarget) {
17879 if (!Subtarget->isNeonAvailable())
17880 return SDValue();
17881
17882 if (!N->getValueType(0).isSimple())
17883 return SDValue();
17884
17885 SDValue Op = N->getOperand(0);
17886 if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
17887 return SDValue();
17888
17889 if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
17890 return SDValue();
17891
17892 SDValue ConstVec = Op->getOperand(1);
17893 if (!isa<BuildVectorSDNode>(ConstVec))
17894 return SDValue();
17895
17896 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
17897 uint32_t FloatBits = FloatTy.getSizeInBits();
17898 if (FloatBits != 32 && FloatBits != 64 &&
17899 (FloatBits != 16 || !Subtarget->hasFullFP16()))
17900 return SDValue();
17901
17902 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
17903 uint32_t IntBits = IntTy.getSizeInBits();
17904 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
17905 return SDValue();
17906
17907 // Avoid conversions where iN is larger than the float (e.g., float -> i64).
17908 if (IntBits > FloatBits)
17909 return SDValue();
17910
17911 BitVector UndefElements;
17912 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
17913 int32_t Bits = IntBits == 64 ? 64 : 32;
17914 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
17915 if (C == -1 || C == 0 || C > Bits)
17916 return SDValue();
17917
17918 EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
17919 if (!DAG.getTargetLoweringInfo().isTypeLegal(ResTy))
17920 return SDValue();
17921
17922 if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
17923 N->getOpcode() == ISD::FP_TO_UINT_SAT) {
17924 EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT();
17925 if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
17926 return SDValue();
17927 }
17928
17929 SDLoc DL(N);
17930 bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
17931 N->getOpcode() == ISD::FP_TO_SINT_SAT);
17932 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
17933 : Intrinsic::aarch64_neon_vcvtfp2fxu;
17934 SDValue FixConv =
17935 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
17936 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
17937 Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
17938 // We can handle smaller integers by generating an extra trunc.
17939 if (IntBits < FloatBits)
17940 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
17941
17942 return FixConv;
17943}
17944
17945/// Fold a floating-point divide by power of two into fixed-point to
17946/// floating-point conversion.
17947static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG,
17948 TargetLowering::DAGCombinerInfo &DCI,
17949 const AArch64Subtarget *Subtarget) {
17950 if (!Subtarget->hasNEON())
17951 return SDValue();
17952
17953 SDValue Op = N->getOperand(0);
17954 unsigned Opc = Op->getOpcode();
17955 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
17956 !Op.getOperand(0).getValueType().isSimple() ||
17957 (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP))
17958 return SDValue();
17959
17960 SDValue ConstVec = N->getOperand(1);
17961 if (!isa<BuildVectorSDNode>(ConstVec))
17962 return SDValue();
17963
17964 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
17965 int32_t IntBits = IntTy.getSizeInBits();
17966 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
17967 return SDValue();
17968
17969 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
17970 int32_t FloatBits = FloatTy.getSizeInBits();
17971 if (FloatBits != 32 && FloatBits != 64)
17972 return SDValue();
17973
17974 // Avoid conversions where iN is larger than the float (e.g., i64 -> float).
17975 if (IntBits > FloatBits)
17976 return SDValue();
17977
17978 BitVector UndefElements;
17979 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
17980 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
17981 if (C == -1 || C == 0 || C > FloatBits)
17982 return SDValue();
17983
17984 MVT ResTy;
17985 unsigned NumLanes = Op.getValueType().getVectorNumElements();
17986 switch (NumLanes) {
17987 default:
17988 return SDValue();
17989 case 2:
17990 ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
17991 break;
17992 case 4:
17993 ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
17994 break;
17995 }
17996
17997 if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
17998 return SDValue();
17999
18000 SDLoc DL(N);
18001 SDValue ConvInput = Op.getOperand(0);
18002 bool IsSigned = Opc == ISD::SINT_TO_FP;
18003 if (IntBits < FloatBits)
18004 ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
18005 ResTy, ConvInput);
18006
18007 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
18008 : Intrinsic::aarch64_neon_vcvtfxu2fp;
18009 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
18010 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
18011 DAG.getConstant(C, DL, MVT::i32));
18012}
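// Illustrative example (not from the source): fdiv(sitofp(v2i32 x), 8.0) with
// a v2f32 result gives C = 3 and folds into the aarch64.neon.vcvtfxs2fp
// intrinsic, i.e. an scvtf with 3 fractional bits.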
18013
18014static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18015 const AArch64TargetLowering &TLI) {
18016 EVT VT = N->getValueType(0);
18017 SelectionDAG &DAG = DCI.DAG;
18018 SDLoc DL(N);
18019 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
18020
18021 if (!VT.isVector())
18022 return SDValue();
18023
18024 // The combining code works for NEON, SVE2 and SME.
18025 if (TLI.useSVEForFixedLengthVectorVT(VT, !Subtarget.isNeonAvailable()) ||
18026 (VT.isScalableVector() && !Subtarget.hasSVE2()))
18027 return SDValue();
18028
18029 SDValue N0 = N->getOperand(0);
18030 if (N0.getOpcode() != ISD::AND)
18031 return SDValue();
18032
18033 SDValue N1 = N->getOperand(1);
18034 if (N1.getOpcode() != ISD::AND)
18035 return SDValue();
18036
18037 // InstCombine does (not (neg a)) => (add a -1).
18038 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
18039 // Loop over all combinations of AND operands.
18040 for (int i = 1; i >= 0; --i) {
18041 for (int j = 1; j >= 0; --j) {
18042 SDValue O0 = N0->getOperand(i);
18043 SDValue O1 = N1->getOperand(j);
18044 SDValue Sub, Add, SubSibling, AddSibling;
18045
18046 // Find a SUB and an ADD operand, one from each AND.
18047 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
18048 Sub = O0;
18049 Add = O1;
18050 SubSibling = N0->getOperand(1 - i);
18051 AddSibling = N1->getOperand(1 - j);
18052 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
18053 Add = O0;
18054 Sub = O1;
18055 AddSibling = N0->getOperand(1 - i);
18056 SubSibling = N1->getOperand(1 - j);
18057 } else
18058 continue;
18059
18059
18060 if (!ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode()))
18061 continue;
18062
18063 // The all-ones constant is always the right-hand operand of the Add.
18064 if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode()))
18065 continue;
18066
18067 if (Sub.getOperand(1) != Add.getOperand(0))
18068 continue;
18069
18070 return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
18071 }
18072 }
18073
18074 // (or (and a b) (and (not a) c)) => (bsl a b c)
18075 // We only have to look for constant vectors here since the general, variable
18076 // case can be handled in TableGen.
18077 unsigned Bits = VT.getScalarSizeInBits();
18078 uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
18079 for (int i = 1; i >= 0; --i)
18080 for (int j = 1; j >= 0; --j) {
18081 APInt Val1, Val2;
18082
18083 if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) &&
18084 ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2) &&
18085 (BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue()) {
18086 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
18087 N0->getOperand(1 - i), N1->getOperand(1 - j));
18088 }
18089 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
18090 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
18091 if (!BVN0 || !BVN1)
18092 continue;
18093
18094 bool FoundMatch = true;
18095 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
18096 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
18097 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
18098 if (!CN0 || !CN1 ||
18099 CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
18100 FoundMatch = false;
18101 break;
18102 }
18103 }
18104 if (FoundMatch)
18105 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
18106 N0->getOperand(1 - i), N1->getOperand(1 - j));
18107 }
18108
18109 return SDValue();
18110}
18111
18112// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
18113// convert to csel(ccmp(.., cc0)), depending on cc1:
18114
18115// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
18116// =>
18117// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
18118//
18119// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
18120// =>
18121// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
18122static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) {
18123 EVT VT = N->getValueType(0);
18124 SDValue CSel0 = N->getOperand(0);
18125 SDValue CSel1 = N->getOperand(1);
18126
18127 if (CSel0.getOpcode() != AArch64ISD::CSEL ||
18128 CSel1.getOpcode() != AArch64ISD::CSEL)
18129 return SDValue();
18130
18131 if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
18132 return SDValue();
18133
18134 if (!isNullConstant(CSel0.getOperand(0)) ||
18135 !isOneConstant(CSel0.getOperand(1)) ||
18136 !isNullConstant(CSel1.getOperand(0)) ||
18137 !isOneConstant(CSel1.getOperand(1)))
18138 return SDValue();
18139
18140 SDValue Cmp0 = CSel0.getOperand(3);
18141 SDValue Cmp1 = CSel1.getOperand(3);
18142 AArch64CC::CondCode CC0 = (AArch64CC::CondCode)CSel0.getConstantOperandVal(2);
18143 AArch64CC::CondCode CC1 = (AArch64CC::CondCode)CSel1.getConstantOperandVal(2);
18144 if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
18145 return SDValue();
18146 if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
18147 Cmp0.getOpcode() == AArch64ISD::SUBS) {
18148 std::swap(Cmp0, Cmp1);
18149 std::swap(CC0, CC1);
18150 }
18151
18152 if (Cmp1.getOpcode() != AArch64ISD::SUBS)
18153 return SDValue();
18154
18155 SDLoc DL(N);
18156 SDValue CCmp, Condition;
18157 unsigned NZCV;
18158
18159 if (N->getOpcode() == ISD::AND) {
18160 AArch64CC::CondCode InvCC0 = AArch64CC::getInvertedCondCode(CC0);
18161 Condition = DAG.getConstant(InvCC0, DL, MVT_CC);
18162 NZCV = AArch64CC::getNZCVToSatisfyCondCode(CC1);
18163 } else {
18164 AArch64CC::CondCode InvCC1 = AArch64CC::getInvertedCondCode(CC1);
18165 Condition = DAG.getConstant(CC0, DL, MVT_CC);
18166 NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvCC1);
18167 }
18168
18169 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
18170
18171 auto *Op1 = dyn_cast<ConstantSDNode>(Cmp1.getOperand(1));
18172 if (Op1 && Op1->getAPIntValue().isNegative() &&
18173 Op1->getAPIntValue().sgt(-32)) {
18174 // CCMP accepts constants in the range [0, 31];
18175 // if Op1 is a constant in the range [-31, -1], we
18176 // can select CCMN instead to avoid the extra mov.
18177 SDValue AbsOp1 =
18178 DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0));
18179 CCmp = DAG.getNode(AArch64ISD::CCMN, DL, MVT_CC, Cmp1.getOperand(0), AbsOp1,
18180 NZCVOp, Condition, Cmp0);
18181 } else {
18182 CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0),
18183 Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
18184 }
18185 return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
18186 CSel0.getOperand(1), DAG.getConstant(CC1, DL, MVT::i32),
18187 CCmp);
18188}
18189
18190static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18191 const AArch64Subtarget *Subtarget,
18192 const AArch64TargetLowering &TLI) {
18193 SelectionDAG &DAG = DCI.DAG;
18194 EVT VT = N->getValueType(0);
18195
18196 if (SDValue R = performANDORCSELCombine(N, DAG))
18197 return R;
18198
18199 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
18200 return SDValue();
18201
18202 if (SDValue Res = tryCombineToBSL(N, DCI, TLI))
18203 return Res;
18204
18205 return SDValue();
18206}
18207
18208static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT) {
18209 if (!MemVT.getVectorElementType().isSimple())
18210 return false;
18211
18212 uint64_t MaskForTy = 0ull;
18213 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
18214 case MVT::i8:
18215 MaskForTy = 0xffull;
18216 break;
18217 case MVT::i16:
18218 MaskForTy = 0xffffull;
18219 break;
18220 case MVT::i32:
18221 MaskForTy = 0xffffffffull;
18222 break;
18223 default:
18224 return false;
18225 break;
18226 }
18227
18228 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
18229 if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
18230 return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
18231
18232 return false;
18233}
18234
18235static SDValue performReinterpretCastCombine(SDNode *N) {
18236 SDValue LeafOp = SDValue(N, 0);
18237 SDValue Op = N->getOperand(0);
18238 while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST &&
18239 LeafOp.getValueType() != Op.getValueType())
18240 Op = Op->getOperand(0);
18241 if (LeafOp.getValueType() == Op.getValueType())
18242 return Op;
18243 return SDValue();
18244}
18245
18246static SDValue performSVEAndCombine(SDNode *N,
18247                                    TargetLowering::DAGCombinerInfo &DCI) {
18248  SelectionDAG &DAG = DCI.DAG;
18249 SDValue Src = N->getOperand(0);
18250 unsigned Opc = Src->getOpcode();
18251
18252 // Zero/any extend of an unsigned unpack
18253 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
18254 SDValue UnpkOp = Src->getOperand(0);
18255 SDValue Dup = N->getOperand(1);
18256
18257 if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
18258 return SDValue();
18259
18260 SDLoc DL(N);
18261 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
18262 if (!C)
18263 return SDValue();
18264
18265 uint64_t ExtVal = C->getZExtValue();
18266
18267 auto MaskAndTypeMatch = [ExtVal](EVT VT) -> bool {
18268 return ((ExtVal == 0xFF && VT == MVT::i8) ||
18269 (ExtVal == 0xFFFF && VT == MVT::i16) ||
18270 (ExtVal == 0xFFFFFFFF && VT == MVT::i32));
18271 };
18272
18273 // If the mask is fully covered by the unpack, we don't need to push
18274 // a new AND onto the operand
18275 EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
18276 if (MaskAndTypeMatch(EltTy))
18277 return Src;
18278
18279 // If this is 'and (uunpklo/hi (extload MemTy -> ExtTy)), mask', then check
18280 // to see if the mask is all-ones of size MemTy.
18281 auto MaskedLoadOp = dyn_cast<MaskedLoadSDNode>(UnpkOp);
18282 if (MaskedLoadOp && (MaskedLoadOp->getExtensionType() == ISD::ZEXTLOAD ||
18283 MaskedLoadOp->getExtensionType() == ISD::EXTLOAD)) {
18284 EVT EltTy = MaskedLoadOp->getMemoryVT().getVectorElementType();
18285 if (MaskAndTypeMatch(EltTy))
18286 return Src;
18287 }
18288
18289    // Truncate to prevent a DUP with an overly wide constant
18290 APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
18291
18292 // Otherwise, make sure we propagate the AND to the operand
18293 // of the unpack
18294 Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0),
18295 DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
18296
18297 SDValue And = DAG.getNode(ISD::AND, DL,
18298 UnpkOp->getValueType(0), UnpkOp, Dup);
18299
18300 return DAG.getNode(Opc, DL, N->getValueType(0), And);
18301 }
18302
18303 if (DCI.isBeforeLegalizeOps())
18304 return SDValue();
18305
18306  // If either side of the AND is an all-active predicate, the AND is a
18307  // no-op and we can return the other operand directly.
18308 if (isAllActivePredicate(DAG, N->getOperand(0)))
18309 return N->getOperand(1);
18310 if (isAllActivePredicate(DAG, N->getOperand(1)))
18311 return N->getOperand(0);
18312
18313  if (!EnableCombineMGatherIntrinsics)
18314    return SDValue();
18315
18316 SDValue Mask = N->getOperand(1);
18317
18318 if (!Src.hasOneUse())
18319 return SDValue();
18320
18321 EVT MemVT;
18322
18323 // SVE load instructions perform an implicit zero-extend, which makes them
18324 // perfect candidates for combining.
18325  switch (Opc) {
18326  case AArch64ISD::LD1_MERGE_ZERO:
18327  case AArch64ISD::LDNF1_MERGE_ZERO:
18328  case AArch64ISD::LDFF1_MERGE_ZERO:
18329    MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
18330    break;
18331  case AArch64ISD::GLD1_MERGE_ZERO:
18332  case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
18333  case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
18334  case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
18335  case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
18336  case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
18337  case AArch64ISD::GLD1_IMM_MERGE_ZERO:
18338  case AArch64ISD::GLDFF1_MERGE_ZERO:
18339  case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
18340  case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
18341  case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
18342  case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
18343  case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
18344  case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
18345  case AArch64ISD::GLDNT1_MERGE_ZERO:
18346    MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
18347    break;
18348 default:
18349 return SDValue();
18350 }
18351
18352 if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
18353 return Src;
18354
18355 return SDValue();
18356}
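// Illustrative example for the combine above (a sketch; the intrinsic and
// types are chosen for illustration only): for an SVE extending load such as
//   svuint32_t x = svld1ub_u32(pg, ptr);   // loads i8, zero-extends to i32
// a following "& 0xff" mask is redundant because the load already
// zero-extends each element, so the AND is dropped and only the load remains.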
18357
18358// Transform and(fcmp(a, b), fcmp(c, d)) into fccmp(fcmp(a, b), c, d)
18359static SDValue performANDSETCCCombine(SDNode *N,
18360                                      TargetLowering::DAGCombinerInfo &DCI) {
18361
18362 // This function performs an optimization on a specific pattern involving
18363 // an AND operation and SETCC (Set Condition Code) node.
18364
18365 SDValue SetCC = N->getOperand(0);
18366 EVT VT = N->getValueType(0);
18367 SelectionDAG &DAG = DCI.DAG;
18368
18369  // If the current node (N) is used by any SELECT instruction, return an
18370  // empty SDValue and skip the optimization, since applying it there could
18371  // produce incorrect results.
18372 for (auto U : N->uses())
18373 if (U->getOpcode() == ISD::SELECT)
18374 return SDValue();
18375
18376 // Check if the operand is a SETCC node with floating-point comparison
18377 if (SetCC.getOpcode() == ISD::SETCC &&
18378 SetCC.getOperand(0).getValueType() == MVT::f32) {
18379
18380 SDValue Cmp;
18381    AArch64CC::CondCode CC;
18382
18383 // Check if the DAG is after legalization and if we can emit the conjunction
18384 if (!DCI.isBeforeLegalize() &&
18385 (Cmp = emitConjunction(DAG, SDValue(N, 0), CC))) {
18386
18387      AArch64CC::CondCode InvertedCC = AArch64CC::getInvertedCondCode(CC);
18388
18389 SDLoc DL(N);
18390 return DAG.getNode(AArch64ISD::CSINC, DL, VT, DAG.getConstant(0, DL, VT),
18391 DAG.getConstant(0, DL, VT),
18392 DAG.getConstant(InvertedCC, DL, MVT::i32), Cmp);
18393 }
18394 }
18395 return SDValue();
18396}
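// Rough example of the conjunction emitted above (a sketch; registers and
// condition codes are placeholders): for
//   int f(float a, float b, float c, float d) { return a < b && c < d; }
// emitConjunction can produce a chain such as
//   fcmp s0, s1 ; fccmp s2, s3, #0, mi ; cset w0, mi
// i.e. a single fccmp chain rather than two setcc results ANDed together.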
18397
18398static SDValue performANDCombine(SDNode *N,
18399                                 TargetLowering::DAGCombinerInfo &DCI) {
18400 SelectionDAG &DAG = DCI.DAG;
18401 SDValue LHS = N->getOperand(0);
18402 SDValue RHS = N->getOperand(1);
18403 EVT VT = N->getValueType(0);
18404
18405 if (SDValue R = performANDORCSELCombine(N, DAG))
18406 return R;
18407
18408 if (SDValue R = performANDSETCCCombine(N,DCI))
18409 return R;
18410
18411 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
18412 return SDValue();
18413
18414 if (VT.isScalableVector())
18415 return performSVEAndCombine(N, DCI);
18416
18417 // The combining code below works only for NEON vectors. In particular, it
18418 // does not work for SVE when dealing with vectors wider than 128 bits.
18419 if (!VT.is64BitVector() && !VT.is128BitVector())
18420 return SDValue();
18421
18422 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
18423 if (!BVN)
18424 return SDValue();
18425
18426 // AND does not accept an immediate, so check if we can use a BIC immediate
18427 // instruction instead. We do this here instead of using a (and x, (mvni imm))
18428 // pattern in isel, because some immediates may be lowered to the preferred
18429 // (and x, (movi imm)) form, even though an mvni representation also exists.
18430 APInt DefBits(VT.getSizeInBits(), 0);
18431 APInt UndefBits(VT.getSizeInBits(), 0);
18432 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
18433 SDValue NewOp;
18434
18435 // Any bits known to already be 0 need not be cleared again, which can help
18436 // reduce the size of the immediate to one supported by the instruction.
18437 KnownBits Known = DAG.computeKnownBits(LHS);
18438 APInt ZeroSplat(VT.getSizeInBits(), 0);
18439 for (unsigned I = 0; I < VT.getSizeInBits() / Known.Zero.getBitWidth(); I++)
18440 ZeroSplat |= Known.Zero.zext(VT.getSizeInBits())
18441 << (Known.Zero.getBitWidth() * I);
18442
18443 DefBits = ~(DefBits | ZeroSplat);
18444 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
18445 DefBits, &LHS)) ||
18446 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
18447 DefBits, &LHS)))
18448 return NewOp;
18449
18450 UndefBits = ~(UndefBits | ZeroSplat);
18451 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
18452 UndefBits, &LHS)) ||
18453 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
18454 UndefBits, &LHS)))
18455 return NewOp;
18456 }
18457
18458 return SDValue();
18459}
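// Illustrative example for the BIC-immediate path above (a sketch): a NEON AND
// with the constant 0xffffff00 in each 32-bit lane has no AND-immediate
// encoding, but its complement 0xff does, so it can be emitted as
//   bic v0.4s, #255
// clearing the low byte of every lane instead of materializing the full mask.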
18460
18461static SDValue performFADDCombine(SDNode *N,
18462                                  TargetLowering::DAGCombinerInfo &DCI) {
18463 SelectionDAG &DAG = DCI.DAG;
18464 SDValue LHS = N->getOperand(0);
18465 SDValue RHS = N->getOperand(1);
18466 EVT VT = N->getValueType(0);
18467 SDLoc DL(N);
18468
18469 if (!N->getFlags().hasAllowReassociation())
18470 return SDValue();
18471
18472 // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), b, c)
18473 auto ReassocComplex = [&](SDValue A, SDValue B) {
18474 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
18475 return SDValue();
18476 unsigned Opc = A.getConstantOperandVal(0);
18477 if (Opc != Intrinsic::aarch64_neon_vcmla_rot0 &&
18478 Opc != Intrinsic::aarch64_neon_vcmla_rot90 &&
18479 Opc != Intrinsic::aarch64_neon_vcmla_rot180 &&
18480 Opc != Intrinsic::aarch64_neon_vcmla_rot270)
18481 return SDValue();
18482 SDValue VCMLA = DAG.getNode(
18483 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0),
18484 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(1), B, N->getFlags()),
18485 A.getOperand(2), A.getOperand(3));
18486 VCMLA->setFlags(A->getFlags());
18487 return VCMLA;
18488 };
18489 if (SDValue R = ReassocComplex(LHS, RHS))
18490 return R;
18491 if (SDValue R = ReassocComplex(RHS, LHS))
18492 return R;
18493
18494 return SDValue();
18495}
18496
18497static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
18498 switch (Opcode) {
18499 case ISD::STRICT_FADD:
18500 case ISD::FADD:
18501 return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
18502 case ISD::ADD:
18503 return VT == MVT::i64;
18504 default:
18505 return false;
18506 }
18507}
18508
18509static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
18510                        AArch64CC::CondCode Cond);
18511
18512static bool isPredicateCCSettingOp(SDValue N) {
18513  if ((N.getOpcode() == ISD::SETCC) ||
18514 (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
18515 (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
18516 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
18517 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
18518 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
18519 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
18520 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
18521 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
18522 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
18523 // get_active_lane_mask is lowered to a whilelo instruction.
18524 N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask)))
18525 return true;
18526
18527 return false;
18528}
18529
18530// Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
18531// ... into: "ptrue p, all" + PTEST
18532static SDValue
18533performFirstTrueTestVectorCombine(SDNode *N,
18534                                  TargetLowering::DAGCombinerInfo &DCI,
18535                                  const AArch64Subtarget *Subtarget) {
18536 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18537 // Make sure PTEST can be legalised with illegal types.
18538 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
18539 return SDValue();
18540
18541 SDValue N0 = N->getOperand(0);
18542 EVT VT = N0.getValueType();
18543
18544 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
18545 !isNullConstant(N->getOperand(1)))
18546 return SDValue();
18547
18548  // Restrict the DAG combine to cases where we're extracting from a
18549  // flag-setting operation.
18550 if (!isPredicateCCSettingOp(N0))
18551 return SDValue();
18552
18553 // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
18554 SelectionDAG &DAG = DCI.DAG;
18555 SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
18556 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
18557}
18558
18559// Materialize : Idx = (add (mul vscale, NumEls), -1)
18560// i1 = extract_vector_elt t37, Constant:i64<Idx>
18561// ... into: "ptrue p, all" + PTEST
18562static SDValue
18563performLastTrueTestVectorCombine(SDNode *N,
18564                                 TargetLowering::DAGCombinerInfo &DCI,
18565                                 const AArch64Subtarget *Subtarget) {
18566 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18567  // Make sure PTEST can be legalised with illegal types.
18568 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
18569 return SDValue();
18570
18571 SDValue N0 = N->getOperand(0);
18572 EVT OpVT = N0.getValueType();
18573
18574 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
18575 return SDValue();
18576
18577 // Idx == (add (mul vscale, NumEls), -1)
18578 SDValue Idx = N->getOperand(1);
18579 if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(Idx.getOperand(1)))
18580 return SDValue();
18581
18582 SDValue VS = Idx.getOperand(0);
18583 if (VS.getOpcode() != ISD::VSCALE)
18584 return SDValue();
18585
18586 unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
18587 if (VS.getConstantOperandVal(0) != NumEls)
18588 return SDValue();
18589
18590 // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
18591 SelectionDAG &DAG = DCI.DAG;
18592 SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all);
18593 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE);
18594}
18595
18596static SDValue
18597performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18598                               const AArch64Subtarget *Subtarget) {
18599 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
18600 if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
18601 return Res;
18602 if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
18603 return Res;
18604
18605 SelectionDAG &DAG = DCI.DAG;
18606 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
18607
18608 EVT VT = N->getValueType(0);
18609 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
18610 bool IsStrict = N0->isStrictFPOpcode();
18611
18612 // extract(dup x) -> x
18613 if (N0.getOpcode() == AArch64ISD::DUP)
18614 return VT.isInteger() ? DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT)
18615 : N0.getOperand(0);
18616
18617 // Rewrite for pairwise fadd pattern
18618 // (f32 (extract_vector_elt
18619 // (fadd (vXf32 Other)
18620 // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
18621 // ->
18622 // (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
18623 // (extract_vector_elt (vXf32 Other) 1))
18624 // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
18625 // we can only do this when it's used only by the extract_vector_elt.
18626 if (isNullConstant(N1) && hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) &&
18627 (!IsStrict || N0.hasOneUse())) {
18628 SDLoc DL(N0);
18629 SDValue N00 = N0->getOperand(IsStrict ? 1 : 0);
18630 SDValue N01 = N0->getOperand(IsStrict ? 2 : 1);
18631
18632 ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01);
18633 SDValue Other = N00;
18634
18635 // And handle the commutative case.
18636 if (!Shuffle) {
18637 Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
18638 Other = N01;
18639 }
18640
18641 if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
18642 Other == Shuffle->getOperand(0)) {
18643 SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
18644 DAG.getConstant(0, DL, MVT::i64));
18645 SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
18646 DAG.getConstant(1, DL, MVT::i64));
18647 if (!IsStrict)
18648 return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2);
18649
18650 // For strict_fadd we need uses of the final extract_vector to be replaced
18651 // with the strict_fadd, but we also need uses of the chain output of the
18652 // original strict_fadd to use the chain output of the new strict_fadd as
18653 // otherwise it may not be deleted.
18654 SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
18655 {VT, MVT::Other},
18656 {N0->getOperand(0), Extract1, Extract2});
18657 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret);
18658 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1));
18659 return SDValue(N, 0);
18660 }
18661 }
18662
18663 return SDValue();
18664}
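// Rough example of the pairwise-add rewrite above (a sketch; register
// allocation is illustrative): extracting lane 0 of
//   fadd v1.2s, v0.2s, <v0 shuffled so lane 0 holds element 1>
// becomes a scalar pairwise add of the first two lanes,
//   faddp s0, v0.2s
// avoiding the vector-wide fadd and the shuffle.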
18665
18666static SDValue performConcatVectorsCombine(SDNode *N,
18667                                           TargetLowering::DAGCombinerInfo &DCI,
18668                                           SelectionDAG &DAG) {
18669 SDLoc dl(N);
18670 EVT VT = N->getValueType(0);
18671 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
18672 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
18673
18674 if (VT.isScalableVector())
18675 return SDValue();
18676
18677 // Optimize concat_vectors of truncated vectors, where the intermediate
18678 // type is illegal, to avoid said illegality, e.g.,
18679 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
18680 // (v2i16 (truncate (v2i64)))))
18681 // ->
18682 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
18683 // (v4i32 (bitcast (v2i64))),
18684 // <0, 2, 4, 6>)))
18685 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
18686 // on both input and result type, so we might generate worse code.
18687 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
18688 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
18689 N1Opc == ISD::TRUNCATE) {
18690 SDValue N00 = N0->getOperand(0);
18691 SDValue N10 = N1->getOperand(0);
18692 EVT N00VT = N00.getValueType();
18693
18694 if (N00VT == N10.getValueType() &&
18695 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
18696 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
18697 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
18698      SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
18699      for (size_t i = 0; i < Mask.size(); ++i)
18700 Mask[i] = i * 2;
18701 return DAG.getNode(ISD::TRUNCATE, dl, VT,
18702 DAG.getVectorShuffle(
18703 MidVT, dl,
18704 DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
18705 DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
18706 }
18707 }
18708
18709 if (N->getOperand(0).getValueType() == MVT::v4i8 ||
18710 N->getOperand(0).getValueType() == MVT::v2i16 ||
18711 N->getOperand(0).getValueType() == MVT::v2i8) {
18712 EVT SrcVT = N->getOperand(0).getValueType();
18713 // If we have a concat of v4i8 loads, convert them to a buildvector of f32
18714 // loads to prevent having to go through the v4i8 load legalization that
18715 // needs to extend each element into a larger type.
18716 if (N->getNumOperands() % 2 == 0 &&
18717 all_of(N->op_values(), [SrcVT](SDValue V) {
18718 if (V.getValueType() != SrcVT)
18719 return false;
18720 if (V.isUndef())
18721 return true;
18722 LoadSDNode *LD = dyn_cast<LoadSDNode>(V);
18723 return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
18724 LD->getExtensionType() == ISD::NON_EXTLOAD;
18725 })) {
18726 EVT FVT = SrcVT == MVT::v2i8 ? MVT::f16 : MVT::f32;
18727 EVT NVT = EVT::getVectorVT(*DAG.getContext(), FVT, N->getNumOperands());
18728      SmallVector<SDValue> Ops;
18729
18730 for (unsigned i = 0; i < N->getNumOperands(); i++) {
18731 SDValue V = N->getOperand(i);
18732 if (V.isUndef())
18733 Ops.push_back(DAG.getUNDEF(FVT));
18734 else {
18735 LoadSDNode *LD = cast<LoadSDNode>(V);
18736 SDValue NewLoad = DAG.getLoad(FVT, dl, LD->getChain(),
18737 LD->getBasePtr(), LD->getMemOperand());
18738 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
18739 Ops.push_back(NewLoad);
18740 }
18741 }
18742 return DAG.getBitcast(N->getValueType(0),
18743 DAG.getBuildVector(NVT, dl, Ops));
18744 }
18745 }
18746
18747 // Canonicalise concat_vectors to replace concatenations of truncated nots
18748 // with nots of concatenated truncates. This in some cases allows for multiple
18749 // redundant negations to be eliminated.
18750 // (concat_vectors (v4i16 (truncate (not (v4i32)))),
18751 // (v4i16 (truncate (not (v4i32)))))
18752 // ->
18753 // (not (concat_vectors (v4i16 (truncate (v4i32))),
18754 // (v4i16 (truncate (v4i32)))))
18755 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
18756 N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N0.getNode()) &&
18757 N->isOnlyUserOf(N1.getNode())) {
18758 auto isBitwiseVectorNegate = [](SDValue V) {
18759 return V->getOpcode() == ISD::XOR &&
18760 ISD::isConstantSplatVectorAllOnes(V.getOperand(1).getNode());
18761 };
18762 SDValue N00 = N0->getOperand(0);
18763 SDValue N10 = N1->getOperand(0);
18764 if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N00.getNode()) &&
18765 isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N10.getNode())) {
18766 return DAG.getNOT(
18767 dl,
18768 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
18769 DAG.getNode(ISD::TRUNCATE, dl, N0.getValueType(),
18770 N00->getOperand(0)),
18771 DAG.getNode(ISD::TRUNCATE, dl, N1.getValueType(),
18772 N10->getOperand(0))),
18773 VT);
18774 }
18775 }
18776
18777 // Wait till after everything is legalized to try this. That way we have
18778 // legal vector types and such.
18779 if (DCI.isBeforeLegalizeOps())
18780 return SDValue();
18781
18782  // Optimise concat_vectors of two identical binops with a 128-bit destination
18783  // size; combine into a binop of two concats of the source vectors, e.g.:
18784 // concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c), concat(b, d))
18785 if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() &&
18786 DAG.getTargetLoweringInfo().isBinOp(N0Opc) && N0->hasOneUse() &&
18787 N1->hasOneUse()) {
18788 SDValue N00 = N0->getOperand(0);
18789 SDValue N01 = N0->getOperand(1);
18790 SDValue N10 = N1->getOperand(0);
18791 SDValue N11 = N1->getOperand(1);
18792
18793 if (!N00.isUndef() && !N01.isUndef() && !N10.isUndef() && !N11.isUndef()) {
18794 SDValue Concat0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N00, N10);
18795 SDValue Concat1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N01, N11);
18796 return DAG.getNode(N0Opc, dl, VT, Concat0, Concat1);
18797 }
18798 }
18799
18800 auto IsRSHRN = [](SDValue Shr) {
18801 if (Shr.getOpcode() != AArch64ISD::VLSHR)
18802 return false;
18803 SDValue Op = Shr.getOperand(0);
18804 EVT VT = Op.getValueType();
18805 unsigned ShtAmt = Shr.getConstantOperandVal(1);
18806 if (ShtAmt > VT.getScalarSizeInBits() / 2 || Op.getOpcode() != ISD::ADD)
18807 return false;
18808
18809 APInt Imm;
18810 if (Op.getOperand(1).getOpcode() == AArch64ISD::MOVIshift)
18811 Imm = APInt(VT.getScalarSizeInBits(),
18812 Op.getOperand(1).getConstantOperandVal(0)
18813 << Op.getOperand(1).getConstantOperandVal(1));
18814 else if (Op.getOperand(1).getOpcode() == AArch64ISD::DUP &&
18815 isa<ConstantSDNode>(Op.getOperand(1).getOperand(0)))
18816 Imm = APInt(VT.getScalarSizeInBits(),
18817 Op.getOperand(1).getConstantOperandVal(0));
18818 else
18819 return false;
18820
18821 if (Imm != 1ULL << (ShtAmt - 1))
18822 return false;
18823 return true;
18824 };
18825
18826 // concat(rshrn(x), rshrn(y)) -> rshrn(concat(x, y))
18827 if (N->getNumOperands() == 2 && IsRSHRN(N0) &&
18828 ((IsRSHRN(N1) &&
18829      N0.getConstantOperandVal(1) == N1.getConstantOperandVal(1)) ||
18830     N1.isUndef())) {
18831 SDValue X = N0.getOperand(0).getOperand(0);
18832 SDValue Y = N1.isUndef() ? DAG.getUNDEF(X.getValueType())
18833 : N1.getOperand(0).getOperand(0);
18834 EVT BVT =
18835 X.getValueType().getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
18836 SDValue CC = DAG.getNode(ISD::CONCAT_VECTORS, dl, BVT, X, Y);
18837 SDValue Add = DAG.getNode(
18838 ISD::ADD, dl, BVT, CC,
18839 DAG.getConstant(1ULL << (N0.getConstantOperandVal(1) - 1), dl, BVT));
18840 SDValue Shr =
18841 DAG.getNode(AArch64ISD::VLSHR, dl, BVT, Add, N0.getOperand(1));
18842 return Shr;
18843 }
18844
18845 // concat(zip1(a, b), zip2(a, b)) is zip1(a, b)
18846 if (N->getNumOperands() == 2 && N0Opc == AArch64ISD::ZIP1 &&
18847 N1Opc == AArch64ISD::ZIP2 && N0.getOperand(0) == N1.getOperand(0) &&
18848 N0.getOperand(1) == N1.getOperand(1)) {
18849 SDValue E0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(0),
18850 DAG.getUNDEF(N0.getValueType()));
18851 SDValue E1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(1),
18852 DAG.getUNDEF(N0.getValueType()));
18853 return DAG.getNode(AArch64ISD::ZIP1, dl, VT, E0, E1);
18854 }
18855
18856 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
18857 // splat. The indexed instructions are going to be expecting a DUPLANE64, so
18858 // canonicalise to that.
18859 if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
18860 assert(VT.getScalarSizeInBits() == 64);
18861 return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
18862 DAG.getConstant(0, dl, MVT::i64));
18863 }
18864
18865 // Canonicalise concat_vectors so that the right-hand vector has as few
18866 // bit-casts as possible before its real operation. The primary matching
18867 // destination for these operations will be the narrowing "2" instructions,
18868 // which depend on the operation being performed on this right-hand vector.
18869 // For example,
18870 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
18871 // becomes
18872 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
18873
18874 if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
18875 return SDValue();
18876 SDValue RHS = N1->getOperand(0);
18877 MVT RHSTy = RHS.getValueType().getSimpleVT();
18878 // If the RHS is not a vector, this is not the pattern we're looking for.
18879 if (!RHSTy.isVector())
18880 return SDValue();
18881
18882 LLVM_DEBUG(
18883 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
18884
18885 MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
18886 RHSTy.getVectorNumElements() * 2);
18887 return DAG.getNode(ISD::BITCAST, dl, VT,
18888 DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
18889 DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
18890 RHS));
18891}
18892
18893static SDValue
18895 SelectionDAG &DAG) {
18896 if (DCI.isBeforeLegalizeOps())
18897 return SDValue();
18898
18899 EVT VT = N->getValueType(0);
18900 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
18901 return SDValue();
18902
18903 SDValue V = N->getOperand(0);
18904
18905 // NOTE: This combine exists in DAGCombiner, but that version's legality check
18906 // blocks this combine because the non-const case requires custom lowering.
18907 //
18908 // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const)
18909 if (V.getOpcode() == ISD::SPLAT_VECTOR)
18910 if (isa<ConstantSDNode>(V.getOperand(0)))
18911 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V.getOperand(0));
18912
18913 return SDValue();
18914}
18915
18916static SDValue
18918 SelectionDAG &DAG) {
18919 SDLoc DL(N);
18920 SDValue Vec = N->getOperand(0);
18921 SDValue SubVec = N->getOperand(1);
18922 uint64_t IdxVal = N->getConstantOperandVal(2);
18923 EVT VecVT = Vec.getValueType();
18924 EVT SubVT = SubVec.getValueType();
18925
18926 // Only do this for legal fixed vector types.
18927 if (!VecVT.isFixedLengthVector() ||
18928 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
18929 !DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
18930 return SDValue();
18931
18932 // Ignore widening patterns.
18933 if (IdxVal == 0 && Vec.isUndef())
18934 return SDValue();
18935
18936 // Subvector must be half the width and an "aligned" insertion.
18937 unsigned NumSubElts = SubVT.getVectorNumElements();
18938 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
18939 (IdxVal != 0 && IdxVal != NumSubElts))
18940 return SDValue();
18941
18942 // Fold insert_subvector -> concat_vectors
18943 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
18944 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
18945 SDValue Lo, Hi;
18946 if (IdxVal == 0) {
18947 Lo = SubVec;
18948 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
18949 DAG.getVectorIdxConstant(NumSubElts, DL));
18950 } else {
18951 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
18952 DAG.getVectorIdxConstant(0, DL));
18953 Hi = SubVec;
18954 }
18955 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
18956}
18957
18960 SelectionDAG &DAG) {
18961 // Wait until after everything is legalized to try this. That way we have
18962 // legal vector types and such.
18963 if (DCI.isBeforeLegalizeOps())
18964 return SDValue();
18965 // Transform a scalar conversion of a value from a lane extract into a
18966 // lane extract of a vector conversion. E.g., from foo1 to foo2:
18967 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
18968 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
18969 //
18970 // The second form interacts better with instruction selection and the
18971 // register allocator to avoid cross-class register copies that aren't
18972 // coalescable due to a lane reference.
18973
18974 // Check the operand and see if it originates from a lane extract.
18975 SDValue Op1 = N->getOperand(1);
18977 return SDValue();
18978
18979 // Yep, no additional predication needed. Perform the transform.
18980 SDValue IID = N->getOperand(0);
18981 SDValue Shift = N->getOperand(2);
18982 SDValue Vec = Op1.getOperand(0);
18983 SDValue Lane = Op1.getOperand(1);
18984 EVT ResTy = N->getValueType(0);
18985 EVT VecResTy;
18986 SDLoc DL(N);
18987
18988 // The vector width should be 128 bits by the time we get here, even
18989 // if it started as 64 bits (the extract_vector handling will have
18990 // done so). Bail if it is not.
18991 if (Vec.getValueSizeInBits() != 128)
18992 return SDValue();
18993
18994 if (Vec.getValueType() == MVT::v4i32)
18995 VecResTy = MVT::v4f32;
18996 else if (Vec.getValueType() == MVT::v2i64)
18997 VecResTy = MVT::v2f64;
18998 else
18999 return SDValue();
19000
19001 SDValue Convert =
19002 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
19003 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
19004}
19005
19006// AArch64 high-vector "long" operations are formed by performing the non-high
19007// version on an extract_subvector of each operand which gets the high half:
19008//
19009// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
19010//
19011// However, there are cases which don't have an extract_high explicitly, but
19012// have another operation that can be made compatible with one for free. For
19013// example:
19014//
19015// (dupv64 scalar) --> (extract_high (dup128 scalar))
19016//
19017// This routine does the actual conversion of such DUPs, once outer routines
19018// have determined that everything else is in order.
19019// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
19020// similarly here.
19021static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
19022  MVT VT = N.getSimpleValueType();
19023 if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
19024 N.getConstantOperandVal(1) == 0)
19025 N = N.getOperand(0);
19026
19027 switch (N.getOpcode()) {
19028  case AArch64ISD::DUP:
19029  case AArch64ISD::DUPLANE8:
19030  case AArch64ISD::DUPLANE16:
19031  case AArch64ISD::DUPLANE32:
19032  case AArch64ISD::DUPLANE64:
19033  case AArch64ISD::MOVI:
19034  case AArch64ISD::MOVIshift:
19035  case AArch64ISD::MOVIedit:
19036  case AArch64ISD::MOVImsl:
19037  case AArch64ISD::MVNIshift:
19038  case AArch64ISD::MVNImsl:
19039    break;
19040 default:
19041 // FMOV could be supported, but isn't very useful, as it would only occur
19042 // if you passed a bitcast' floating point immediate to an eligible long
19043 // integer op (addl, smull, ...).
19044 return SDValue();
19045 }
19046
19047 if (!VT.is64BitVector())
19048 return SDValue();
19049
19050 SDLoc DL(N);
19051 unsigned NumElems = VT.getVectorNumElements();
19052 if (N.getValueType().is64BitVector()) {
19053 MVT ElementTy = VT.getVectorElementType();
19054 MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
19055 N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
19056 }
19057
19058 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
19059 DAG.getConstant(NumElems, DL, MVT::i64));
19060}
19061
19062static bool isEssentiallyExtractHighSubvector(SDValue N) {
19063  if (N.getOpcode() == ISD::BITCAST)
19064 N = N.getOperand(0);
19065 if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
19066 return false;
19067 if (N.getOperand(0).getValueType().isScalableVector())
19068 return false;
19069 return N.getConstantOperandAPInt(1) ==
19070 N.getOperand(0).getValueType().getVectorNumElements() / 2;
19071}
19072
19073/// Helper structure to keep track of ISD::SET_CC operands.
19074struct GenericSetCCInfo {
19075  const SDValue *Opnd0;
19076  const SDValue *Opnd1;
19077  ISD::CondCode CC;
19078};
19079
19080/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
19081struct AArch64SetCCInfo {
19082  const SDValue *Cmp;
19083  AArch64CC::CondCode CC;
19084};
19085
19086/// Helper structure to keep track of SetCC information.
19087union SetCCInfo {
19088  GenericSetCCInfo Generic;
19089  AArch64SetCCInfo AArch64;
19090};
19091
19092/// Helper structure to be able to read SetCC information. If the IsAArch64
19093/// field is set to true, Info is an AArch64SetCCInfo; otherwise Info is a
19094/// GenericSetCCInfo.
19095struct SetCCInfoAndKind {
19096  SetCCInfo Info;
19097  bool IsAArch64;
19098};
19099
19100/// Check whether or not \p Op is a SET_CC operation, either a generic or
19101/// an
19102/// AArch64 lowered one.
19103/// \p SetCCInfo is filled accordingly.
19104/// \post SetCCInfo is meaningful only when this function returns true.
19105/// \return True when Op is a kind of SET_CC operation.
19106static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
19107  // If this is a setcc, this is straightforward.
19108 if (Op.getOpcode() == ISD::SETCC) {
19109 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
19110 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
19111 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
19112 SetCCInfo.IsAArch64 = false;
19113 return true;
19114 }
19115 // Otherwise, check if this is a matching csel instruction.
19116 // In other words:
19117 // - csel 1, 0, cc
19118 // - csel 0, 1, !cc
19119 if (Op.getOpcode() != AArch64ISD::CSEL)
19120 return false;
19121 // Set the information about the operands.
19122 // TODO: we want the operands of the Cmp not the csel
19123 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
19124 SetCCInfo.IsAArch64 = true;
19125 SetCCInfo.Info.AArch64.CC =
19126 static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
19127
19128  // Check that the operands match the constraints:
19129 // (1) Both operands must be constants.
19130 // (2) One must be 1 and the other must be 0.
19131 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
19132 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
19133
19134 // Check (1).
19135 if (!TValue || !FValue)
19136 return false;
19137
19138 // Check (2).
19139 if (!TValue->isOne()) {
19140 // Update the comparison when we are interested in !cc.
19141 std::swap(TValue, FValue);
19142 SetCCInfo.Info.AArch64.CC =
19143        AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
19144  }
19145 return TValue->isOne() && FValue->isZero();
19146}
19147
19148// Returns true if Op is setcc or zext of setcc.
19149static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
19150 if (isSetCC(Op, Info))
19151 return true;
19152 return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
19153 isSetCC(Op->getOperand(0), Info));
19154}
19155
19156// The folding we want to perform is:
19157// (add x, [zext] (setcc cc ...) )
19158// -->
19159// (csel x, (add x, 1), !cc ...)
19160//
19161// The latter will get matched to a CSINC instruction.
19162static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
19163  assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
19164 SDValue LHS = Op->getOperand(0);
19165 SDValue RHS = Op->getOperand(1);
19166 SetCCInfoAndKind InfoAndKind;
19167
19168 // If both operands are a SET_CC, then we don't want to perform this
19169 // folding and create another csel as this results in more instructions
19170 // (and higher register usage).
19171 if (isSetCCOrZExtSetCC(LHS, InfoAndKind) &&
19172 isSetCCOrZExtSetCC(RHS, InfoAndKind))
19173 return SDValue();
19174
19175 // If neither operand is a SET_CC, give up.
19176 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
19177 std::swap(LHS, RHS);
19178 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
19179 return SDValue();
19180 }
19181
19182  // FIXME: This could be generalized to work for FP comparisons.
19183 EVT CmpVT = InfoAndKind.IsAArch64
19184 ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
19185 : InfoAndKind.Info.Generic.Opnd0->getValueType();
19186 if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
19187 return SDValue();
19188
19189 SDValue CCVal;
19190 SDValue Cmp;
19191 SDLoc dl(Op);
19192 if (InfoAndKind.IsAArch64) {
19193 CCVal = DAG.getConstant(
19194        AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl,
19195        MVT::i32);
19196 Cmp = *InfoAndKind.Info.AArch64.Cmp;
19197 } else
19198 Cmp = getAArch64Cmp(
19199 *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
19200 ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
19201 dl);
19202
19203 EVT VT = Op->getValueType(0);
19204 LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
19205 return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
19206}
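// Rough example of the CSINC folding above (a sketch; registers are
// placeholders): for
//   int f(int x, int a, int b) { return x + (a < b); }
// the add of the setcc becomes
//   cmp w1, w2 ; cinc w0, w0, lt
// (cinc being the two-register alias of csinc), instead of cset plus add.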
19207
19208// ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
19209static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG) {
19210  EVT VT = N->getValueType(0);
19211 // Only scalar integer and vector types.
19212 if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
19213 return SDValue();
19214
19215 SDValue LHS = N->getOperand(0);
19216 SDValue RHS = N->getOperand(1);
19217 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19218 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
19219 return SDValue();
19220
19221 auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
19222 auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
19223 if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
19224 return SDValue();
19225
19226 SDValue Op1 = LHS->getOperand(0);
19227 SDValue Op2 = RHS->getOperand(0);
19228 EVT OpVT1 = Op1.getValueType();
19229 EVT OpVT2 = Op2.getValueType();
19230 if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
19231 Op2.getOpcode() != AArch64ISD::UADDV ||
19232 OpVT1.getVectorElementType() != VT)
19233 return SDValue();
19234
19235 SDValue Val1 = Op1.getOperand(0);
19236 SDValue Val2 = Op2.getOperand(0);
19237 EVT ValVT = Val1->getValueType(0);
19238 SDLoc DL(N);
19239 SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
19240 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
19241 DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
19242 DAG.getConstant(0, DL, MVT::i64));
19243}
19244
19245/// Perform the scalar expression combine in the form of:
19246/// CSEL(c, 1, cc) + b => CSINC(b+c, b, cc)
19247/// CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc)
19248static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG) {
19249  EVT VT = N->getValueType(0);
19250 if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD)
19251 return SDValue();
19252
19253 SDValue LHS = N->getOperand(0);
19254 SDValue RHS = N->getOperand(1);
19255
19256  // Handle commutativity.
19257 if (LHS.getOpcode() != AArch64ISD::CSEL &&
19258 LHS.getOpcode() != AArch64ISD::CSNEG) {
19259 std::swap(LHS, RHS);
19260 if (LHS.getOpcode() != AArch64ISD::CSEL &&
19261 LHS.getOpcode() != AArch64ISD::CSNEG) {
19262 return SDValue();
19263 }
19264 }
19265
19266 if (!LHS.hasOneUse())
19267 return SDValue();
19268
19269 AArch64CC::CondCode AArch64CC =
19270 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
19271
19272 // The CSEL should include a const one operand, and the CSNEG should include
19273 // One or NegOne operand.
19274 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(LHS.getOperand(0));
19275 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
19276 if (!CTVal || !CFVal)
19277 return SDValue();
19278
19279 if (!(LHS.getOpcode() == AArch64ISD::CSEL &&
19280 (CTVal->isOne() || CFVal->isOne())) &&
19281 !(LHS.getOpcode() == AArch64ISD::CSNEG &&
19282 (CTVal->isOne() || CFVal->isAllOnes())))
19283 return SDValue();
19284
19285 // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc)
19286 if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() &&
19287 !CFVal->isOne()) {
19288 std::swap(CTVal, CFVal);
19289 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
19290 }
19291
19292 SDLoc DL(N);
19293 // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc)
19294 if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() &&
19295 !CFVal->isAllOnes()) {
19296 APInt C = -1 * CFVal->getAPIntValue();
19297 CTVal = cast<ConstantSDNode>(DAG.getConstant(C, DL, VT));
19298 CFVal = cast<ConstantSDNode>(DAG.getAllOnesConstant(DL, VT));
19299 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
19300 }
19301
19302  // It might be neutral for larger constants, as the immediate needs to be
19303  // materialized in a register.
19304 APInt ADDC = CTVal->getAPIntValue();
19305 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19306 if (!TLI.isLegalAddImmediate(ADDC.getSExtValue()))
19307 return SDValue();
19308
19309 assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) ||
19310 (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) &&
19311 "Unexpected constant value");
19312
19313 SDValue NewNode = DAG.getNode(ISD::ADD, DL, VT, RHS, SDValue(CTVal, 0));
19314 SDValue CCVal = DAG.getConstant(AArch64CC, DL, MVT::i32);
19315 SDValue Cmp = LHS.getOperand(3);
19316
19317 return DAG.getNode(AArch64ISD::CSINC, DL, VT, NewNode, RHS, CCVal, Cmp);
19318}
19319
19320// ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)
19321static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
19322  EVT VT = N->getValueType(0);
19323 if (N->getOpcode() != ISD::ADD)
19324 return SDValue();
19325
19326 SDValue Dot = N->getOperand(0);
19327 SDValue A = N->getOperand(1);
19328  // Handle commutativity.
19329 auto isZeroDot = [](SDValue Dot) {
19330 return (Dot.getOpcode() == AArch64ISD::UDOT ||
19331 Dot.getOpcode() == AArch64ISD::SDOT) &&
19332           isZerosVector(Dot.getOperand(0).getNode());
19333  };
19334 if (!isZeroDot(Dot))
19335 std::swap(Dot, A);
19336 if (!isZeroDot(Dot))
19337 return SDValue();
19338
19339 return DAG.getNode(Dot.getOpcode(), SDLoc(N), VT, A, Dot.getOperand(1),
19340 Dot.getOperand(2));
19341}
19342
19343static bool isNegatedInteger(SDValue Op) {
19344  return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
19345}
19346
19347static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG) {
19348  SDLoc DL(Op);
19349 EVT VT = Op.getValueType();
19350 SDValue Zero = DAG.getConstant(0, DL, VT);
19351 return DAG.getNode(ISD::SUB, DL, VT, Zero, Op);
19352}
19353
19354// Try to fold
19355//
19356// (neg (csel X, Y)) -> (csel (neg X), (neg Y))
19357//
19358// The folding helps csel to be matched with csneg without generating
19359// redundant neg instruction, which includes negation of the csel expansion
19360// of abs node lowered by lowerABS.
19361static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) {
19362  if (!isNegatedInteger(SDValue(N, 0)))
19363 return SDValue();
19364
19365 SDValue CSel = N->getOperand(1);
19366 if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse())
19367 return SDValue();
19368
19369 SDValue N0 = CSel.getOperand(0);
19370 SDValue N1 = CSel.getOperand(1);
19371
19372  // If neither operand is a negation, the fold is not worthwhile, as it
19373  // would introduce two additional negations while removing only one.
19374 if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
19375 return SDValue();
19376
19377 SDValue N0N = getNegatedInteger(N0, DAG);
19378 SDValue N1N = getNegatedInteger(N1, DAG);
19379
19380 SDLoc DL(N);
19381 EVT VT = CSel.getValueType();
19382 return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2),
19383 CSel.getOperand(3));
19384}
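// Illustrative effect of the fold above (a sketch): negating a csel whose arms
// are themselves negations, e.g. the csel produced when lowering abs, lets the
// negations cancel, so the whole expression can select to a single csneg (or
// its cneg alias) rather than a csel followed by a separate neg.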
19385
19386// The basic add/sub long vector instructions have variants with "2" on the end
19387// which act on the high-half of their inputs. They are normally matched by
19388// patterns like:
19389//
19390// (add (zeroext (extract_high LHS)),
19391// (zeroext (extract_high RHS)))
19392// -> uaddl2 vD, vN, vM
19393//
19394// However, if one of the extracts is something like a duplicate, this
19395// instruction can still be used profitably. This function puts the DAG into a
19396// more appropriate form for those patterns to trigger.
19397static SDValue performAddSubLongCombine(SDNode *N,
19398                                        TargetLowering::DAGCombinerInfo &DCI) {
19399  SelectionDAG &DAG = DCI.DAG;
19400 if (DCI.isBeforeLegalizeOps())
19401 return SDValue();
19402
19403 MVT VT = N->getSimpleValueType(0);
19404 if (!VT.is128BitVector()) {
19405 if (N->getOpcode() == ISD::ADD)
19406 return performSetccAddFolding(N, DAG);
19407 return SDValue();
19408 }
19409
19410 // Make sure both branches are extended in the same way.
19411 SDValue LHS = N->getOperand(0);
19412 SDValue RHS = N->getOperand(1);
19413 if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
19414 LHS.getOpcode() != ISD::SIGN_EXTEND) ||
19415 LHS.getOpcode() != RHS.getOpcode())
19416 return SDValue();
19417
19418 unsigned ExtType = LHS.getOpcode();
19419
19420  // It's only worth doing if at least one of the inputs is already an
19421  // extract, but we don't know which it'll be, so we have to try both.
19422 if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
19423 RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
19424 if (!RHS.getNode())
19425 return SDValue();
19426
19427 RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
19428 } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
19429 LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
19430 if (!LHS.getNode())
19431 return SDValue();
19432
19433 LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
19434 }
19435
19436 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
19437}
19438
19439static bool isCMP(SDValue Op) {
19440 return Op.getOpcode() == AArch64ISD::SUBS &&
19441 !Op.getNode()->hasAnyUseOfValue(0);
19442}
19443
19444// (CSEL 1 0 CC Cond) => CC
19445// (CSEL 0 1 CC Cond) => !CC
19446static std::optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) {
19447 if (Op.getOpcode() != AArch64ISD::CSEL)
19448 return std::nullopt;
19449 auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
19450 if (CC == AArch64CC::AL || CC == AArch64CC::NV)
19451 return std::nullopt;
19452 SDValue OpLHS = Op.getOperand(0);
19453 SDValue OpRHS = Op.getOperand(1);
19454 if (isOneConstant(OpLHS) && isNullConstant(OpRHS))
19455 return CC;
19456 if (isNullConstant(OpLHS) && isOneConstant(OpRHS))
19457 return getInvertedCondCode(CC);
19458
19459 return std::nullopt;
19460}
19461
19462// (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry)
19463// (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry)
19464static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) {
19465 SDValue CmpOp = Op->getOperand(2);
19466 if (!isCMP(CmpOp))
19467 return SDValue();
19468
19469 if (IsAdd) {
19470 if (!isOneConstant(CmpOp.getOperand(1)))
19471 return SDValue();
19472 } else {
19473 if (!isNullConstant(CmpOp.getOperand(0)))
19474 return SDValue();
19475 }
19476
19477 SDValue CsetOp = CmpOp->getOperand(IsAdd ? 0 : 1);
19478 auto CC = getCSETCondCode(CsetOp);
19479 if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO))
19480 return SDValue();
19481
19482 return DAG.getNode(Op->getOpcode(), SDLoc(Op), Op->getVTList(),
19483 Op->getOperand(0), Op->getOperand(1),
19484 CsetOp.getOperand(3));
19485}
19486
19487// (ADC x 0 cond) => (CINC x HS cond)
19488static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) {
19489  SDValue LHS = N->getOperand(0);
19490 SDValue RHS = N->getOperand(1);
19491 SDValue Cond = N->getOperand(2);
19492
19493 if (!isNullConstant(RHS))
19494 return SDValue();
19495
19496 EVT VT = N->getValueType(0);
19497 SDLoc DL(N);
19498
19499 // (CINC x cc cond) <=> (CSINC x x !cc cond)
19500 SDValue CC = DAG.getConstant(AArch64CC::LO, DL, MVT::i32);
19501 return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond);
19502}
19503
19504// Transform vector add(zext i8 to i32, zext i8 to i32)
19505// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
19506// This allows extra uses of saddl/uaddl at the lower vector widths, and fewer
19507// extends.
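// Why the outer sign-extension is safe (a reasoning note, not from the
// original comment): two zero-extended i8 values sum to at most 510, so the
// i16 result never has its sign bit set and sext equals zext; for
// sign-extended i8 inputs the i16 sum stays within [-256, 254], where
// sign-extension is the correct widening anyway.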
19508static SDValue performVectorAddSubExtCombine(SDNode *N, SelectionDAG &DAG) {
19509  EVT VT = N->getValueType(0);
19510 if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
19511 (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
19512 N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
19513 (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
19514 N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
19515 N->getOperand(0).getOperand(0).getValueType() !=
19516 N->getOperand(1).getOperand(0).getValueType())
19517 return SDValue();
19518
19519 SDValue N0 = N->getOperand(0).getOperand(0);
19520 SDValue N1 = N->getOperand(1).getOperand(0);
19521 EVT InVT = N0.getValueType();
19522
19523 EVT S1 = InVT.getScalarType();
19524 EVT S2 = VT.getScalarType();
19525 if ((S2 == MVT::i32 && S1 == MVT::i8) ||
19526 (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
19527 SDLoc DL(N);
19528    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
19529                                  S2 == MVT::i32 ? MVT::i16 : MVT::i32,
19530                                  VT.getVectorElementCount());
19531 SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
19532 SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
19533 SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
19534 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NewOp);
19535 }
19536 return SDValue();
19537}
19538
19539static SDValue performBuildVectorCombine(SDNode *N,
19540                                         TargetLowering::DAGCombinerInfo &DCI,
19541                                         SelectionDAG &DAG) {
19542 SDLoc DL(N);
19543 EVT VT = N->getValueType(0);
19544
19545 if (VT == MVT::v4f16 || VT == MVT::v4bf16) {
19546 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1),
19547 Elt2 = N->getOperand(2), Elt3 = N->getOperand(3);
19548 if (Elt0->getOpcode() == ISD::FP_ROUND &&
19549 Elt1->getOpcode() == ISD::FP_ROUND &&
19550 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
19551 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
19552 Elt0->getConstantOperandVal(1) == Elt1->getConstantOperandVal(1) &&
19553        Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19554        Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19555 // Constant index.
19556 isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
19557 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
19558 Elt0->getOperand(0)->getOperand(0) ==
19559 Elt1->getOperand(0)->getOperand(0) &&
19560 Elt0->getOperand(0)->getConstantOperandVal(1) == 0 &&
19561 Elt1->getOperand(0)->getConstantOperandVal(1) == 1) {
19562 SDValue LowLanesSrcVec = Elt0->getOperand(0)->getOperand(0);
19563 if (LowLanesSrcVec.getValueType() == MVT::v2f64) {
19564 SDValue HighLanes;
19565 if (Elt2->getOpcode() == ISD::UNDEF &&
19566 Elt3->getOpcode() == ISD::UNDEF) {
19567 HighLanes = DAG.getUNDEF(MVT::v2f32);
19568 } else if (Elt2->getOpcode() == ISD::FP_ROUND &&
19569 Elt3->getOpcode() == ISD::FP_ROUND &&
19570 isa<ConstantSDNode>(Elt2->getOperand(1)) &&
19571 isa<ConstantSDNode>(Elt3->getOperand(1)) &&
19572 Elt2->getConstantOperandVal(1) ==
19573 Elt3->getConstantOperandVal(1) &&
19574 Elt2->getOperand(0)->getOpcode() ==
19576 Elt3->getOperand(0)->getOpcode() ==
19578 // Constant index.
19579 isa<ConstantSDNode>(Elt2->getOperand(0)->getOperand(1)) &&
19580 isa<ConstantSDNode>(Elt3->getOperand(0)->getOperand(1)) &&
19581 Elt2->getOperand(0)->getOperand(0) ==
19582 Elt3->getOperand(0)->getOperand(0) &&
19583 Elt2->getOperand(0)->getConstantOperandVal(1) == 0 &&
19584 Elt3->getOperand(0)->getConstantOperandVal(1) == 1) {
19585 SDValue HighLanesSrcVec = Elt2->getOperand(0)->getOperand(0);
19586 HighLanes =
19587 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, HighLanesSrcVec);
19588 }
19589 if (HighLanes) {
19590 SDValue DoubleToSingleSticky =
19591 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, LowLanesSrcVec);
19592 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
19593 DoubleToSingleSticky, HighLanes);
19594 return DAG.getNode(ISD::FP_ROUND, DL, VT, Concat,
19595 Elt0->getOperand(1));
19596 }
19597 }
19598 }
19599 }
19600
19601 if (VT == MVT::v2f64) {
19602 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
19603 if (Elt0->getOpcode() == ISD::FP_EXTEND &&
19604 Elt1->getOpcode() == ISD::FP_EXTEND &&
19605        Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19606        Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19607 Elt0->getOperand(0)->getOperand(0) ==
19608 Elt1->getOperand(0)->getOperand(0) &&
19609 // Constant index.
19610 isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
19611 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
19612 Elt0->getOperand(0)->getConstantOperandVal(1) + 1 ==
19613 Elt1->getOperand(0)->getConstantOperandVal(1) &&
19614 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
19615 // ResultType's known minimum vector length.
19616 Elt0->getOperand(0)->getConstantOperandVal(1) %
19618 0) {
19619 SDValue SrcVec = Elt0->getOperand(0)->getOperand(0);
19620 if (SrcVec.getValueType() == MVT::v4f16 ||
19621 SrcVec.getValueType() == MVT::v4bf16) {
19622 SDValue HalfToSingle =
19623 DAG.getNode(ISD::FP_EXTEND, DL, MVT::v4f32, SrcVec);
19624 SDValue SubvectorIdx = Elt0->getOperand(0)->getOperand(1);
19625 SDValue Extract = DAG.getNode(
19627 HalfToSingle, SubvectorIdx);
19628 return DAG.getNode(ISD::FP_EXTEND, DL, VT, Extract);
19629 }
19630 }
19631 }
19632
19633 // A build vector of two extracted elements is equivalent to an
19634 // extract subvector where the inner vector is any-extended to the
19635 // extract_vector_elt VT.
19636 // (build_vector (extract_elt_iXX_to_i32 vec Idx+0)
19637 // (extract_elt_iXX_to_i32 vec Idx+1))
19638 // => (extract_subvector (anyext_iXX_to_i32 vec) Idx)
19639
19640 // For now, only consider the v2i32 case, which arises as a result of
19641 // legalization.
19642 if (VT != MVT::v2i32)
19643 return SDValue();
19644
19645 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
19646 // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT.
19647 if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19648 Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19649 // Constant index.
19650 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
19651 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
19652 // Both EXTRACT_VECTOR_ELT from same vector...
19653 Elt0->getOperand(0) == Elt1->getOperand(0) &&
19654 // ... and contiguous. First element's index +1 == second element's index.
19655 Elt0->getConstantOperandVal(1) + 1 == Elt1->getConstantOperandVal(1) &&
19656 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
19657 // ResultType's known minimum vector length.
19658 Elt0->getConstantOperandVal(1) % VT.getVectorMinNumElements() == 0) {
19659 SDValue VecToExtend = Elt0->getOperand(0);
19660 EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(MVT::i32);
19661 if (!DAG.getTargetLoweringInfo().isTypeLegal(ExtVT))
19662 return SDValue();
19663
19664 SDValue SubvectorIdx = DAG.getVectorIdxConstant(Elt0->getConstantOperandVal(1), DL);
19665
19666 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, DL, ExtVT, VecToExtend);
19667 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Ext,
19668 SubvectorIdx);
19669 }
19670
19671 return SDValue();
19672}
19673
19674static SDValue performTruncateCombine(SDNode *N,
19675                                      SelectionDAG &DAG) {
19676 EVT VT = N->getValueType(0);
19677 SDValue N0 = N->getOperand(0);
19678 if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
19679 N0.getOpcode() == AArch64ISD::DUP) {
19680 SDValue Op = N0.getOperand(0);
19681 if (VT.getScalarType() == MVT::i32 &&
19682 N0.getOperand(0).getValueType().getScalarType() == MVT::i64)
19683 Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i32, Op);
19684 return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Op);
19685 }
19686
19687 return SDValue();
19688}
19689
19690// Check whether a node is an extend or shift operand.
19691static bool isExtendOrShiftOperand(SDValue N) {
19692  unsigned Opcode = N.getOpcode();
19693 if (ISD::isExtOpcode(Opcode) || Opcode == ISD::SIGN_EXTEND_INREG) {
19694 EVT SrcVT;
19695 if (Opcode == ISD::SIGN_EXTEND_INREG)
19696 SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
19697 else
19698 SrcVT = N.getOperand(0).getValueType();
19699
19700 return SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8;
19701 } else if (Opcode == ISD::AND) {
19702 ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
19703 if (!CSD)
19704 return false;
19705 uint64_t AndMask = CSD->getZExtValue();
19706 return AndMask == 0xff || AndMask == 0xffff || AndMask == 0xffffffff;
19707 } else if (Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA) {
19708 return isa<ConstantSDNode>(N.getOperand(1));
19709 }
19710
19711 return false;
19712}
19713
19714// (N - Y) + Z --> (Z - Y) + N
19715// when N is an extend or shift operand
19716static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z,
19717                                         SelectionDAG &DAG) {
19718 auto IsOneUseExtend = [](SDValue N) {
19719 return N.hasOneUse() && isExtendOrShiftOperand(N);
19720 };
19721
19722  // DAGCombiner will revert the combination when Z is constant, causing an
19723  // infinite loop, so don't enable the combination when Z is constant.
19724  // Likewise, if Z is a one-use shift, we can't do the optimization either,
19725  // as it would fall into the same infinite loop.
19726 if (isa<ConstantSDNode>(Z) || IsOneUseExtend(Z))
19727 return SDValue();
19728
19729 if (SUB.getOpcode() != ISD::SUB || !SUB.hasOneUse())
19730 return SDValue();
19731
19732 SDValue Shift = SUB.getOperand(0);
19733 if (!IsOneUseExtend(Shift))
19734 return SDValue();
19735
19736 SDLoc DL(N);
19737 EVT VT = N->getValueType(0);
19738
19739 SDValue Y = SUB.getOperand(1);
19740 SDValue NewSub = DAG.getNode(ISD::SUB, DL, VT, Z, Y);
19741 return DAG.getNode(ISD::ADD, DL, VT, NewSub, Shift);
19742}
19743
19744static SDValue performAddCombineForShiftedOperands(SDNode *N,
19745                                                   SelectionDAG &DAG) {
19746 // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not
19747 // commutative.
19748 if (N->getOpcode() != ISD::ADD)
19749 return SDValue();
19750
19751 // Bail out when value type is not one of {i32, i64}, since AArch64 ADD with
19752 // shifted register is only available for i32 and i64.
19753 EVT VT = N->getValueType(0);
19754 if (VT != MVT::i32 && VT != MVT::i64)
19755 return SDValue();
19756
19757 SDLoc DL(N);
19758 SDValue LHS = N->getOperand(0);
19759 SDValue RHS = N->getOperand(1);
19760
19761 if (SDValue Val = performAddCombineSubShift(N, LHS, RHS, DAG))
19762 return Val;
19763 if (SDValue Val = performAddCombineSubShift(N, RHS, LHS, DAG))
19764 return Val;
19765
19766 uint64_t LHSImm = 0, RHSImm = 0;
19767  // If both operands are shifted by an immediate and the shift amount is not
19768  // greater than 4 for one of them, swap LHS and RHS to put the operand with
19769  // the smaller shift amount on the RHS.
19770 //
19771 // On many AArch64 processors (Cortex A78, Neoverse N1/N2/V1, etc), ADD with
19772 // LSL shift (shift <= 4) has smaller latency and larger throughput than ADD
19773 // with LSL (shift > 4). For the rest of processors, this is no-op for
19774 // performance or correctness.
19775 if (isOpcWithIntImmediate(LHS.getNode(), ISD::SHL, LHSImm) &&
19776 isOpcWithIntImmediate(RHS.getNode(), ISD::SHL, RHSImm) && LHSImm <= 4 &&
19777 RHSImm > 4 && LHS.hasOneUse())
19778 return DAG.getNode(ISD::ADD, DL, VT, RHS, LHS);
19779
19780 return SDValue();
19781}
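// Illustrative example of the swap above (a sketch; registers are
// placeholders): for (a << 1) + (b << 8), emitting
//   lsl x8, x1, #8 ; add x0, x8, x0, lsl #1
// keeps the cheap shift (amount <= 4) fused into the ADD operand, rather than
// fusing the more expensive lsl #8 form.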
19782
19783// The mid end will reassociate sub(sub(x, m1), m2) to sub(x, add(m1, m2))
19784// This reassociates it back to allow the creation of more mls instructions.
19785static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG) {
19786  if (N->getOpcode() != ISD::SUB)
19787 return SDValue();
19788
19789 SDValue Add = N->getOperand(1);
19790 SDValue X = N->getOperand(0);
19791 if (Add.getOpcode() != ISD::ADD)
19792 return SDValue();
19793
19794 if (!Add.hasOneUse())
19795 return SDValue();
19796  if (DAG.isConstantIntBuildVectorOrConstantInt(X))
19797    return SDValue();
19798
19799 SDValue M1 = Add.getOperand(0);
19800 SDValue M2 = Add.getOperand(1);
19801 if (M1.getOpcode() != ISD::MUL && M1.getOpcode() != AArch64ISD::SMULL &&
19802 M1.getOpcode() != AArch64ISD::UMULL)
19803 return SDValue();
19804 if (M2.getOpcode() != ISD::MUL && M2.getOpcode() != AArch64ISD::SMULL &&
19805      M2.getOpcode() != AArch64ISD::UMULL)
19806    return SDValue();
19807
19808 EVT VT = N->getValueType(0);
19809 SDValue Sub = DAG.getNode(ISD::SUB, SDLoc(N), VT, X, M1);
19810 return DAG.getNode(ISD::SUB, SDLoc(N), VT, Sub, M2);
19811}
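// Rough example of the reassociation above (a sketch): for vectors,
//   x - (a*b + c*d)
// is rewritten as (x - a*b) - c*d, which selects to two accumulating
// multiply-subtracts, e.g.
//   mls v0.4s, v1.4s, v2.4s ; mls v0.4s, v3.4s, v4.4s
// instead of separate multiplies followed by an add and a subtract.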
19812
19813// Combine into mla/mls.
19814// This works on the patterns of:
19815// add v1, (mul v2, v3)
19816// sub v1, (mul v2, v3)
19817// for vectors of type <1 x i64> and <2 x i64> when SVE is available.
19818// It will transform the add/sub to a scalable version, so that we can
19819// make use of SVE's MLA/MLS that will be generated for that pattern
19820static SDValue
19821performSVEMulAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
19822  SelectionDAG &DAG = DCI.DAG;
19823 // Make sure that the types are legal
19824 if (!DCI.isAfterLegalizeDAG())
19825 return SDValue();
19826 // Before using SVE's features, check first if it's available.
19827 if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE())
19828 return SDValue();
19829
19830 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::SUB)
19831 return SDValue();
19832
19833 if (!N->getValueType(0).isFixedLengthVector())
19834 return SDValue();
19835
19836 auto performOpt = [&DAG, &N](SDValue Op0, SDValue Op1) -> SDValue {
19837 if (Op1.getOpcode() != ISD::EXTRACT_SUBVECTOR)
19838 return SDValue();
19839
19840 if (!cast<ConstantSDNode>(Op1->getOperand(1))->isZero())
19841 return SDValue();
19842
19843 SDValue MulValue = Op1->getOperand(0);
19844 if (MulValue.getOpcode() != AArch64ISD::MUL_PRED)
19845 return SDValue();
19846
19847 if (!Op1.hasOneUse() || !MulValue.hasOneUse())
19848 return SDValue();
19849
19850 EVT ScalableVT = MulValue.getValueType();
19851 if (!ScalableVT.isScalableVector())
19852 return SDValue();
19853
19854 SDValue ScaledOp = convertToScalableVector(DAG, ScalableVT, Op0);
19855 SDValue NewValue =
19856 DAG.getNode(N->getOpcode(), SDLoc(N), ScalableVT, {ScaledOp, MulValue});
19857 return convertFromScalableVector(DAG, N->getValueType(0), NewValue);
19858 };
19859
19860 if (SDValue res = performOpt(N->getOperand(0), N->getOperand(1)))
19861 return res;
19862 else if (N->getOpcode() == ISD::ADD)
19863 return performOpt(N->getOperand(1), N->getOperand(0));
19864
19865 return SDValue();
19866}
19867
19868// Given a i64 add from a v1i64 extract, convert to a neon v1i64 add. This can
19869// help, for example, to produce ssra from sshr+add.
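// Illustrative example (operands are placeholders):
//   add(extract_elt(v1i64 x, 0), i64 load)
// becomes
//   extract_elt(add(v1i64 x, scalar_to_vector(load)), 0)
// letting patterns such as ssra (sshr + add) match on the vector form.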
19870 static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG) {
19871 EVT VT = N->getValueType(0);
19872 if (VT != MVT::i64)
19873 return SDValue();
19874 SDValue Op0 = N->getOperand(0);
19875 SDValue Op1 = N->getOperand(1);
19876
19877 // At least one of the operands should be an extract, and the other should be
19878 // something that is easy to convert to v1i64 type (in this case a load).
19879 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
19880 Op0.getOpcode() != ISD::LOAD)
19881 return SDValue();
19882 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
19883 Op1.getOpcode() != ISD::LOAD)
19884 return SDValue();
19885
19886 SDLoc DL(N);
19887 if (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19888 Op0.getOperand(0).getValueType() == MVT::v1i64) {
19889 Op0 = Op0.getOperand(0);
19890 Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op1);
19891 } else if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19892 Op1.getOperand(0).getValueType() == MVT::v1i64) {
19893 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op0);
19894 Op1 = Op1.getOperand(0);
19895 } else
19896 return SDValue();
19897
19898 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
19899 DAG.getNode(N->getOpcode(), DL, MVT::v1i64, Op0, Op1),
19900 DAG.getConstant(0, DL, MVT::i64));
19901}
19902
19903 static bool isLoadOrMultipleLoads(SDValue B, SmallVector<LoadSDNode *> &Loads) {
19904 SDValue BV = peekThroughOneUseBitcasts(B);
19905 if (!BV->hasOneUse())
19906 return false;
19907 if (auto *Ld = dyn_cast<LoadSDNode>(BV)) {
19908 if (!Ld || !Ld->isSimple())
19909 return false;
19910 Loads.push_back(Ld);
19911 return true;
19912 } else if (BV.getOpcode() == ISD::BUILD_VECTOR ||
19913 BV.getOpcode() == ISD::CONCAT_VECTORS) {
19914 for (unsigned Op = 0; Op < BV.getNumOperands(); Op++) {
19915 auto *Ld = dyn_cast<LoadSDNode>(BV.getOperand(Op));
19916 if (!Ld || !Ld->isSimple() || !BV.getOperand(Op).hasOneUse())
19917 return false;
19918 Loads.push_back(Ld);
19919 }
19920 return true;
19921 } else if (B.getOpcode() == ISD::VECTOR_SHUFFLE) {
19922 // Try to find a tree of shuffles and concats from how IR shuffles of loads
19923 // are lowered. Note that this only comes up because we do not always visit
19924 // operands before uses. After that is fixed this can be removed; in the
19925 // meantime this is fairly specific to the lowering we expect from IR.
19926 // t46: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> t44, t45
19927 // t44: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> t42, t43
19928 // t42: v16i8 = concat_vectors t40, t36, undef:v4i8, undef:v4i8
19929 // t40: v4i8,ch = load<(load (s32) from %ir.17)> t0, t22, undef:i64
19930 // t36: v4i8,ch = load<(load (s32) from %ir.13)> t0, t18, undef:i64
19931 // t43: v16i8 = concat_vectors t32, undef:v4i8, undef:v4i8, undef:v4i8
19932 // t32: v4i8,ch = load<(load (s32) from %ir.9)> t0, t14, undef:i64
19933 // t45: v16i8 = concat_vectors t28, undef:v4i8, undef:v4i8, undef:v4i8
19934 // t28: v4i8,ch = load<(load (s32) from %ir.0)> t0, t2, undef:i64
19935 if (B.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE ||
19936 B.getOperand(0).getOperand(0).getOpcode() != ISD::CONCAT_VECTORS ||
19937 B.getOperand(0).getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
19938 B.getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
19939 B.getOperand(1).getNumOperands() != 4)
19940 return false;
19941 auto SV1 = cast<ShuffleVectorSDNode>(B);
19942 auto SV2 = cast<ShuffleVectorSDNode>(B.getOperand(0));
19943 int NumElts = B.getValueType().getVectorNumElements();
19944 int NumSubElts = NumElts / 4;
19945 for (int I = 0; I < NumSubElts; I++) {
19946 // <0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19>
19947 if (SV1->getMaskElt(I) != I ||
19948 SV1->getMaskElt(I + NumSubElts) != I + NumSubElts ||
19949 SV1->getMaskElt(I + NumSubElts * 2) != I + NumSubElts * 2 ||
19950 SV1->getMaskElt(I + NumSubElts * 3) != I + NumElts)
19951 return false;
19952 // <0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u>
19953 if (SV2->getMaskElt(I) != I ||
19954 SV2->getMaskElt(I + NumSubElts) != I + NumSubElts ||
19955 SV2->getMaskElt(I + NumSubElts * 2) != I + NumElts)
19956 return false;
19957 }
19958 auto *Ld0 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(0));
19959 auto *Ld1 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(1));
19960 auto *Ld2 = dyn_cast<LoadSDNode>(SV2->getOperand(1).getOperand(0));
19961 auto *Ld3 = dyn_cast<LoadSDNode>(B.getOperand(1).getOperand(0));
19962 if (!Ld0 || !Ld1 || !Ld2 || !Ld3 || !Ld0->isSimple() || !Ld1->isSimple() ||
19963 !Ld2->isSimple() || !Ld3->isSimple())
19964 return false;
19965 Loads.push_back(Ld0);
19966 Loads.push_back(Ld1);
19967 Loads.push_back(Ld2);
19968 Loads.push_back(Ld3);
19969 return true;
19970 }
19971 return false;
19972}
19973
19974 static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1,
19975 SelectionDAG &DAG,
19976 unsigned &NumSubLoads) {
19977 if (!Op0.hasOneUse() || !Op1.hasOneUse())
19978 return false;
19979
19980 SmallVector<LoadSDNode *> Loads0, Loads1;
19981 if (isLoadOrMultipleLoads(Op0, Loads0) &&
19982 isLoadOrMultipleLoads(Op1, Loads1)) {
19983 if (NumSubLoads && Loads0.size() != NumSubLoads)
19984 return false;
19985 NumSubLoads = Loads0.size();
19986 return Loads0.size() == Loads1.size() &&
19987 all_of(zip(Loads0, Loads1), [&DAG](auto L) {
19988 unsigned Size = get<0>(L)->getValueType(0).getSizeInBits();
19989 return Size == get<1>(L)->getValueType(0).getSizeInBits() &&
19990 DAG.areNonVolatileConsecutiveLoads(get<1>(L), get<0>(L),
19991 Size / 8, 1);
19992 });
19993 }
19994
19995 if (Op0.getOpcode() != Op1.getOpcode())
19996 return false;
19997
19998 switch (Op0.getOpcode()) {
19999 case ISD::ADD:
20000 case ISD::SUB:
20001 return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0),
20002 DAG, NumSubLoads) &&
20003 areLoadedOffsetButOtherwiseSame(Op0.getOperand(1), Op1.getOperand(1),
20004 DAG, NumSubLoads);
20005 case ISD::SIGN_EXTEND:
20006 case ISD::ANY_EXTEND:
20007 case ISD::ZERO_EXTEND:
20008 EVT XVT = Op0.getOperand(0).getValueType();
20009 if (XVT.getScalarSizeInBits() != 8 && XVT.getScalarSizeInBits() != 16 &&
20010 XVT.getScalarSizeInBits() != 32)
20011 return false;
20012 return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0),
20013 DAG, NumSubLoads);
20014 }
20015 return false;
20016}
20017
20018// This method attempts to fold trees of add(ext(load p), shl(ext(load p+4))
20019 // into a single load of twice the size, from which we extract the bottom and
20020 // top parts so that the shl can use a shll2 instruction. The two loads in that
20021// example can also be larger trees of instructions, which are identical except
20022// for the leaves which are all loads offset from the LHS, including
20023// buildvectors of multiple loads. For example the RHS tree could be
20024// sub(zext(buildvec(load p+4, load q+4)), zext(buildvec(load r+4, load s+4)))
20025// Whilst it can be common for the larger loads to replace LDP instructions
20026 // (which doesn't gain anything on its own), the larger loads can help create
20027// more efficient code, and in buildvectors prevent the need for ld1 lane
20028// inserts which can be slower than normal loads.
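// Illustrative sketch (types and pointers are placeholders):
//   add(zext(load <8 x i8> p), shl(zext(load <8 x i8> p+8), splat(8)))
// becomes a single load <16 x i8> p whose low and high halves are extracted,
// extended and recombined, with the shl applied only to the high half.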
20029 static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG) {
20030 EVT VT = N->getValueType(0);
20031 if (!VT.isFixedLengthVector() ||
20032 (VT.getScalarSizeInBits() != 16 && VT.getScalarSizeInBits() != 32 &&
20033 VT.getScalarSizeInBits() != 64))
20034 return SDValue();
20035
20036 SDValue Other = N->getOperand(0);
20037 SDValue Shift = N->getOperand(1);
20038 if (Shift.getOpcode() != ISD::SHL && N->getOpcode() != ISD::SUB)
20039 std::swap(Shift, Other);
20040 APInt ShiftAmt;
20041 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse() ||
20042 !ISD::isConstantSplatVector(Shift.getOperand(1).getNode(), ShiftAmt))
20043 return SDValue();
20044
20045 if (!ISD::isExtOpcode(Shift.getOperand(0).getOpcode()) ||
20046 !ISD::isExtOpcode(Other.getOpcode()) ||
20047 Shift.getOperand(0).getOperand(0).getValueType() !=
20048 Other.getOperand(0).getValueType() ||
20049 !Other.hasOneUse() || !Shift.getOperand(0).hasOneUse())
20050 return SDValue();
20051
20052 SDValue Op0 = Other.getOperand(0);
20053 SDValue Op1 = Shift.getOperand(0).getOperand(0);
20054
20055 unsigned NumSubLoads = 0;
20056 if (!areLoadedOffsetButOtherwiseSame(Op0, Op1, DAG, NumSubLoads))
20057 return SDValue();
20058
20059 // Attempt to rule out some unprofitable cases using heuristics (some working
20060 // around suboptimal code generation), notably if the extend would not be able
20061 // to use ushll2 instructions as the types are not large enough. Otherwise zips
20062 // will need to be created which can increase the instruction count.
20063 unsigned NumElts = Op0.getValueType().getVectorNumElements();
20064 unsigned NumSubElts = NumElts / NumSubLoads;
20065 if (NumSubElts * VT.getScalarSizeInBits() < 128 ||
20066 (Other.getOpcode() != Shift.getOperand(0).getOpcode() &&
20067 Op0.getValueType().getSizeInBits() < 128 &&
20069 return SDValue();
20070
20071 // Recreate the tree with the new combined loads.
20072 std::function<SDValue(SDValue, SDValue, SelectionDAG &)> GenCombinedTree =
20073 [&GenCombinedTree](SDValue Op0, SDValue Op1, SelectionDAG &DAG) {
20074 EVT DVT =
20075 Op0.getValueType().getDoubleNumVectorElementsVT(*DAG.getContext());
20076
20077 SmallVector<LoadSDNode *> Loads0, Loads1;
20078 if (isLoadOrMultipleLoads(Op0, Loads0) &&
20079 isLoadOrMultipleLoads(Op1, Loads1)) {
20080 EVT LoadVT = EVT::getVectorVT(
20081 *DAG.getContext(), Op0.getValueType().getScalarType(),
20082 Op0.getValueType().getVectorNumElements() / Loads0.size());
20083 EVT DLoadVT = LoadVT.getDoubleNumVectorElementsVT(*DAG.getContext());
20084
20085 SmallVector<SDValue> NewLoads;
20086 for (const auto &[L0, L1] : zip(Loads0, Loads1)) {
20087 SDValue Load = DAG.getLoad(DLoadVT, SDLoc(L0), L0->getChain(),
20088 L0->getBasePtr(), L0->getPointerInfo(),
20089 L0->getOriginalAlign());
20090 DAG.makeEquivalentMemoryOrdering(L0, Load.getValue(1));
20091 DAG.makeEquivalentMemoryOrdering(L1, Load.getValue(1));
20092 NewLoads.push_back(Load);
20093 }
20094 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op0), DVT, NewLoads);
20095 }
20096
20097 SmallVector<SDValue> Ops;
20098 for (const auto &[O0, O1] : zip(Op0->op_values(), Op1->op_values()))
20099 Ops.push_back(GenCombinedTree(O0, O1, DAG));
20100 return DAG.getNode(Op0.getOpcode(), SDLoc(Op0), DVT, Ops);
20101 };
20102 SDValue NewOp = GenCombinedTree(Op0, Op1, DAG);
20103
20104 SmallVector<int> LowMask(NumElts, 0), HighMask(NumElts, 0);
20105 int Hi = NumSubElts, Lo = 0;
20106 for (unsigned i = 0; i < NumSubLoads; i++) {
20107 for (unsigned j = 0; j < NumSubElts; j++) {
20108 LowMask[i * NumSubElts + j] = Lo++;
20109 HighMask[i * NumSubElts + j] = Hi++;
20110 }
20111 Lo += NumSubElts;
20112 Hi += NumSubElts;
20113 }
20114 SDLoc DL(N);
20115 SDValue Ext0, Ext1;
20116 // Extract the top and bottom lanes, then extend the result. Alternatively,
20117 // extend the result first and then extract the lanes if the two extend
20118 // opcodes match, as that produces slightly smaller code.
20119 if (Other.getOpcode() != Shift.getOperand(0).getOpcode()) {
20120 SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(),
20121 NewOp, DAG.getConstant(0, DL, MVT::i64));
20122 SDValue SubH =
20123 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(), NewOp,
20124 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
20125 SDValue Extr0 =
20126 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, LowMask);
20127 SDValue Extr1 =
20128 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, HighMask);
20129 Ext0 = DAG.getNode(Other.getOpcode(), DL, VT, Extr0);
20130 Ext1 = DAG.getNode(Shift.getOperand(0).getOpcode(), DL, VT, Extr1);
20131 } else {
20132 EVT DVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
20133 SDValue Ext = DAG.getNode(Other.getOpcode(), DL, DVT, NewOp);
20134 SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
20135 DAG.getConstant(0, DL, MVT::i64));
20136 SDValue SubH =
20137 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
20138 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
20139 Ext0 = DAG.getVectorShuffle(VT, DL, SubL, SubH, LowMask);
20140 Ext1 = DAG.getVectorShuffle(VT, DL, SubL, SubH, HighMask);
20141 }
20142 SDValue NShift =
20143 DAG.getNode(Shift.getOpcode(), DL, VT, Ext1, Shift.getOperand(1));
20144 return DAG.getNode(N->getOpcode(), DL, VT, Ext0, NShift);
20145}
20146
20147 static SDValue performAddSubCombine(SDNode *N,
20148 TargetLowering::DAGCombinerInfo &DCI) {
20149 // Try to change sum of two reductions.
20150 if (SDValue Val = performAddUADDVCombine(N, DCI.DAG))
20151 return Val;
20152 if (SDValue Val = performAddDotCombine(N, DCI.DAG))
20153 return Val;
20154 if (SDValue Val = performAddCSelIntoCSinc(N, DCI.DAG))
20155 return Val;
20156 if (SDValue Val = performNegCSelCombine(N, DCI.DAG))
20157 return Val;
20158 if (SDValue Val = performVectorAddSubExtCombine(N, DCI.DAG))
20159 return Val;
20160 if (SDValue Val = performAddCombineForShiftedOperands(N, DCI.DAG))
20161 return Val;
20162 if (SDValue Val = performSubAddMULCombine(N, DCI.DAG))
20163 return Val;
20164 if (SDValue Val = performSVEMulAddSubCombine(N, DCI))
20165 return Val;
20166 if (SDValue Val = performAddSubIntoVectorOp(N, DCI.DAG))
20167 return Val;
20168
20169 if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG))
20170 return Val;
20171
20172 return performAddSubLongCombine(N, DCI);
20173}
20174
20175// Massage DAGs which we can use the high-half "long" operations on into
20176// something isel will recognize better. E.g.
20177//
20178// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
20179// (aarch64_neon_umull (extract_high (v2i64 vec)))
20180// (extract_high (v2i64 (dup128 scalar)))))
20181//
20182 static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
20183 TargetLowering::DAGCombinerInfo &DCI,
20184 SelectionDAG &DAG) {
20185 if (DCI.isBeforeLegalizeOps())
20186 return SDValue();
20187
20188 SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1);
20189 SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2);
20190 assert(LHS.getValueType().is64BitVector() &&
20191 RHS.getValueType().is64BitVector() &&
20192 "unexpected shape for long operation");
20193
20194 // Either node could be a DUP, but it's not worth doing both of them (you'd
20195 // just as well use the non-high version) so look for a corresponding extract
20196 // operation on the other "wing".
20197 if (isEssentiallyExtractHighSubvector(LHS)) {
20198 RHS = tryExtendDUPToExtractHigh(RHS, DAG);
20199 if (!RHS.getNode())
20200 return SDValue();
20201 } else if (isEssentiallyExtractHighSubvector(RHS)) {
20202 LHS = tryExtendDUPToExtractHigh(LHS, DAG);
20203 if (!LHS.getNode())
20204 return SDValue();
20205 } else
20206 return SDValue();
20207
20208 if (IID == Intrinsic::not_intrinsic)
20209 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS);
20210
20211 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
20212 N->getOperand(0), LHS, RHS);
20213}
20214
20215static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
20216 MVT ElemTy = N->getSimpleValueType(0).getScalarType();
20217 unsigned ElemBits = ElemTy.getSizeInBits();
20218
20219 int64_t ShiftAmount;
20220 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
20221 APInt SplatValue, SplatUndef;
20222 unsigned SplatBitSize;
20223 bool HasAnyUndefs;
20224 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
20225 HasAnyUndefs, ElemBits) ||
20226 SplatBitSize != ElemBits)
20227 return SDValue();
20228
20229 ShiftAmount = SplatValue.getSExtValue();
20230 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
20231 ShiftAmount = CVN->getSExtValue();
20232 } else
20233 return SDValue();
20234
20235 // If the shift amount is zero, remove the shift intrinsic.
20236 if (ShiftAmount == 0 && IID != Intrinsic::aarch64_neon_sqshlu)
20237 return N->getOperand(1);
20238
20239 unsigned Opcode;
20240 bool IsRightShift;
20241 switch (IID) {
20242 default:
20243 llvm_unreachable("Unknown shift intrinsic");
20244 case Intrinsic::aarch64_neon_sqshl:
20245 Opcode = AArch64ISD::SQSHL_I;
20246 IsRightShift = false;
20247 break;
20248 case Intrinsic::aarch64_neon_uqshl:
20249 Opcode = AArch64ISD::UQSHL_I;
20250 IsRightShift = false;
20251 break;
20252 case Intrinsic::aarch64_neon_srshl:
20253 Opcode = AArch64ISD::SRSHR_I;
20254 IsRightShift = true;
20255 break;
20256 case Intrinsic::aarch64_neon_urshl:
20257 Opcode = AArch64ISD::URSHR_I;
20258 IsRightShift = true;
20259 break;
20260 case Intrinsic::aarch64_neon_sqshlu:
20261 Opcode = AArch64ISD::SQSHLU_I;
20262 IsRightShift = false;
20263 break;
20264 case Intrinsic::aarch64_neon_sshl:
20265 case Intrinsic::aarch64_neon_ushl:
20266 // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
20267 // left shift for positive shift amounts. For negative shifts we can use
20268 // VASHR/VLSHR as appropriate.
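// For example (illustrative): ushl(v, splat(-3)) becomes VLSHR v, #3, while
// sshl(v, splat(2)) becomes VSHL v, #2.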
20269 if (ShiftAmount < 0) {
20270 Opcode = IID == Intrinsic::aarch64_neon_sshl ? AArch64ISD::VASHR
20271 : AArch64ISD::VLSHR;
20272 ShiftAmount = -ShiftAmount;
20273 } else
20274 Opcode = AArch64ISD::VSHL;
20275 IsRightShift = false;
20276 break;
20277 }
20278
20279 EVT VT = N->getValueType(0);
20280 SDValue Op = N->getOperand(1);
20281 SDLoc dl(N);
20282 if (VT == MVT::i64) {
20283 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op);
20284 VT = MVT::v1i64;
20285 }
20286
20287 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
20288 Op = DAG.getNode(Opcode, dl, VT, Op,
20289 DAG.getConstant(-ShiftAmount, dl, MVT::i32));
20290 if (N->getValueType(0) == MVT::i64)
20291 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
20292 DAG.getConstant(0, dl, MVT::i64));
20293 return Op;
20294 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
20295 Op = DAG.getNode(Opcode, dl, VT, Op,
20296 DAG.getConstant(ShiftAmount, dl, MVT::i32));
20297 if (N->getValueType(0) == MVT::i64)
20298 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
20299 DAG.getConstant(0, dl, MVT::i64));
20300 return Op;
20301 }
20302
20303 return SDValue();
20304}
20305
20306// The CRC32[BH] instructions ignore the high bits of their data operand. Since
20307// the intrinsics must be legal and take an i32, this means there's almost
20308// certainly going to be a zext in the DAG which we can eliminate.
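// For example (illustrative): crc32b(w0, and(w1, 0xff)) is rewritten below as
// crc32b(w0, w1), dropping the redundant mask.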
20309static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
20310 SDValue AndN = N->getOperand(2);
20311 if (AndN.getOpcode() != ISD::AND)
20312 return SDValue();
20313
20314 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
20315 if (!CMask || CMask->getZExtValue() != Mask)
20316 return SDValue();
20317
20318 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
20319 N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
20320}
20321
20322 static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
20323 SelectionDAG &DAG) {
20324 SDLoc dl(N);
20325 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
20326 DAG.getNode(Opc, dl,
20327 N->getOperand(1).getSimpleValueType(),
20328 N->getOperand(1)),
20329 DAG.getConstant(0, dl, MVT::i64));
20330}
20331
20332 static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) {
20333 SDLoc DL(N);
20334 SDValue Op1 = N->getOperand(1);
20335 SDValue Op2 = N->getOperand(2);
20336 EVT ScalarTy = Op2.getValueType();
20337 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
20338 ScalarTy = MVT::i32;
20339
20340 // Lower index_vector(base, step) to mul(step, step_vector(1)) + splat(base).
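// For example (illustrative): index(2, 3) for nxv4i32 expands to
// add(mul(step_vector(1), splat(3)), splat(2)), i.e. <2, 5, 8, 11, ...>.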
20341 SDValue StepVector = DAG.getStepVector(DL, N->getValueType(0));
20342 SDValue Step = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op2);
20343 SDValue Mul = DAG.getNode(ISD::MUL, DL, N->getValueType(0), StepVector, Step);
20344 SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1);
20345 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Mul, Base);
20346}
20347
20348 static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) {
20349 SDLoc dl(N);
20350 SDValue Scalar = N->getOperand(3);
20351 EVT ScalarTy = Scalar.getValueType();
20352
20353 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
20354 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
20355
20356 SDValue Passthru = N->getOperand(1);
20357 SDValue Pred = N->getOperand(2);
20358 return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0),
20359 Pred, Scalar, Passthru);
20360}
20361
20362 static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
20363 SDLoc dl(N);
20364 LLVMContext &Ctx = *DAG.getContext();
20365 EVT VT = N->getValueType(0);
20366
20367 assert(VT.isScalableVector() && "Expected a scalable vector.");
20368
20369 // Current lowering only supports the SVE-ACLE types.
20371 return SDValue();
20372
20373 unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
20374 unsigned ByteSize = VT.getSizeInBits().getKnownMinValue() / 8;
20375 EVT ByteVT =
20376 EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize));
20377
20378 // Convert everything to the domain of EXT (i.e. bytes).
20379 SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(1));
20380 SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(2));
20381 SDValue Op2 = DAG.getNode(ISD::MUL, dl, MVT::i32, N->getOperand(3),
20382 DAG.getConstant(ElemSize, dl, MVT::i32));
20383
20384 SDValue EXT = DAG.getNode(AArch64ISD::EXT, dl, ByteVT, Op0, Op1, Op2);
20385 return DAG.getNode(ISD::BITCAST, dl, VT, EXT);
20386}
20387
20388 static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC,
20389 TargetLowering::DAGCombinerInfo &DCI,
20390 SelectionDAG &DAG) {
20391 if (DCI.isBeforeLegalize())
20392 return SDValue();
20393
20394 SDValue Comparator = N->getOperand(3);
20395 if (Comparator.getOpcode() == AArch64ISD::DUP ||
20396 Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
20397 unsigned IID = getIntrinsicID(N);
20398 EVT VT = N->getValueType(0);
20399 EVT CmpVT = N->getOperand(2).getValueType();
20400 SDValue Pred = N->getOperand(1);
20401 SDValue Imm;
20402 SDLoc DL(N);
20403
20404 switch (IID) {
20405 default:
20406 llvm_unreachable("Called with wrong intrinsic!");
20407 break;
20408
20409 // Signed comparisons
20410 case Intrinsic::aarch64_sve_cmpeq_wide:
20411 case Intrinsic::aarch64_sve_cmpne_wide:
20412 case Intrinsic::aarch64_sve_cmpge_wide:
20413 case Intrinsic::aarch64_sve_cmpgt_wide:
20414 case Intrinsic::aarch64_sve_cmplt_wide:
20415 case Intrinsic::aarch64_sve_cmple_wide: {
20416 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
20417 int64_t ImmVal = CN->getSExtValue();
20418 if (ImmVal >= -16 && ImmVal <= 15)
20419 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
20420 else
20421 return SDValue();
20422 }
20423 break;
20424 }
20425 // Unsigned comparisons
20426 case Intrinsic::aarch64_sve_cmphs_wide:
20427 case Intrinsic::aarch64_sve_cmphi_wide:
20428 case Intrinsic::aarch64_sve_cmplo_wide:
20429 case Intrinsic::aarch64_sve_cmpls_wide: {
20430 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
20431 uint64_t ImmVal = CN->getZExtValue();
20432 if (ImmVal <= 127)
20433 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
20434 else
20435 return SDValue();
20436 }
20437 break;
20438 }
20439 }
20440
20441 if (!Imm)
20442 return SDValue();
20443
20444 SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
20445 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
20446 N->getOperand(2), Splat, DAG.getCondCode(CC));
20447 }
20448
20449 return SDValue();
20450}
20451
20452 static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
20453 AArch64CC::CondCode Cond) {
20454 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20455
20456 SDLoc DL(Op);
20457 assert(Op.getValueType().isScalableVector() &&
20458 TLI.isTypeLegal(Op.getValueType()) &&
20459 "Expected legal scalable vector type!");
20460 assert(Op.getValueType() == Pg.getValueType() &&
20461 "Expected same type for PTEST operands");
20462
20463 // Ensure target specific opcodes are using legal type.
20464 EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
20465 SDValue TVal = DAG.getConstant(1, DL, OutVT);
20466 SDValue FVal = DAG.getConstant(0, DL, OutVT);
20467
20468 // Ensure operands have type nxv16i1.
20469 if (Op.getValueType() != MVT::nxv16i1) {
20472 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pg);
20473 else
20474 Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG);
20475 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op);
20476 }
20477
20478 // Set condition code (CC) flags.
20479 SDValue Test = DAG.getNode(
20481 DL, MVT::Other, Pg, Op);
20482
20483 // Convert CC to integer based on requested condition.
20484 // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
20485 SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32);
20486 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
20487 return DAG.getZExtOrTrunc(Res, DL, VT);
20488}
20489
20490 static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc,
20491 SelectionDAG &DAG) {
20492 SDLoc DL(N);
20493
20494 SDValue Pred = N->getOperand(1);
20495 SDValue VecToReduce = N->getOperand(2);
20496
20497 // NOTE: The integer reduction's result type is not always linked to the
20498 // operand's element type so we construct it from the intrinsic's result type.
20499 EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0));
20500 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
20501
20502 // SVE reductions set the whole vector register with the first element
20503 // containing the reduction result, which we'll now extract.
20504 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
20505 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
20506 Zero);
20507}
20508
20509 static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc,
20510 SelectionDAG &DAG) {
20511 SDLoc DL(N);
20512
20513 SDValue Pred = N->getOperand(1);
20514 SDValue VecToReduce = N->getOperand(2);
20515
20516 EVT ReduceVT = VecToReduce.getValueType();
20517 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
20518
20519 // SVE reductions set the whole vector register with the first element
20520 // containing the reduction result, which we'll now extract.
20521 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
20522 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
20523 Zero);
20524}
20525
20526 static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
20527 SelectionDAG &DAG) {
20528 SDLoc DL(N);
20529
20530 SDValue Pred = N->getOperand(1);
20531 SDValue InitVal = N->getOperand(2);
20532 SDValue VecToReduce = N->getOperand(3);
20533 EVT ReduceVT = VecToReduce.getValueType();
20534
20535 // Ordered reductions use the first lane of the result vector as the
20536 // reduction's initial value.
20537 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
20538 InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
20539 DAG.getUNDEF(ReduceVT), InitVal, Zero);
20540
20541 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);
20542
20543 // SVE reductions set the whole vector register with the first element
20544 // containing the reduction result, which we'll now extract.
20545 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
20546 Zero);
20547}
20548
20549// If a merged operation has no inactive lanes we can relax it to a predicated
20550// or unpredicated operation, which potentially allows better isel (perhaps
20551// using immediate forms) or relaxing register reuse requirements.
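// For example (illustrative): a merged intrinsic op(pg, a, b) whose governing
// predicate pg is all active can be emitted either as the unpredicated node
// op(a, b) (when UnpredOp is set) or as the predicated node op(pg, a, b).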
20552 static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
20553 SelectionDAG &DAG, bool UnpredOp = false,
20554 bool SwapOperands = false) {
20555 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
20556 assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
20557 SDValue Pg = N->getOperand(1);
20558 SDValue Op1 = N->getOperand(SwapOperands ? 3 : 2);
20559 SDValue Op2 = N->getOperand(SwapOperands ? 2 : 3);
20560
20561 // ISD way to specify an all active predicate.
20562 if (isAllActivePredicate(DAG, Pg)) {
20563 if (UnpredOp)
20564 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op1, Op2);
20565
20566 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg, Op1, Op2);
20567 }
20568
20569 // FUTURE: SplatVector(true)
20570 return SDValue();
20571}
20572
20573 static SDValue tryCombineWhileLo(SDNode *N,
20574 TargetLowering::DAGCombinerInfo &DCI,
20575 const AArch64Subtarget *Subtarget) {
20576 if (DCI.isBeforeLegalize())
20577 return SDValue();
20578
20579 if (!Subtarget->hasSVE2p1())
20580 return SDValue();
20581
20582 if (!N->hasNUsesOfValue(2, 0))
20583 return SDValue();
20584
20585 const uint64_t HalfSize = N->getValueType(0).getVectorMinNumElements() / 2;
20586 if (HalfSize < 2)
20587 return SDValue();
20588
20589 auto It = N->use_begin();
20590 SDNode *Lo = *It++;
20591 SDNode *Hi = *It;
20592
20593 if (Lo->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
20594 Hi->getOpcode() != ISD::EXTRACT_SUBVECTOR)
20595 return SDValue();
20596
20597 uint64_t OffLo = Lo->getConstantOperandVal(1);
20598 uint64_t OffHi = Hi->getConstantOperandVal(1);
20599
20600 if (OffLo > OffHi) {
20601 std::swap(Lo, Hi);
20602 std::swap(OffLo, OffHi);
20603 }
20604
20605 if (OffLo != 0 || OffHi != HalfSize)
20606 return SDValue();
20607
20608 EVT HalfVec = Lo->getValueType(0);
20609 if (HalfVec != Hi->getValueType(0) ||
20610 HalfVec.getVectorElementCount() != ElementCount::getScalable(HalfSize))
20611 return SDValue();
20612
20613 SelectionDAG &DAG = DCI.DAG;
20614 SDLoc DL(N);
20615 SDValue ID =
20616 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo_x2, DL, MVT::i64);
20617 SDValue Idx = N->getOperand(1);
20618 SDValue TC = N->getOperand(2);
20619 if (Idx.getValueType() != MVT::i64) {
20620 Idx = DAG.getZExtOrTrunc(Idx, DL, MVT::i64);
20621 TC = DAG.getZExtOrTrunc(TC, DL, MVT::i64);
20622 }
20623 auto R =
20624 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL,
20625 {Lo->getValueType(0), Hi->getValueType(0)}, {ID, Idx, TC});
20626
20627 DCI.CombineTo(Lo, R.getValue(0));
20628 DCI.CombineTo(Hi, R.getValue(1));
20629
20630 return SDValue(N, 0);
20631}
20632
20635 const AArch64Subtarget *Subtarget) {
20636 SelectionDAG &DAG = DCI.DAG;
20637 unsigned IID = getIntrinsicID(N);
20638 switch (IID) {
20639 default:
20640 break;
20641 case Intrinsic::aarch64_neon_vcvtfxs2fp:
20642 case Intrinsic::aarch64_neon_vcvtfxu2fp:
20643 return tryCombineFixedPointConvert(N, DCI, DAG);
20644 case Intrinsic::aarch64_neon_saddv:
20645 return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
20646 case Intrinsic::aarch64_neon_uaddv:
20647 return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
20648 case Intrinsic::aarch64_neon_sminv:
20649 return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
20650 case Intrinsic::aarch64_neon_uminv:
20651 return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
20652 case Intrinsic::aarch64_neon_smaxv:
20653 return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
20654 case Intrinsic::aarch64_neon_umaxv:
20655 return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
20656 case Intrinsic::aarch64_neon_fmax:
20657 return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
20658 N->getOperand(1), N->getOperand(2));
20659 case Intrinsic::aarch64_neon_fmin:
20660 return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
20661 N->getOperand(1), N->getOperand(2));
20662 case Intrinsic::aarch64_neon_fmaxnm:
20663 return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
20664 N->getOperand(1), N->getOperand(2));
20665 case Intrinsic::aarch64_neon_fminnm:
20666 return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
20667 N->getOperand(1), N->getOperand(2));
20668 case Intrinsic::aarch64_neon_smull:
20669 return DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0),
20670 N->getOperand(1), N->getOperand(2));
20671 case Intrinsic::aarch64_neon_umull:
20672 return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0),
20673 N->getOperand(1), N->getOperand(2));
20674 case Intrinsic::aarch64_neon_pmull:
20675 return DAG.getNode(AArch64ISD::PMULL, SDLoc(N), N->getValueType(0),
20676 N->getOperand(1), N->getOperand(2));
20677 case Intrinsic::aarch64_neon_sqdmull:
20678 return tryCombineLongOpWithDup(IID, N, DCI, DAG);
20679 case Intrinsic::aarch64_neon_sqshl:
20680 case Intrinsic::aarch64_neon_uqshl:
20681 case Intrinsic::aarch64_neon_sqshlu:
20682 case Intrinsic::aarch64_neon_srshl:
20683 case Intrinsic::aarch64_neon_urshl:
20684 case Intrinsic::aarch64_neon_sshl:
20685 case Intrinsic::aarch64_neon_ushl:
20686 return tryCombineShiftImm(IID, N, DAG);
20687 case Intrinsic::aarch64_neon_sabd:
20688 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
20689 N->getOperand(1), N->getOperand(2));
20690 case Intrinsic::aarch64_neon_uabd:
20691 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
20692 N->getOperand(1), N->getOperand(2));
20693 case Intrinsic::aarch64_crc32b:
20694 case Intrinsic::aarch64_crc32cb:
20695 return tryCombineCRC32(0xff, N, DAG);
20696 case Intrinsic::aarch64_crc32h:
20697 case Intrinsic::aarch64_crc32ch:
20698 return tryCombineCRC32(0xffff, N, DAG);
20699 case Intrinsic::aarch64_sve_saddv:
20700 // There is no i64 version of SADDV because the sign is irrelevant.
20701 if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64)
20702 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
20703 else
20704 return combineSVEReductionInt(N, AArch64ISD::SADDV_PRED, DAG);
20705 case Intrinsic::aarch64_sve_uaddv:
20706 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
20707 case Intrinsic::aarch64_sve_smaxv:
20708 return combineSVEReductionInt(N, AArch64ISD::SMAXV_PRED, DAG);
20709 case Intrinsic::aarch64_sve_umaxv:
20710 return combineSVEReductionInt(N, AArch64ISD::UMAXV_PRED, DAG);
20711 case Intrinsic::aarch64_sve_sminv:
20712 return combineSVEReductionInt(N, AArch64ISD::SMINV_PRED, DAG);
20713 case Intrinsic::aarch64_sve_uminv:
20714 return combineSVEReductionInt(N, AArch64ISD::UMINV_PRED, DAG);
20715 case Intrinsic::aarch64_sve_orv:
20716 return combineSVEReductionInt(N, AArch64ISD::ORV_PRED, DAG);
20717 case Intrinsic::aarch64_sve_eorv:
20718 return combineSVEReductionInt(N, AArch64ISD::EORV_PRED, DAG);
20719 case Intrinsic::aarch64_sve_andv:
20720 return combineSVEReductionInt(N, AArch64ISD::ANDV_PRED, DAG);
20721 case Intrinsic::aarch64_sve_index:
20722 return LowerSVEIntrinsicIndex(N, DAG);
20723 case Intrinsic::aarch64_sve_dup:
20724 return LowerSVEIntrinsicDUP(N, DAG);
20725 case Intrinsic::aarch64_sve_dup_x:
20726 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
20727 N->getOperand(1));
20728 case Intrinsic::aarch64_sve_ext:
20729 return LowerSVEIntrinsicEXT(N, DAG);
20730 case Intrinsic::aarch64_sve_mul_u:
20731 return DAG.getNode(AArch64ISD::MUL_PRED, SDLoc(N), N->getValueType(0),
20732 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20733 case Intrinsic::aarch64_sve_smulh_u:
20734 return DAG.getNode(AArch64ISD::MULHS_PRED, SDLoc(N), N->getValueType(0),
20735 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20736 case Intrinsic::aarch64_sve_umulh_u:
20737 return DAG.getNode(AArch64ISD::MULHU_PRED, SDLoc(N), N->getValueType(0),
20738 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20739 case Intrinsic::aarch64_sve_smin_u:
20740 return DAG.getNode(AArch64ISD::SMIN_PRED, SDLoc(N), N->getValueType(0),
20741 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20742 case Intrinsic::aarch64_sve_umin_u:
20743 return DAG.getNode(AArch64ISD::UMIN_PRED, SDLoc(N), N->getValueType(0),
20744 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20745 case Intrinsic::aarch64_sve_smax_u:
20746 return DAG.getNode(AArch64ISD::SMAX_PRED, SDLoc(N), N->getValueType(0),
20747 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20748 case Intrinsic::aarch64_sve_umax_u:
20749 return DAG.getNode(AArch64ISD::UMAX_PRED, SDLoc(N), N->getValueType(0),
20750 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20751 case Intrinsic::aarch64_sve_lsl_u:
20752 return DAG.getNode(AArch64ISD::SHL_PRED, SDLoc(N), N->getValueType(0),
20753 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20754 case Intrinsic::aarch64_sve_lsr_u:
20755 return DAG.getNode(AArch64ISD::SRL_PRED, SDLoc(N), N->getValueType(0),
20756 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20757 case Intrinsic::aarch64_sve_asr_u:
20758 return DAG.getNode(AArch64ISD::SRA_PRED, SDLoc(N), N->getValueType(0),
20759 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20760 case Intrinsic::aarch64_sve_fadd_u:
20761 return DAG.getNode(AArch64ISD::FADD_PRED, SDLoc(N), N->getValueType(0),
20762 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20763 case Intrinsic::aarch64_sve_fdiv_u:
20764 return DAG.getNode(AArch64ISD::FDIV_PRED, SDLoc(N), N->getValueType(0),
20765 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20766 case Intrinsic::aarch64_sve_fmax_u:
20767 return DAG.getNode(AArch64ISD::FMAX_PRED, SDLoc(N), N->getValueType(0),
20768 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20769 case Intrinsic::aarch64_sve_fmaxnm_u:
20770 return DAG.getNode(AArch64ISD::FMAXNM_PRED, SDLoc(N), N->getValueType(0),
20771 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20772 case Intrinsic::aarch64_sve_fmla_u:
20773 return DAG.getNode(AArch64ISD::FMA_PRED, SDLoc(N), N->getValueType(0),
20774 N->getOperand(1), N->getOperand(3), N->getOperand(4),
20775 N->getOperand(2));
20776 case Intrinsic::aarch64_sve_fmin_u:
20777 return DAG.getNode(AArch64ISD::FMIN_PRED, SDLoc(N), N->getValueType(0),
20778 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20779 case Intrinsic::aarch64_sve_fminnm_u:
20780 return DAG.getNode(AArch64ISD::FMINNM_PRED, SDLoc(N), N->getValueType(0),
20781 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20782 case Intrinsic::aarch64_sve_fmul_u:
20783 return DAG.getNode(AArch64ISD::FMUL_PRED, SDLoc(N), N->getValueType(0),
20784 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20785 case Intrinsic::aarch64_sve_fsub_u:
20786 return DAG.getNode(AArch64ISD::FSUB_PRED, SDLoc(N), N->getValueType(0),
20787 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20788 case Intrinsic::aarch64_sve_add_u:
20789 return DAG.getNode(ISD::ADD, SDLoc(N), N->getValueType(0), N->getOperand(2),
20790 N->getOperand(3));
20791 case Intrinsic::aarch64_sve_sub_u:
20792 return DAG.getNode(ISD::SUB, SDLoc(N), N->getValueType(0), N->getOperand(2),
20793 N->getOperand(3));
20794 case Intrinsic::aarch64_sve_subr:
20795 return convertMergedOpToPredOp(N, ISD::SUB, DAG, true, true);
20796 case Intrinsic::aarch64_sve_and_u:
20797 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0), N->getOperand(2),
20798 N->getOperand(3));
20799 case Intrinsic::aarch64_sve_bic_u:
20800 return DAG.getNode(AArch64ISD::BIC, SDLoc(N), N->getValueType(0),
20801 N->getOperand(2), N->getOperand(3));
20802 case Intrinsic::aarch64_sve_eor_u:
20803 return DAG.getNode(ISD::XOR, SDLoc(N), N->getValueType(0), N->getOperand(2),
20804 N->getOperand(3));
20805 case Intrinsic::aarch64_sve_orr_u:
20806 return DAG.getNode(ISD::OR, SDLoc(N), N->getValueType(0), N->getOperand(2),
20807 N->getOperand(3));
20808 case Intrinsic::aarch64_sve_sabd_u:
20809 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
20810 N->getOperand(2), N->getOperand(3));
20811 case Intrinsic::aarch64_sve_uabd_u:
20812 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
20813 N->getOperand(2), N->getOperand(3));
20814 case Intrinsic::aarch64_sve_sdiv_u:
20815 return DAG.getNode(AArch64ISD::SDIV_PRED, SDLoc(N), N->getValueType(0),
20816 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20817 case Intrinsic::aarch64_sve_udiv_u:
20818 return DAG.getNode(AArch64ISD::UDIV_PRED, SDLoc(N), N->getValueType(0),
20819 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20820 case Intrinsic::aarch64_sve_sqadd:
20821 return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true);
20822 case Intrinsic::aarch64_sve_sqsub_u:
20823 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
20824 N->getOperand(2), N->getOperand(3));
20825 case Intrinsic::aarch64_sve_uqadd:
20826 return convertMergedOpToPredOp(N, ISD::UADDSAT, DAG, true);
20827 case Intrinsic::aarch64_sve_uqsub_u:
20828 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
20829 N->getOperand(2), N->getOperand(3));
20830 case Intrinsic::aarch64_sve_sqadd_x:
20831 return DAG.getNode(ISD::SADDSAT, SDLoc(N), N->getValueType(0),
20832 N->getOperand(1), N->getOperand(2));
20833 case Intrinsic::aarch64_sve_sqsub_x:
20834 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
20835 N->getOperand(1), N->getOperand(2));
20836 case Intrinsic::aarch64_sve_uqadd_x:
20837 return DAG.getNode(ISD::UADDSAT, SDLoc(N), N->getValueType(0),
20838 N->getOperand(1), N->getOperand(2));
20839 case Intrinsic::aarch64_sve_uqsub_x:
20840 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
20841 N->getOperand(1), N->getOperand(2));
20842 case Intrinsic::aarch64_sve_asrd:
20843 return DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, SDLoc(N), N->getValueType(0),
20844 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20845 case Intrinsic::aarch64_sve_cmphs:
20846 if (!N->getOperand(2).getValueType().isFloatingPoint())
20847 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
20848 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20849 N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
20850 break;
20851 case Intrinsic::aarch64_sve_cmphi:
20852 if (!N->getOperand(2).getValueType().isFloatingPoint())
20853 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
20854 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20855 N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
20856 break;
20857 case Intrinsic::aarch64_sve_fcmpge:
20858 case Intrinsic::aarch64_sve_cmpge:
20859 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
20860 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20861 N->getOperand(3), DAG.getCondCode(ISD::SETGE));
20862 break;
20863 case Intrinsic::aarch64_sve_fcmpgt:
20864 case Intrinsic::aarch64_sve_cmpgt:
20865 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
20866 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20867 N->getOperand(3), DAG.getCondCode(ISD::SETGT));
20868 break;
20869 case Intrinsic::aarch64_sve_fcmpeq:
20870 case Intrinsic::aarch64_sve_cmpeq:
20871 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
20872 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20873 N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
20874 break;
20875 case Intrinsic::aarch64_sve_fcmpne:
20876 case Intrinsic::aarch64_sve_cmpne:
20877 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
20878 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20879 N->getOperand(3), DAG.getCondCode(ISD::SETNE));
20880 break;
20881 case Intrinsic::aarch64_sve_fcmpuo:
20882 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
20883 N->getValueType(0), N->getOperand(1), N->getOperand(2),
20884 N->getOperand(3), DAG.getCondCode(ISD::SETUO));
20885 break;
20886 case Intrinsic::aarch64_sve_fadda:
20887 return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG);
20888 case Intrinsic::aarch64_sve_faddv:
20889 return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG);
20890 case Intrinsic::aarch64_sve_fmaxnmv:
20891 return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG);
20892 case Intrinsic::aarch64_sve_fmaxv:
20893 return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG);
20894 case Intrinsic::aarch64_sve_fminnmv:
20895 return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG);
20896 case Intrinsic::aarch64_sve_fminv:
20897 return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG);
20898 case Intrinsic::aarch64_sve_sel:
20899 return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
20900 N->getOperand(1), N->getOperand(2), N->getOperand(3));
20901 case Intrinsic::aarch64_sve_cmpeq_wide:
20902 return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
20903 case Intrinsic::aarch64_sve_cmpne_wide:
20904 return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
20905 case Intrinsic::aarch64_sve_cmpge_wide:
20906 return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
20907 case Intrinsic::aarch64_sve_cmpgt_wide:
20908 return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
20909 case Intrinsic::aarch64_sve_cmplt_wide:
20910 return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
20911 case Intrinsic::aarch64_sve_cmple_wide:
20912 return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
20913 case Intrinsic::aarch64_sve_cmphs_wide:
20914 return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
20915 case Intrinsic::aarch64_sve_cmphi_wide:
20916 return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
20917 case Intrinsic::aarch64_sve_cmplo_wide:
20918 return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
20919 case Intrinsic::aarch64_sve_cmpls_wide:
20920 return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);
20921 case Intrinsic::aarch64_sve_ptest_any:
20922 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
20923 AArch64CC::ANY_ACTIVE);
20924 case Intrinsic::aarch64_sve_ptest_first:
20925 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
20926 AArch64CC::FIRST_ACTIVE);
20927 case Intrinsic::aarch64_sve_ptest_last:
20928 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
20929 AArch64CC::LAST_ACTIVE);
20930 case Intrinsic::aarch64_sve_whilelo:
20931 return tryCombineWhileLo(N, DCI, Subtarget);
20932 }
20933 return SDValue();
20934}
20935
20936static bool isCheapToExtend(const SDValue &N) {
20937 unsigned OC = N->getOpcode();
20938 return OC == ISD::LOAD || OC == ISD::MLOAD ||
20940}
20941
20942static SDValue
20944 SelectionDAG &DAG) {
20945 // If we have (sext (setcc A B)) and A and B are cheap to extend,
20946 // we can move the sext into the arguments and have the same result. For
20947 // example, if A and B are both loads, we can make those extending loads and
20948 // avoid an extra instruction. This pattern appears often in VLS code
20949 // generation where the inputs to the setcc have a different size to the
20950 // instruction that wants to use the result of the setcc.
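// Illustrative example (operands are placeholders):
//   sext(setcc(load A, load B, cc))
// becomes
//   setcc(ext(load A), ext(load B), cc)
// in the wider type, letting the extensions fold into the loads.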
20951 assert(N->getOpcode() == ISD::SIGN_EXTEND &&
20952 N->getOperand(0)->getOpcode() == ISD::SETCC);
20953 const SDValue SetCC = N->getOperand(0);
20954
20955 const SDValue CCOp0 = SetCC.getOperand(0);
20956 const SDValue CCOp1 = SetCC.getOperand(1);
20957 if (!CCOp0->getValueType(0).isInteger() ||
20958 !CCOp1->getValueType(0).isInteger())
20959 return SDValue();
20960
20961 ISD::CondCode Code =
20962 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get();
20963
20964 ISD::NodeType ExtType =
20965 isSignedIntSetCC(Code) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
20966
20967 if (isCheapToExtend(SetCC.getOperand(0)) &&
20968 isCheapToExtend(SetCC.getOperand(1))) {
20969 const SDValue Ext1 =
20970 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp0);
20971 const SDValue Ext2 =
20972 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp1);
20973
20974 return DAG.getSetCC(
20975 SDLoc(SetCC), N->getValueType(0), Ext1, Ext2,
20976 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get());
20977 }
20978
20979 return SDValue();
20980}
20981
20984 SelectionDAG &DAG) {
20985 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
20986 // we can convert that DUP into another extract_high (of a bigger DUP), which
20987 // helps the backend to decide that an sabdl2 would be useful, saving a real
20988 // extract_high operation.
20989 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
20990 (N->getOperand(0).getOpcode() == ISD::ABDU ||
20991 N->getOperand(0).getOpcode() == ISD::ABDS)) {
20992 SDNode *ABDNode = N->getOperand(0).getNode();
20993 SDValue NewABD =
20994 tryCombineLongOpWithDup(Intrinsic::not_intrinsic, ABDNode, DCI, DAG);
20995 if (!NewABD.getNode())
20996 return SDValue();
20997
20998 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
20999 }
21000
21001 if (N->getValueType(0).isFixedLengthVector() &&
21002 N->getOpcode() == ISD::SIGN_EXTEND &&
21003 N->getOperand(0)->getOpcode() == ISD::SETCC)
21004 return performSignExtendSetCCCombine(N, DCI, DAG);
21005
21006 return SDValue();
21007}
21008
21009 static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
21010 SDValue SplatVal, unsigned NumVecElts) {
21011 assert(!St.isTruncatingStore() && "cannot split truncating vector store");
21012 Align OrigAlignment = St.getAlign();
21013 unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
21014
21015 // Create scalar stores. This is at least as good as the code sequence for a
21016 // split unaligned store which is a dup.s, ext.b, and two stores.
21017 // Most of the time the three stores should be replaced by store pair
21018 // instructions (stp).
21019 SDLoc DL(&St);
21020 SDValue BasePtr = St.getBasePtr();
21021 uint64_t BaseOffset = 0;
21022
21023 const MachinePointerInfo &PtrInfo = St.getPointerInfo();
21024 SDValue NewST1 =
21025 DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
21026 OrigAlignment, St.getMemOperand()->getFlags());
21027
21028 // As this is in ISel, we will not merge this add, which may degrade results.
21029 if (BasePtr->getOpcode() == ISD::ADD &&
21030 isa<ConstantSDNode>(BasePtr->getOperand(1))) {
21031 BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
21032 BasePtr = BasePtr->getOperand(0);
21033 }
21034
21035 unsigned Offset = EltOffset;
21036 while (--NumVecElts) {
21037 Align Alignment = commonAlignment(OrigAlignment, Offset);
21038 SDValue OffsetPtr =
21039 DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
21040 DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
21041 NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
21042 PtrInfo.getWithOffset(Offset), Alignment,
21043 St.getMemOperand()->getFlags());
21044 Offset += EltOffset;
21045 }
21046 return NewST1;
21047}
21048
21049// Returns an SVE type that ContentTy can be trivially sign or zero extended
21050// into.
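// For example, nxv4i8 and nxv4i16 both map to the nxv4i32 container type.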
21051static MVT getSVEContainerType(EVT ContentTy) {
21052 assert(ContentTy.isSimple() && "No SVE containers for extended types");
21053
21054 switch (ContentTy.getSimpleVT().SimpleTy) {
21055 default:
21056 llvm_unreachable("No known SVE container for this MVT type");
21057 case MVT::nxv2i8:
21058 case MVT::nxv2i16:
21059 case MVT::nxv2i32:
21060 case MVT::nxv2i64:
21061 case MVT::nxv2f32:
21062 case MVT::nxv2f64:
21063 return MVT::nxv2i64;
21064 case MVT::nxv4i8:
21065 case MVT::nxv4i16:
21066 case MVT::nxv4i32:
21067 case MVT::nxv4f32:
21068 return MVT::nxv4i32;
21069 case MVT::nxv8i8:
21070 case MVT::nxv8i16:
21071 case MVT::nxv8f16:
21072 case MVT::nxv8bf16:
21073 return MVT::nxv8i16;
21074 case MVT::nxv16i8:
21075 return MVT::nxv16i8;
21076 }
21077}
21078
21079static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
21080 SDLoc DL(N);
21081 EVT VT = N->getValueType(0);
21082
21084 return SDValue();
21085
21086 EVT ContainerVT = VT;
21087 if (ContainerVT.isInteger())
21088 ContainerVT = getSVEContainerType(ContainerVT);
21089
21090 SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
21091 SDValue Ops[] = { N->getOperand(0), // Chain
21092 N->getOperand(2), // Pg
21093 N->getOperand(3), // Base
21094 DAG.getValueType(VT) };
21095
21096 SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
21097 SDValue LoadChain = SDValue(Load.getNode(), 1);
21098
21099 if (ContainerVT.isInteger() && (VT != ContainerVT))
21100 Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
21101
21102 return DAG.getMergeValues({ Load, LoadChain }, DL);
21103}
21104
21106 SDLoc DL(N);
21107 EVT VT = N->getValueType(0);
21108 EVT PtrTy = N->getOperand(3).getValueType();
21109
21110 EVT LoadVT = VT;
21111 if (VT.isFloatingPoint())
21112 LoadVT = VT.changeTypeToInteger();
21113
21114 auto *MINode = cast<MemIntrinsicSDNode>(N);
21115 SDValue PassThru = DAG.getConstant(0, DL, LoadVT);
21116 SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
21117 MINode->getOperand(3), DAG.getUNDEF(PtrTy),
21118 MINode->getOperand(2), PassThru,
21119 MINode->getMemoryVT(), MINode->getMemOperand(),
21121
21122 if (VT.isFloatingPoint()) {
21123 SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
21124 return DAG.getMergeValues(Ops, DL);
21125 }
21126
21127 return L;
21128}
21129
21130template <unsigned Opcode>
21131 static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
21132 static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
21133 Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
21134 "Unsupported opcode.");
21135 SDLoc DL(N);
21136 EVT VT = N->getValueType(0);
21137
21138 EVT LoadVT = VT;
21139 if (VT.isFloatingPoint())
21140 LoadVT = VT.changeTypeToInteger();
21141
21142 SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
21143 SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
21144 SDValue LoadChain = SDValue(Load.getNode(), 1);
21145
21146 if (VT.isFloatingPoint())
21147 Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
21148
21149 return DAG.getMergeValues({Load, LoadChain}, DL);
21150}
21151
21153 SDLoc DL(N);
21154 SDValue Data = N->getOperand(2);
21155 EVT DataVT = Data.getValueType();
21156 EVT HwSrcVt = getSVEContainerType(DataVT);
21157 SDValue InputVT = DAG.getValueType(DataVT);
21158
21159 if (DataVT.isFloatingPoint())
21160 InputVT = DAG.getValueType(HwSrcVt);
21161
21162 SDValue SrcNew;
21163 if (Data.getValueType().isFloatingPoint())
21164 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
21165 else
21166 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);
21167
21168 SDValue Ops[] = { N->getOperand(0), // Chain
21169 SrcNew,
21170 N->getOperand(4), // Base
21171 N->getOperand(3), // Pg
21172 InputVT
21173 };
21174
21175 return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
21176}
21177
21179 SDLoc DL(N);
21180
21181 SDValue Data = N->getOperand(2);
21182 EVT DataVT = Data.getValueType();
21183 EVT PtrTy = N->getOperand(4).getValueType();
21184
21185 if (DataVT.isFloatingPoint())
21187
21188 auto *MINode = cast<MemIntrinsicSDNode>(N);
21189 return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
21190 DAG.getUNDEF(PtrTy), MINode->getOperand(3),
21191 MINode->getMemoryVT(), MINode->getMemOperand(),
21192 ISD::UNINDEXED, false, false);
21193}
21194
21195/// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The
21196/// load store optimizer pass will merge them to store pair stores. This should
21197/// be better than a movi to create the vector zero followed by a vector store
21198 /// if the zero constant is not re-used, since one instruction and one register
21199/// live range will be removed.
21200///
21201/// For example, the final generated code should be:
21202///
21203/// stp xzr, xzr, [x0]
21204///
21205/// instead of:
21206///
21207/// movi v0.2d, #0
21208/// str q0, [x0]
21209///
21210 static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
21211 SDValue StVal = St.getValue();
21212 EVT VT = StVal.getValueType();
21213
21214 // Avoid scalarizing zero splat stores for scalable vectors.
21215 if (VT.isScalableVector())
21216 return SDValue();
21217
21218 // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
21219 // 2, 3 or 4 i32 elements.
21220 int NumVecElts = VT.getVectorNumElements();
21221 if (!(((NumVecElts == 2 || NumVecElts == 3) &&
21222 VT.getVectorElementType().getSizeInBits() == 64) ||
21223 ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
21224 VT.getVectorElementType().getSizeInBits() == 32)))
21225 return SDValue();
21226
21227 if (StVal.getOpcode() != ISD::BUILD_VECTOR)
21228 return SDValue();
21229
21230 // If the zero constant has more than one use then the vector store could be
21231 // better since the constant mov will be amortized and stp q instructions
21232 // should be able to be formed.
21233 if (!StVal.hasOneUse())
21234 return SDValue();
21235
21236 // If the store is truncating then it's going down to i16 or smaller, which
21237 // means it can be implemented in a single store anyway.
21238 if (St.isTruncatingStore())
21239 return SDValue();
21240
21241 // If the immediate offset of the address operand is too large for the stp
21242 // instruction, then bail out.
21243 if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
21244 int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
21245 if (Offset < -512 || Offset > 504)
21246 return SDValue();
21247 }
21248
21249 for (int I = 0; I < NumVecElts; ++I) {
21250 SDValue EltVal = StVal.getOperand(I);
21251 if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
21252 return SDValue();
21253 }
21254
21255 // Use a CopyFromReg WZR/XZR here to prevent
21256 // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
21257 SDLoc DL(&St);
21258 unsigned ZeroReg;
21259 EVT ZeroVT;
21260 if (VT.getVectorElementType().getSizeInBits() == 32) {
21261 ZeroReg = AArch64::WZR;
21262 ZeroVT = MVT::i32;
21263 } else {
21264 ZeroReg = AArch64::XZR;
21265 ZeroVT = MVT::i64;
21266 }
21267 SDValue SplatVal =
21268 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
21269 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
21270}
21271
21272/// Replace a splat of a scalar to a vector store by scalar stores of the scalar
21273/// value. The load store optimizer pass will merge them to store pair stores.
21274/// This has better performance than a splat of the scalar followed by a split
21275 /// vector store. Even if the stores are not merged, it is four stores vs. a dup
21276 /// followed by an ext.b and two stores.
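/// For example (illustrative): storing a v4i32 splat of w1 to [x0] can become
/// stp w1, w1, [x0] and stp w1, w1, [x0, #8] instead of a dup plus str q.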
21277 static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
21278 SDValue StVal = St.getValue();
21279 EVT VT = StVal.getValueType();
21280
21281 // Don't replace floating point stores, they possibly won't be transformed to
21282 // stp because of the store pair suppress pass.
21283 if (VT.isFloatingPoint())
21284 return SDValue();
21285
21286 // We can express a splat as store pair(s) for 2 or 4 elements.
21287 unsigned NumVecElts = VT.getVectorNumElements();
21288 if (NumVecElts != 4 && NumVecElts != 2)
21289 return SDValue();
21290
21291 // If the store is truncating then it's going down to i16 or smaller, which
21292 // means it can be implemented in a single store anyway.
21293 if (St.isTruncatingStore())
21294 return SDValue();
21295
21296 // Check that this is a splat.
21297 // Make sure that each of the relevant vector element locations are inserted
21298 // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
21299 std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
21300 SDValue SplatVal;
21301 for (unsigned I = 0; I < NumVecElts; ++I) {
21302 // Check for insert vector elements.
21303 if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
21304 return SDValue();
21305
21306 // Check that same value is inserted at each vector element.
21307 if (I == 0)
21308 SplatVal = StVal.getOperand(1);
21309 else if (StVal.getOperand(1) != SplatVal)
21310 return SDValue();
21311
21312 // Check insert element index.
21313 ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
21314 if (!CIndex)
21315 return SDValue();
21316 uint64_t IndexVal = CIndex->getZExtValue();
21317 if (IndexVal >= NumVecElts)
21318 return SDValue();
21319 IndexNotInserted.reset(IndexVal);
21320
21321 StVal = StVal.getOperand(0);
21322 }
21323 // Check that all vector element locations were inserted to.
21324 if (IndexNotInserted.any())
21325 return SDValue();
21326
21327 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
21328}
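// Illustrative sketch (editorial addition): for a v4i32 splat of w0 built via
// insert_vector_elt, the combine above emits four scalar stores of w0 that the
// load/store optimizer is expected to pair, e.g.
//   stp w0, w0, [x8]
//   stp w0, w0, [x8, #8]
// instead of duplicating w0 into a vector register first.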
21329
21330static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
21331                           SelectionDAG &DAG,
21332 const AArch64Subtarget *Subtarget) {
21333
21334 StoreSDNode *S = cast<StoreSDNode>(N);
21335 if (S->isVolatile() || S->isIndexed())
21336 return SDValue();
21337
21338 SDValue StVal = S->getValue();
21339 EVT VT = StVal.getValueType();
21340
21341 if (!VT.isFixedLengthVector())
21342 return SDValue();
21343
21344 // If we get a splat of zeros, convert this vector store to a store of
21345 // scalars. They will be merged into store pairs of xzr thereby removing one
21346 // instruction and one register.
21347 if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
21348 return ReplacedZeroSplat;
21349
21350 // FIXME: The logic for deciding if an unaligned store should be split should
21351 // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
21352 // a call to that function here.
21353
21354 if (!Subtarget->isMisaligned128StoreSlow())
21355 return SDValue();
21356
21357 // Don't split at -Oz.
21358  if (DAG.getMachineFunction().getFunction().hasMinSize())
21359    return SDValue();
21360
21361 // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
21362 // those up regresses performance on micro-benchmarks and olden/bh.
21363 if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
21364 return SDValue();
21365
21366 // Split unaligned 16B stores. They are terrible for performance.
21367 // Don't split stores with alignment of 1 or 2. Code that uses clang vector
21368 // extensions can use this to mark that it does not want splitting to happen
21369 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
21370 // eliminating alignment hazards is only 1 in 8 for alignment of 2.
21371 if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) ||
21372 S->getAlign() <= Align(2))
21373 return SDValue();
21374
21375 // If we get a splat of a scalar convert this vector store to a store of
21376 // scalars. They will be merged into store pairs thereby removing two
21377 // instructions.
21378 if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
21379 return ReplacedSplat;
21380
21381 SDLoc DL(S);
21382
21383 // Split VT into two.
21384 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
21385 unsigned NumElts = HalfVT.getVectorNumElements();
21386 SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
21387 DAG.getConstant(0, DL, MVT::i64));
21388 SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
21389 DAG.getConstant(NumElts, DL, MVT::i64));
21390 SDValue BasePtr = S->getBasePtr();
21391 SDValue NewST1 =
21392 DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
21393 S->getAlign(), S->getMemOperand()->getFlags());
21394 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
21395 DAG.getConstant(8, DL, MVT::i64));
21396 return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
21397 S->getPointerInfo(), S->getAlign(),
21398 S->getMemOperand()->getFlags());
21399}
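// Note (editorial addition): the split above turns one slow misaligned 128-bit
// store into two 64-bit stores at [base] and [base + 8], which is what the
// OffsetPtr computation with the constant 8 encodes.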
21400
21401static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) {
21402  assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexpected Opcode!");
21403
21404 // splice(pg, op1, undef) -> op1
21405 if (N->getOperand(2).isUndef())
21406 return N->getOperand(1);
21407
21408 return SDValue();
21409}
21410
21411static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG,
21412                                    const AArch64Subtarget *Subtarget) {
21413 assert((N->getOpcode() == AArch64ISD::UUNPKHI ||
21414 N->getOpcode() == AArch64ISD::UUNPKLO) &&
21415 "Unexpected Opcode!");
21416
21417 // uunpklo/hi undef -> undef
21418 if (N->getOperand(0).isUndef())
21419 return DAG.getUNDEF(N->getValueType(0));
21420
21421 // If this is a masked load followed by an UUNPKLO, fold this into a masked
21422 // extending load. We can do this even if this is already a masked
21423 // {z,}extload.
21424 if (N->getOperand(0).getOpcode() == ISD::MLOAD &&
21425 N->getOpcode() == AArch64ISD::UUNPKLO) {
21426 MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N->getOperand(0));
21427 SDValue Mask = MLD->getMask();
21428 SDLoc DL(N);
21429
21430 if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD &&
21431 SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE &&
21432 (MLD->getPassThru()->isUndef() ||
21433 isZerosVector(MLD->getPassThru().getNode()))) {
21434 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
21435 unsigned PgPattern = Mask->getConstantOperandVal(0);
21436 EVT VT = N->getValueType(0);
21437
21438 // Ensure we can double the size of the predicate pattern
21439 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
21440 if (NumElts &&
21441 NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) {
21442 Mask =
21443 getPTrue(DAG, DL, VT.changeVectorElementType(MVT::i1), PgPattern);
21444 SDValue PassThru = DAG.getConstant(0, DL, VT);
21445 SDValue NewLoad = DAG.getMaskedLoad(
21446 VT, DL, MLD->getChain(), MLD->getBasePtr(), MLD->getOffset(), Mask,
21447 PassThru, MLD->getMemoryVT(), MLD->getMemOperand(),
21448            MLD->getAddressingMode(), ISD::ZEXTLOAD);
21449
21450 DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), NewLoad.getValue(1));
21451
21452 return NewLoad;
21453 }
21454 }
21455 }
21456
21457 return SDValue();
21458}
21459
21460static bool isHalvingTruncateAndConcatOfLegalIntScalableType(SDNode *N) {
21461  if (N->getOpcode() != AArch64ISD::UZP1)
21462 return false;
21463 SDValue Op0 = N->getOperand(0);
21464 EVT SrcVT = Op0->getValueType(0);
21465 EVT DstVT = N->getValueType(0);
21466 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv16i8) ||
21467 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv8i16) ||
21468 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv4i32);
21469}
21470
21471// Try to combine rounding shifts where the operands come from an extend, and
21472// the result is truncated and combined into one vector.
21473// uzp1(rshrnb(uunpklo(X),C), rshrnb(uunpkhi(X), C)) -> urshr(X, C)
21474static SDValue tryCombineExtendRShTrunc(SDNode *N, SelectionDAG &DAG) {
21475  assert(N->getOpcode() == AArch64ISD::UZP1 && "Only UZP1 expected.");
21476 SDValue Op0 = N->getOperand(0);
21477 SDValue Op1 = N->getOperand(1);
21478 EVT ResVT = N->getValueType(0);
21479
21480 unsigned RshOpc = Op0.getOpcode();
21481 if (RshOpc != AArch64ISD::RSHRNB_I)
21482 return SDValue();
21483
21484 // Same op code and imm value?
21485 SDValue ShiftValue = Op0.getOperand(1);
21486 if (RshOpc != Op1.getOpcode() || ShiftValue != Op1.getOperand(1))
21487 return SDValue();
21488
21489 // Same unextended operand value?
21490 SDValue Lo = Op0.getOperand(0);
21491 SDValue Hi = Op1.getOperand(0);
21492 if (Lo.getOpcode() != AArch64ISD::UUNPKLO &&
21493 Hi.getOpcode() != AArch64ISD::UUNPKHI)
21494 return SDValue();
21495 SDValue OrigArg = Lo.getOperand(0);
21496 if (OrigArg != Hi.getOperand(0))
21497 return SDValue();
21498
21499 SDLoc DL(N);
21500 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, ResVT,
21501 getPredicateForVector(DAG, DL, ResVT), OrigArg,
21502 ShiftValue);
21503}
21504
21505// Try to simplify:
21506// t1 = nxv8i16 add(X, 1 << (ShiftValue - 1))
21507// t2 = nxv8i16 srl(t1, ShiftValue)
21508// to
21509// t1 = nxv8i16 rshrnb(X, shiftvalue).
21510// rshrnb will zero the top half bits of each element. Therefore, this combine
21511// should only be performed when a following instruction with the rshrnb
21512// as an operand does not care about the top half of each element. For example,
21513// a uzp1 or a truncating store.
21514static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG,
21515                                         const AArch64Subtarget *Subtarget) {
21516 EVT VT = Srl->getValueType(0);
21517 if (!VT.isScalableVector() || !Subtarget->hasSVE2())
21518 return SDValue();
21519
21520 EVT ResVT;
21521 if (VT == MVT::nxv8i16)
21522 ResVT = MVT::nxv16i8;
21523 else if (VT == MVT::nxv4i32)
21524 ResVT = MVT::nxv8i16;
21525 else if (VT == MVT::nxv2i64)
21526 ResVT = MVT::nxv4i32;
21527 else
21528 return SDValue();
21529
21530 SDLoc DL(Srl);
21531 unsigned ShiftValue;
21532 SDValue RShOperand;
21533 if (!canLowerSRLToRoundingShiftForVT(Srl, ResVT, DAG, ShiftValue, RShOperand))
21534 return SDValue();
21535 SDValue Rshrnb = DAG.getNode(
21536 AArch64ISD::RSHRNB_I, DL, ResVT,
21537 {RShOperand, DAG.getTargetConstant(ShiftValue, DL, MVT::i32)});
21538 return DAG.getNode(ISD::BITCAST, DL, VT, Rshrnb);
21539}
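// Illustrative sketch (editorial addition): with ShiftValue == 4 the pattern is
//   t1 = nxv8i16 add(X, 8)        ; 8 == 1 << (4 - 1)
//   t2 = nxv8i16 srl(t1, 4)
// which becomes RSHRNB_I(X, 4) producing nxv16i8, bitcast back to nxv8i16,
// assuming canLowerSRLToRoundingShiftForVT has matched the rounding-add
// constant as described in the comment above.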
21540
21541static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
21542                                 const AArch64Subtarget *Subtarget) {
21543 SDLoc DL(N);
21544 SDValue Op0 = N->getOperand(0);
21545 SDValue Op1 = N->getOperand(1);
21546 EVT ResVT = N->getValueType(0);
21547
21548 // uzp(extract_lo(x), extract_hi(x)) -> extract_lo(uzp x, x)
21549 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
21550      Op1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
21551      Op0.getOperand(0) == Op1.getOperand(0)) {
21552
21553 SDValue SourceVec = Op0.getOperand(0);
21554 uint64_t ExtIdx0 = Op0.getConstantOperandVal(1);
21555 uint64_t ExtIdx1 = Op1.getConstantOperandVal(1);
21556 uint64_t NumElements = SourceVec.getValueType().getVectorMinNumElements();
21557 if (ExtIdx0 == 0 && ExtIdx1 == NumElements / 2) {
21558 EVT OpVT = Op0.getOperand(1).getValueType();
21559 EVT WidenedResVT = ResVT.getDoubleNumVectorElementsVT(*DAG.getContext());
21560 SDValue Uzp = DAG.getNode(N->getOpcode(), DL, WidenedResVT, SourceVec,
21561 DAG.getUNDEF(WidenedResVT));
21562 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Uzp,
21563 DAG.getConstant(0, DL, OpVT));
21564 }
21565 }
21566
21567 // Following optimizations only work with uzp1.
21568 if (N->getOpcode() == AArch64ISD::UZP2)
21569 return SDValue();
21570
21571 // uzp1(x, undef) -> concat(truncate(x), undef)
21572 if (Op1.getOpcode() == ISD::UNDEF) {
21573 EVT BCVT = MVT::Other, HalfVT = MVT::Other;
21574 switch (ResVT.getSimpleVT().SimpleTy) {
21575 default:
21576 break;
21577 case MVT::v16i8:
21578 BCVT = MVT::v8i16;
21579 HalfVT = MVT::v8i8;
21580 break;
21581 case MVT::v8i16:
21582 BCVT = MVT::v4i32;
21583 HalfVT = MVT::v4i16;
21584 break;
21585 case MVT::v4i32:
21586 BCVT = MVT::v2i64;
21587 HalfVT = MVT::v2i32;
21588 break;
21589 }
21590 if (BCVT != MVT::Other) {
21591 SDValue BC = DAG.getBitcast(BCVT, Op0);
21592 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
21593 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
21594 DAG.getUNDEF(HalfVT));
21595 }
21596 }
21597
21598 if (SDValue Urshr = tryCombineExtendRShTrunc(N, DAG))
21599 return Urshr;
21600
21601 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Op0, DAG, Subtarget))
21602 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Rshrnb, Op1);
21603
21604 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Op1, DAG, Subtarget))
21605 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Rshrnb);
21606
21607 // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z)
21608 if (Op0.getOpcode() == AArch64ISD::UUNPKLO) {
21609 if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
21610 SDValue X = Op0.getOperand(0).getOperand(0);
21611 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1);
21612 }
21613 }
21614
21615 // uzp1(x, unpkhi(uzp1(y, z))) => uzp1(x, z)
21616 if (Op1.getOpcode() == AArch64ISD::UUNPKHI) {
21617 if (Op1.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
21618 SDValue Z = Op1.getOperand(0).getOperand(1);
21619 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z);
21620 }
21621 }
21622
21623 // These optimizations only work on little endian.
21624 if (!DAG.getDataLayout().isLittleEndian())
21625 return SDValue();
21626
21627 // uzp1(bitcast(x), bitcast(y)) -> uzp1(x, y)
21628 // Example:
21629 // nxv4i32 = uzp1 bitcast(nxv4i32 x to nxv2i64), bitcast(nxv4i32 y to nxv2i64)
21630 // to
21631 // nxv4i32 = uzp1 nxv4i32 x, nxv4i32 y
21632  if (isHalvingTruncateAndConcatOfLegalIntScalableType(N) &&
21633      Op0.getOpcode() == ISD::BITCAST && Op1.getOpcode() == ISD::BITCAST) {
21634 if (Op0.getOperand(0).getValueType() == Op1.getOperand(0).getValueType()) {
21635 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0.getOperand(0),
21636 Op1.getOperand(0));
21637 }
21638 }
21639
21640 if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
21641 return SDValue();
21642
21643 SDValue SourceOp0 = peekThroughBitcasts(Op0);
21644 SDValue SourceOp1 = peekThroughBitcasts(Op1);
21645
21646 // truncating uzp1(x, y) -> xtn(concat (x, y))
21647 if (SourceOp0.getValueType() == SourceOp1.getValueType()) {
21648 EVT Op0Ty = SourceOp0.getValueType();
21649 if ((ResVT == MVT::v4i16 && Op0Ty == MVT::v2i32) ||
21650 (ResVT == MVT::v8i8 && Op0Ty == MVT::v4i16)) {
21651 SDValue Concat =
21652          DAG.getNode(ISD::CONCAT_VECTORS, DL,
21653                      Op0Ty.getDoubleNumVectorElementsVT(*DAG.getContext()),
21654                      SourceOp0, SourceOp1);
21655 return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Concat);
21656 }
21657 }
21658
21659 // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
21660 if (SourceOp0.getOpcode() != ISD::TRUNCATE ||
21661 SourceOp1.getOpcode() != ISD::TRUNCATE)
21662 return SDValue();
21663 SourceOp0 = SourceOp0.getOperand(0);
21664 SourceOp1 = SourceOp1.getOperand(0);
21665
21666 if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
21667 !SourceOp0.getValueType().isSimple())
21668 return SDValue();
21669
21670 EVT ResultTy;
21671
21672 switch (SourceOp0.getSimpleValueType().SimpleTy) {
21673 case MVT::v2i64:
21674 ResultTy = MVT::v4i32;
21675 break;
21676 case MVT::v4i32:
21677 ResultTy = MVT::v8i16;
21678 break;
21679 case MVT::v8i16:
21680 ResultTy = MVT::v16i8;
21681 break;
21682 default:
21683 return SDValue();
21684 }
21685
21686 SDValue UzpOp0 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp0);
21687 SDValue UzpOp1 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp1);
21688 SDValue UzpResult =
21689 DAG.getNode(AArch64ISD::UZP1, DL, UzpOp0.getValueType(), UzpOp0, UzpOp1);
21690
21691 EVT BitcastResultTy;
21692
21693 switch (ResVT.getSimpleVT().SimpleTy) {
21694 case MVT::v2i32:
21695 BitcastResultTy = MVT::v2i64;
21696 break;
21697 case MVT::v4i16:
21698 BitcastResultTy = MVT::v4i32;
21699 break;
21700 case MVT::v8i8:
21701 BitcastResultTy = MVT::v8i16;
21702 break;
21703 default:
21704 llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}");
21705 }
21706
21707 return DAG.getNode(ISD::TRUNCATE, DL, ResVT,
21708 DAG.getNode(ISD::BITCAST, DL, BitcastResultTy, UzpResult));
21709}
21710
21711static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
21712  unsigned Opc = N->getOpcode();
21713
21714  assert(((Opc >= AArch64ISD::GLD1_MERGE_ZERO && // unsigned gather loads
21715           Opc <= AArch64ISD::GLD1_IMM_MERGE_ZERO) ||
21716          (Opc >= AArch64ISD::GLD1S_MERGE_ZERO && // signed gather loads
21717           Opc <= AArch64ISD::GLD1S_IMM_MERGE_ZERO)) &&
21718         "Invalid opcode.");
21719
21720  const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
21721                      Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
21722  const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
21723                      Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
21724  const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
21725                        Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO ||
21726                        Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO ||
21727                        Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO;
21728
21729 SDLoc DL(N);
21730 SDValue Chain = N->getOperand(0);
21731 SDValue Pg = N->getOperand(1);
21732 SDValue Base = N->getOperand(2);
21733 SDValue Offset = N->getOperand(3);
21734 SDValue Ty = N->getOperand(4);
21735
21736 EVT ResVT = N->getValueType(0);
21737
21738 const auto OffsetOpc = Offset.getOpcode();
21739  const bool OffsetIsZExt =
21740      OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU;
21741  const bool OffsetIsSExt =
21742      OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU;
21743
21744 // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
21745 if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
21746 SDValue ExtPg = Offset.getOperand(0);
21747 VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode());
21748 EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
21749
21750 // If the predicate for the sign- or zero-extended offset is the
21751 // same as the predicate used for this load and the sign-/zero-extension
21752 // was from a 32-bits...
21753 if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
21754 SDValue UnextendedOffset = Offset.getOperand(1);
21755
21756 unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true);
21757 if (Signed)
21758 NewOpc = getSignExtendedGatherOpcode(NewOpc);
21759
21760 return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
21761 {Chain, Pg, Base, UnextendedOffset, Ty});
21762 }
21763 }
21764
21765 return SDValue();
21766}
21767
21768/// Optimize a vector shift instruction and its operand if shifted out
21769/// bits are not used.
21770static SDValue performVectorShiftCombine(SDNode *N,
21771                                         const AArch64TargetLowering &TLI,
21772                                         TargetLowering::DAGCombinerInfo &DCI) {
21773 assert(N->getOpcode() == AArch64ISD::VASHR ||
21774 N->getOpcode() == AArch64ISD::VLSHR);
21775
21776 SDValue Op = N->getOperand(0);
21777 unsigned OpScalarSize = Op.getScalarValueSizeInBits();
21778
21779 unsigned ShiftImm = N->getConstantOperandVal(1);
21780 assert(OpScalarSize > ShiftImm && "Invalid shift imm");
21781
21782 // Remove sign_extend_inreg (ashr(shl(x)) based on the number of sign bits.
21783 if (N->getOpcode() == AArch64ISD::VASHR &&
21784 Op.getOpcode() == AArch64ISD::VSHL &&
21785 N->getOperand(1) == Op.getOperand(1))
21786 if (DCI.DAG.ComputeNumSignBits(Op.getOperand(0)) > ShiftImm)
21787 return Op.getOperand(0);
21788
21789 APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm);
21790 APInt DemandedMask = ~ShiftedOutBits;
21791
21792 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
21793 return SDValue(N, 0);
21794
21795 return SDValue();
21796}
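// Note (editorial addition): VASHR(VSHL(x, n), n) is the vector form of
// sign_extend_inreg; when ComputeNumSignBits shows x already has more than n
// sign bits the pair is dropped above, and otherwise only the bits that
// survive the right shift are reported as demanded of the operand.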
21797
21798static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG) {
21799  // sunpklo(sext(pred)) -> sext(extract_low_half(pred))
21800 // This transform works in partnership with performSetCCPunpkCombine to
21801 // remove unnecessary transfer of predicates into standard registers and back
21802 if (N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND &&
21803 N->getOperand(0)->getOperand(0)->getValueType(0).getScalarType() ==
21804 MVT::i1) {
21805 SDValue CC = N->getOperand(0)->getOperand(0);
21806 auto VT = CC->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
21807 SDValue Unpk = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, CC,
21808 DAG.getVectorIdxConstant(0, SDLoc(N)));
21809 return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), N->getValueType(0), Unpk);
21810 }
21811
21812 return SDValue();
21813}
21814
21815/// Target-specific DAG combine function for post-increment LD1 (lane) and
21816/// post-increment LD1R.
21817static SDValue performPostLD1Combine(SDNode *N,
21818                                     TargetLowering::DAGCombinerInfo &DCI,
21819                                     bool IsLaneOp) {
21820 if (DCI.isBeforeLegalizeOps())
21821 return SDValue();
21822
21823 SelectionDAG &DAG = DCI.DAG;
21824 EVT VT = N->getValueType(0);
21825
21826 if (!VT.is128BitVector() && !VT.is64BitVector())
21827 return SDValue();
21828
21829 unsigned LoadIdx = IsLaneOp ? 1 : 0;
21830 SDNode *LD = N->getOperand(LoadIdx).getNode();
21831 // If it is not LOAD, can not do such combine.
21832 if (LD->getOpcode() != ISD::LOAD)
21833 return SDValue();
21834
21835 // The vector lane must be a constant in the LD1LANE opcode.
21836 SDValue Lane;
21837 if (IsLaneOp) {
21838 Lane = N->getOperand(2);
21839 auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
21840 if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
21841 return SDValue();
21842 }
21843
21844 LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
21845 EVT MemVT = LoadSDN->getMemoryVT();
21846 // Check if memory operand is the same type as the vector element.
21847 if (MemVT != VT.getVectorElementType())
21848 return SDValue();
21849
21850 // Check if there are other uses. If so, do not combine as it will introduce
21851 // an extra load.
21852 for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
21853 ++UI) {
21854 if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
21855 continue;
21856 if (*UI != N)
21857 return SDValue();
21858 }
21859
21860 // If there is one use and it can splat the value, prefer that operation.
21861 // TODO: This could be expanded to more operations if they reliably use the
21862 // index variants.
21863 if (N->hasOneUse()) {
21864 unsigned UseOpc = N->use_begin()->getOpcode();
21865 if (UseOpc == ISD::FMUL || UseOpc == ISD::FMA)
21866 return SDValue();
21867 }
21868
21869 SDValue Addr = LD->getOperand(1);
21870 SDValue Vector = N->getOperand(0);
21871 // Search for a use of the address operand that is an increment.
21872 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
21873 Addr.getNode()->use_end(); UI != UE; ++UI) {
21874 SDNode *User = *UI;
21875 if (User->getOpcode() != ISD::ADD
21876 || UI.getUse().getResNo() != Addr.getResNo())
21877 continue;
21878
21879 // If the increment is a constant, it must match the memory ref size.
21880 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
21881 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
21882 uint32_t IncVal = CInc->getZExtValue();
21883 unsigned NumBytes = VT.getScalarSizeInBits() / 8;
21884 if (IncVal != NumBytes)
21885 continue;
21886 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
21887 }
21888
21889 // To avoid cycle construction make sure that neither the load nor the add
21890 // are predecessors to each other or the Vector.
21891    SmallPtrSet<const SDNode *, 32> Visited;
21892    SmallVector<const SDNode *, 16> Worklist;
21893    Visited.insert(Addr.getNode());
21894 Worklist.push_back(User);
21895 Worklist.push_back(LD);
21896 Worklist.push_back(Vector.getNode());
21897 if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
21898 SDNode::hasPredecessorHelper(User, Visited, Worklist))
21899 continue;
21900
21901    SmallVector<SDValue, 8> Ops;
21902    Ops.push_back(LD->getOperand(0)); // Chain
21903 if (IsLaneOp) {
21904 Ops.push_back(Vector); // The vector to be inserted
21905 Ops.push_back(Lane); // The lane to be inserted in the vector
21906 }
21907 Ops.push_back(Addr);
21908 Ops.push_back(Inc);
21909
21910 EVT Tys[3] = { VT, MVT::i64, MVT::Other };
21911 SDVTList SDTys = DAG.getVTList(Tys);
21912 unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
21913 SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
21914 MemVT,
21915 LoadSDN->getMemOperand());
21916
21917 // Update the uses.
21918 SDValue NewResults[] = {
21919 SDValue(LD, 0), // The result of load
21920 SDValue(UpdN.getNode(), 2) // Chain
21921 };
21922 DCI.CombineTo(LD, NewResults);
21923 DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
21924 DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
21925
21926 break;
21927 }
21928 return SDValue();
21929}
21930
21931/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
21932/// address translation.
21933static bool performTBISimplification(SDValue Addr,
21934                                     TargetLowering::DAGCombinerInfo &DCI,
21935                                     SelectionDAG &DAG) {
21936 APInt DemandedMask = APInt::getLowBitsSet(64, 56);
21937 KnownBits Known;
21938  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
21939                                        !DCI.isBeforeLegalizeOps());
21940 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21941 if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
21942 DCI.CommitTargetLoweringOpt(TLO);
21943 return true;
21944 }
21945 return false;
21946}
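// Note (editorial addition): with Top Byte Ignore, bits 63..56 of an address do
// not participate in translation, which is why only the low 56 bits are
// demanded above; for instance, an AND that merely clears a tag in the top
// byte of the address can be simplified away.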
21947
21948static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
21949  assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
21950 "Expected STORE dag node in input!");
21951
21952 if (auto Store = dyn_cast<StoreSDNode>(N)) {
21953 if (!Store->isTruncatingStore() || Store->isIndexed())
21954 return SDValue();
21955 SDValue Ext = Store->getValue();
21956 auto ExtOpCode = Ext.getOpcode();
21957 if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
21958 ExtOpCode != ISD::ANY_EXTEND)
21959 return SDValue();
21960 SDValue Orig = Ext->getOperand(0);
21961 if (Store->getMemoryVT() != Orig.getValueType())
21962 return SDValue();
21963 return DAG.getStore(Store->getChain(), SDLoc(Store), Orig,
21964 Store->getBasePtr(), Store->getMemOperand());
21965 }
21966
21967 return SDValue();
21968}
21969
21970// A custom combine to lower load <3 x i8> as the more efficient sequence
21971// below:
21972// ldrb wX, [x0, #2]
21973// ldrh wY, [x0]
21974// orr wX, wY, wX, lsl #16
21975// fmov s0, wX
21976//
21977// Note that an alternative sequence with even fewer (although usually more
21978// complex/expensive) instructions would be:
21979// ld1r.4h { v0 }, [x0], #2
21980// ld1.b { v0 }[2], [x0]
21981//
21982// Generating this sequence unfortunately results in noticeably worse codegen
21983// for code that extends the loaded v3i8, due to legalization breaking vector
21984// shuffle detection in a way that is very difficult to work around.
21985// TODO: Revisit once v3i8 legalization has been improved in general.
21986static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
21987  EVT MemVT = LD->getMemoryVT();
21988 if (MemVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3) ||
21989 LD->getOriginalAlign() >= 4)
21990 return SDValue();
21991
21992 SDLoc DL(LD);
21993  MachineFunction &MF = DAG.getMachineFunction();
21994  SDValue Chain = LD->getChain();
21995 SDValue BasePtr = LD->getBasePtr();
21996 MachineMemOperand *MMO = LD->getMemOperand();
21997 assert(LD->getOffset().isUndef() && "undef offset expected");
21998
21999 // Load 2 x i8, then 1 x i8.
22000 SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, MMO);
22001 TypeSize Offset2 = TypeSize::getFixed(2);
22002 SDValue L8 = DAG.getLoad(MVT::i8, DL, Chain,
22003 DAG.getMemBasePlusOffset(BasePtr, Offset2, DL),
22004 MF.getMachineMemOperand(MMO, 2, 1));
22005
22006 // Extend to i32.
22007 SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16);
22008 SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
22009
22010 // Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
22011 SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8,
22012 DAG.getConstant(16, DL, MVT::i32));
22013 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shl);
22014 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or);
22015
22016 // Extract v3i8 again.
22017 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast,
22018 DAG.getConstant(0, DL, MVT::i64));
22019 SDValue TokenFactor = DAG.getNode(
22020 ISD::TokenFactor, DL, MVT::Other,
22021 {SDValue(cast<SDNode>(L16), 1), SDValue(cast<SDNode>(L8), 1)});
22022 return DAG.getMergeValues({Extract, TokenFactor}, DL);
22023}
22024
22025// Perform TBI simplification if supported by the target, and try to break up
22026// non-temporal loads larger than 256 bits for odd types so that 256-bit LDNP Q
22027// load instructions can be selected.
22028static SDValue performLOADCombine(SDNode *N,
22029                                  TargetLowering::DAGCombinerInfo &DCI,
22030                                  SelectionDAG &DAG,
22031                                  const AArch64Subtarget *Subtarget) {
22032 if (Subtarget->supportsAddressTopByteIgnored())
22033 performTBISimplification(N->getOperand(1), DCI, DAG);
22034
22035 LoadSDNode *LD = cast<LoadSDNode>(N);
22036 if (LD->isVolatile() || !Subtarget->isLittleEndian())
22037 return SDValue(N, 0);
22038
22039 if (SDValue Res = combineV3I8LoadExt(LD, DAG))
22040 return Res;
22041
22042 if (!LD->isNonTemporal())
22043 return SDValue(N, 0);
22044
22045 EVT MemVT = LD->getMemoryVT();
22046 if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
22047 MemVT.getSizeInBits() % 256 == 0 ||
22048 256 % MemVT.getScalarSizeInBits() != 0)
22049 return SDValue(N, 0);
22050
22051 SDLoc DL(LD);
22052 SDValue Chain = LD->getChain();
22053 SDValue BasePtr = LD->getBasePtr();
22054 SDNodeFlags Flags = LD->getFlags();
22055  SmallVector<SDValue, 4> LoadOps;
22056  SmallVector<SDValue, 4> LoadOpsChain;
22057 // Replace any non temporal load over 256-bit with a series of 256 bit loads
22058 // and a scalar/vector load less than 256. This way we can utilize 256-bit
22059 // loads and reduce the amount of load instructions generated.
22060 MVT NewVT =
22061      MVT::getVectorVT(MemVT.getVectorElementType().getSimpleVT(),
22062                       256 / MemVT.getVectorElementType().getSizeInBits());
22063 unsigned Num256Loads = MemVT.getSizeInBits() / 256;
22064  // Create all 256-bit loads starting from offset 0 up to (Num256Loads - 1) * 32.
22065 for (unsigned I = 0; I < Num256Loads; I++) {
22066 unsigned PtrOffset = I * 32;
22067 SDValue NewPtr = DAG.getMemBasePlusOffset(
22068 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
22069 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
22070 SDValue NewLoad = DAG.getLoad(
22071 NewVT, DL, Chain, NewPtr, LD->getPointerInfo().getWithOffset(PtrOffset),
22072 NewAlign, LD->getMemOperand()->getFlags(), LD->getAAInfo());
22073 LoadOps.push_back(NewLoad);
22074 LoadOpsChain.push_back(SDValue(cast<SDNode>(NewLoad), 1));
22075 }
22076
22077 // Process remaining bits of the load operation.
22078 // This is done by creating an UNDEF vector to match the size of the
22079  // 256-bit loads and inserting the remaining load into it. We extract the
22080  // original load type at the end using an EXTRACT_SUBVECTOR instruction.
22081 unsigned BitsRemaining = MemVT.getSizeInBits() % 256;
22082 unsigned PtrOffset = (MemVT.getSizeInBits() - BitsRemaining) / 8;
22083 MVT RemainingVT = MVT::getVectorVT(
22084      MemVT.getVectorElementType().getSimpleVT(),
22085      BitsRemaining / MemVT.getVectorElementType().getSizeInBits());
22086 SDValue NewPtr = DAG.getMemBasePlusOffset(
22087 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
22088 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
22089 SDValue RemainingLoad =
22090 DAG.getLoad(RemainingVT, DL, Chain, NewPtr,
22091 LD->getPointerInfo().getWithOffset(PtrOffset), NewAlign,
22092 LD->getMemOperand()->getFlags(), LD->getAAInfo());
22093 SDValue UndefVector = DAG.getUNDEF(NewVT);
22094 SDValue InsertIdx = DAG.getVectorIdxConstant(0, DL);
22095  SDValue ExtendedRemainingLoad =
22096      DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT,
22097                  {UndefVector, RemainingLoad, InsertIdx});
22098  LoadOps.push_back(ExtendedRemainingLoad);
22099 LoadOpsChain.push_back(SDValue(cast<SDNode>(RemainingLoad), 1));
22100 EVT ConcatVT =
22101      EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
22102                       LoadOps.size() * NewVT.getVectorNumElements());
22103 SDValue ConcatVectors =
22104 DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, LoadOps);
22105 // Extract the original vector type size.
22106 SDValue ExtractSubVector =
22107 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT,
22108 {ConcatVectors, DAG.getVectorIdxConstant(0, DL)});
22109 SDValue TokenFactor =
22110 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LoadOpsChain);
22111 return DAG.getMergeValues({ExtractSubVector, TokenFactor}, DL);
22112}
22113
22114static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth = 0) {
22115  EVT VecVT = Op.getValueType();
22116 assert(VecVT.isVector() && VecVT.getVectorElementType() == MVT::i1 &&
22117 "Need boolean vector type.");
22118
22119 if (Depth > 3)
22120    return EVT();
22121
22122 // We can get the base type from a vector compare or truncate.
22123 if (Op.getOpcode() == ISD::SETCC || Op.getOpcode() == ISD::TRUNCATE)
22124 return Op.getOperand(0).getValueType();
22125
22126 // If an operand is a bool vector, continue looking.
22127  EVT BaseVT;
22128  for (SDValue Operand : Op->op_values()) {
22129 if (Operand.getValueType() != VecVT)
22130 continue;
22131
22132 EVT OperandVT = tryGetOriginalBoolVectorType(Operand, Depth + 1);
22133 if (!BaseVT.isSimple())
22134 BaseVT = OperandVT;
22135 else if (OperandVT != BaseVT)
22136      return EVT();
22137  }
22138
22139 return BaseVT;
22140}
22141
22142// When converting a <N x iX> vector to <N x i1> to store or use as a scalar
22143// iN, we can use a trick that extracts the i^th bit from the i^th element and
22144// then performs a vector add to get a scalar bitmask. This requires that each
22145// element's bits are either all 1 or all 0.
22146static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
22147  SDLoc DL(N);
22148 SDValue ComparisonResult(N, 0);
22149 EVT VecVT = ComparisonResult.getValueType();
22150 assert(VecVT.isVector() && "Must be a vector type");
22151
22152 unsigned NumElts = VecVT.getVectorNumElements();
22153 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
22154 return SDValue();
22155
22156 if (VecVT.getVectorElementType() != MVT::i1 &&
22157 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT))
22158 return SDValue();
22159
22160 // If we can find the original types to work on instead of a vector of i1,
22161 // we can avoid extend/extract conversion instructions.
22162 if (VecVT.getVectorElementType() == MVT::i1) {
22163 VecVT = tryGetOriginalBoolVectorType(ComparisonResult);
22164 if (!VecVT.isSimple()) {
22165 unsigned BitsPerElement = std::max(64 / NumElts, 8u); // >= 64-bit vector
22166 VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement), NumElts);
22167 }
22168 }
22169 VecVT = VecVT.changeVectorElementTypeToInteger();
22170
22171 // Large vectors don't map directly to this conversion, so to avoid too many
22172 // edge cases, we don't apply it here. The conversion will likely still be
22173 // applied later via multiple smaller vectors, whose results are concatenated.
22174 if (VecVT.getSizeInBits() > 128)
22175 return SDValue();
22176
22177 // Ensure that all elements' bits are either 0s or 1s.
22178 ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT);
22179
22180 SmallVector<SDValue, 16> MaskConstants;
22181 if (VecVT == MVT::v16i8) {
22182 // v16i8 is a special case, as we have 16 entries but only 8 positional bits
22183 // per entry. We split it into two halves, apply the mask, zip the halves to
22184  // create 8x 16-bit values, and then perform the vector reduce.
22185 for (unsigned Half = 0; Half < 2; ++Half) {
22186 for (unsigned MaskBit = 1; MaskBit <= 128; MaskBit *= 2) {
22187 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i32));
22188 }
22189 }
22190 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
22191 SDValue RepresentativeBits =
22192 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
22193
22194 SDValue UpperRepresentativeBits =
22195 DAG.getNode(AArch64ISD::EXT, DL, VecVT, RepresentativeBits,
22196 RepresentativeBits, DAG.getConstant(8, DL, MVT::i32));
22197 SDValue Zipped = DAG.getNode(AArch64ISD::ZIP1, DL, VecVT,
22198 RepresentativeBits, UpperRepresentativeBits);
22199 Zipped = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Zipped);
22200 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i16, Zipped);
22201 }
22202
22203 // All other vector sizes.
22204 unsigned MaxBitMask = 1u << (VecVT.getVectorNumElements() - 1);
22205 for (unsigned MaskBit = 1; MaskBit <= MaxBitMask; MaskBit *= 2) {
22206 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i64));
22207 }
22208
22209 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
22210 SDValue RepresentativeBits =
22211 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
22212 EVT ResultVT = MVT::getIntegerVT(std::max<unsigned>(
22213 NumElts, VecVT.getVectorElementType().getSizeInBits()));
22214 return DAG.getNode(ISD::VECREDUCE_ADD, DL, ResultVT, RepresentativeBits);
22215}
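// Worked example (editorial addition): for a v4i32 comparison result
// <-1, 0, -1, -1> the mask built above is <1, 2, 4, 8>; the AND yields
// <1, 0, 4, 8> and VECREDUCE_ADD produces 13 (0b1101), i.e. bit i of the
// scalar is set exactly when lane i of the comparison was true.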
22216
22217static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
22218                                                 StoreSDNode *Store) {
22219 if (!Store->isTruncatingStore())
22220 return SDValue();
22221
22222 SDLoc DL(Store);
22223 SDValue VecOp = Store->getValue();
22224 EVT VT = VecOp.getValueType();
22225 EVT MemVT = Store->getMemoryVT();
22226
22227 if (!MemVT.isVector() || !VT.isVector() ||
22228 MemVT.getVectorElementType() != MVT::i1)
22229 return SDValue();
22230
22231 // If we are storing a vector that we are currently building, let
22232 // `scalarizeVectorStore()` handle this more efficiently.
22233 if (VecOp.getOpcode() == ISD::BUILD_VECTOR)
22234 return SDValue();
22235
22236 VecOp = DAG.getNode(ISD::TRUNCATE, DL, MemVT, VecOp);
22237 SDValue VectorBits = vectorToScalarBitmask(VecOp.getNode(), DAG);
22238 if (!VectorBits)
22239 return SDValue();
22240
22241 EVT StoreVT =
22242      EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
22243  SDValue ExtendedBits = DAG.getZExtOrTrunc(VectorBits, DL, StoreVT);
22244 return DAG.getStore(Store->getChain(), DL, ExtendedBits, Store->getBasePtr(),
22245 Store->getMemOperand());
22246}
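// Note (editorial addition): e.g. a truncating store of a v8i1 predicate is
// rewritten above into vectorToScalarBitmask followed by an ordinary scalar
// integer store of the resulting bitmask through the original memory operand.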
22247
22248static bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
22249  return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
22250 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
22251 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv2i32);
22252}
22253
22254// Combine store (trunc X to <3 x i8>) to sequence of ST1.b.
22255static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
22256                                   const AArch64Subtarget *Subtarget) {
22257 SDValue Value = ST->getValue();
22258 EVT ValueVT = Value.getValueType();
22259
22260 if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
22261 Value.getOpcode() != ISD::TRUNCATE ||
22262 ValueVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3))
22263 return SDValue();
22264
22265 assert(ST->getOffset().isUndef() && "undef offset expected");
22266 SDLoc DL(ST);
22267 auto WideVT = EVT::getVectorVT(
22268 *DAG.getContext(),
22269 Value->getOperand(0).getValueType().getVectorElementType(), 4);
22270 SDValue UndefVector = DAG.getUNDEF(WideVT);
22271 SDValue WideTrunc = DAG.getNode(
22272 ISD::INSERT_SUBVECTOR, DL, WideVT,
22273 {UndefVector, Value->getOperand(0), DAG.getVectorIdxConstant(0, DL)});
22274 SDValue Cast = DAG.getNode(
22275 ISD::BITCAST, DL, WideVT.getSizeInBits() == 64 ? MVT::v8i8 : MVT::v16i8,
22276 WideTrunc);
22277
22278  MachineFunction &MF = DAG.getMachineFunction();
22279  SDValue Chain = ST->getChain();
22280 MachineMemOperand *MMO = ST->getMemOperand();
22281 unsigned IdxScale = WideVT.getScalarSizeInBits() / 8;
22282 SDValue E2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
22283 DAG.getConstant(2 * IdxScale, DL, MVT::i64));
22284 TypeSize Offset2 = TypeSize::getFixed(2);
22285 SDValue Ptr2 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset2, DL);
22286 Chain = DAG.getStore(Chain, DL, E2, Ptr2, MF.getMachineMemOperand(MMO, 2, 1));
22287
22288 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
22289 DAG.getConstant(1 * IdxScale, DL, MVT::i64));
22290 TypeSize Offset1 = TypeSize::getFixed(1);
22291 SDValue Ptr1 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset1, DL);
22292 Chain = DAG.getStore(Chain, DL, E1, Ptr1, MF.getMachineMemOperand(MMO, 1, 1));
22293
22294 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
22295 DAG.getConstant(0, DL, MVT::i64));
22296 Chain = DAG.getStore(Chain, DL, E0, ST->getBasePtr(),
22297 MF.getMachineMemOperand(MMO, 0, 1));
22298 return Chain;
22299}
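// Note (editorial addition): the <3 x i8> value is widened to four lanes,
// bitcast to bytes, and stored as three single-byte stores at offsets 2, 1 and
// 0, matching the ST1.b sequence mentioned in the comment above.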
22300
22301static SDValue performSTORECombine(SDNode *N,
22302                                   TargetLowering::DAGCombinerInfo &DCI,
22303                                   SelectionDAG &DAG,
22304 const AArch64Subtarget *Subtarget) {
22305 StoreSDNode *ST = cast<StoreSDNode>(N);
22306 SDValue Chain = ST->getChain();
22307 SDValue Value = ST->getValue();
22308 SDValue Ptr = ST->getBasePtr();
22309 EVT ValueVT = Value.getValueType();
22310
22311 auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
22312 EVT EltVT = VT.getVectorElementType();
22313 return EltVT == MVT::f32 || EltVT == MVT::f64;
22314 };
22315
22316 if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
22317 return Res;
22318
22319 // If this is an FP_ROUND followed by a store, fold this into a truncating
22320 // store. We can do this even if this is already a truncstore.
22321 // We purposefully don't care about legality of the nodes here as we know
22322 // they can be split down into something legal.
22323 if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
22324 Value.getNode()->hasOneUse() && ST->isUnindexed() &&
22325 Subtarget->useSVEForFixedLengthVectors() &&
22326 ValueVT.isFixedLengthVector() &&
22327 ValueVT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits() &&
22328 hasValidElementTypeForFPTruncStore(Value.getOperand(0).getValueType()))
22329 return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
22330 ST->getMemoryVT(), ST->getMemOperand());
22331
22332 if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
22333 return Split;
22334
22335 if (Subtarget->supportsAddressTopByteIgnored() &&
22336 performTBISimplification(N->getOperand(2), DCI, DAG))
22337 return SDValue(N, 0);
22338
22339 if (SDValue Store = foldTruncStoreOfExt(DAG, N))
22340 return Store;
22341
22342 if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, ST))
22343 return Store;
22344
22345 if (ST->isTruncatingStore()) {
22346 EVT StoreVT = ST->getMemoryVT();
22347 if (!isHalvingTruncateOfLegalScalableType(ValueVT, StoreVT))
22348 return SDValue();
22349 if (SDValue Rshrnb =
22350 trySimplifySrlAddToRshrnb(ST->getOperand(1), DAG, Subtarget)) {
22351 return DAG.getTruncStore(ST->getChain(), ST, Rshrnb, ST->getBasePtr(),
22352 StoreVT, ST->getMemOperand());
22353 }
22354 }
22355
22356 return SDValue();
22357}
22358
22359static SDValue performMSTORECombine(SDNode *N,
22360                                    TargetLowering::DAGCombinerInfo &DCI,
22361                                    SelectionDAG &DAG,
22362 const AArch64Subtarget *Subtarget) {
22363 MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
22364 SDValue Value = MST->getValue();
22365 SDValue Mask = MST->getMask();
22366 SDLoc DL(N);
22367
22368 // If this is a UZP1 followed by a masked store, fold this into a masked
22369 // truncating store. We can do this even if this is already a masked
22370 // truncstore.
22371 if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() &&
22372 MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE &&
22373 Value.getValueType().isInteger()) {
22374 Value = Value.getOperand(0);
22375 if (Value.getOpcode() == ISD::BITCAST) {
22376 EVT HalfVT =
22377 Value.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
22378 EVT InVT = Value.getOperand(0).getValueType();
22379
22380 if (HalfVT.widenIntegerVectorElementType(*DAG.getContext()) == InVT) {
22381 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
22382 unsigned PgPattern = Mask->getConstantOperandVal(0);
22383
22384 // Ensure we can double the size of the predicate pattern
22385 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
22386 if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <=
22387 MinSVESize) {
22388 Mask = getPTrue(DAG, DL, InVT.changeVectorElementType(MVT::i1),
22389 PgPattern);
22390 return DAG.getMaskedStore(MST->getChain(), DL, Value.getOperand(0),
22391 MST->getBasePtr(), MST->getOffset(), Mask,
22392 MST->getMemoryVT(), MST->getMemOperand(),
22393 MST->getAddressingMode(),
22394 /*IsTruncating=*/true);
22395 }
22396 }
22397 }
22398 }
22399
22400 if (MST->isTruncatingStore()) {
22401 EVT ValueVT = Value->getValueType(0);
22402 EVT MemVT = MST->getMemoryVT();
22403 if (!isHalvingTruncateOfLegalScalableType(ValueVT, MemVT))
22404 return SDValue();
22405 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Value, DAG, Subtarget)) {
22406 return DAG.getMaskedStore(MST->getChain(), DL, Rshrnb, MST->getBasePtr(),
22407 MST->getOffset(), MST->getMask(),
22408 MST->getMemoryVT(), MST->getMemOperand(),
22409 MST->getAddressingMode(), true);
22410 }
22411 }
22412
22413 return SDValue();
22414}
22415
22416/// \return true if part of the index was folded into the Base.
22417static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale,
22418 SDLoc DL, SelectionDAG &DAG) {
22419 // This function assumes a vector of i64 indices.
22420 EVT IndexVT = Index.getValueType();
22421 if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64)
22422 return false;
22423
22424 // Simplify:
22425 // BasePtr = Ptr
22426 // Index = X + splat(Offset)
22427 // ->
22428 // BasePtr = Ptr + Offset * scale.
22429 // Index = X
22430 if (Index.getOpcode() == ISD::ADD) {
22431 if (auto Offset = DAG.getSplatValue(Index.getOperand(1))) {
22432 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
22433 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
22434 Index = Index.getOperand(0);
22435 return true;
22436 }
22437 }
22438
22439 // Simplify:
22440 // BasePtr = Ptr
22441 // Index = (X + splat(Offset)) << splat(Shift)
22442 // ->
22443  //   BasePtr = Ptr + (Offset << Shift) * scale.
22444 // Index = X << splat(shift)
22445 if (Index.getOpcode() == ISD::SHL &&
22446 Index.getOperand(0).getOpcode() == ISD::ADD) {
22447 SDValue Add = Index.getOperand(0);
22448 SDValue ShiftOp = Index.getOperand(1);
22449 SDValue OffsetOp = Add.getOperand(1);
22450 if (auto Shift = DAG.getSplatValue(ShiftOp))
22451 if (auto Offset = DAG.getSplatValue(OffsetOp)) {
22452 Offset = DAG.getNode(ISD::SHL, DL, MVT::i64, Offset, Shift);
22453 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
22454 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
22455 Index = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
22456 Add.getOperand(0), ShiftOp);
22457 return true;
22458 }
22459 }
22460
22461 return false;
22462}
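// Worked example (editorial addition): for a gather with BasePtr = p,
// Index = X + splat(4) and Scale = 8, the first pattern above rewrites this to
// BasePtr = p + 32 and Index = X, leaving a simpler index vector.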
22463
22464// Analyse the specified address returning true if a more optimal addressing
22465// mode is available. When returning true all parameters are updated to reflect
22466// their recommended values.
22467static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N,
22468 SDValue &BasePtr, SDValue &Index,
22469 SelectionDAG &DAG) {
22470 // Try to iteratively fold parts of the index into the base pointer to
22471 // simplify the index as much as possible.
22472 bool Changed = false;
22473 while (foldIndexIntoBase(BasePtr, Index, N->getScale(), SDLoc(N), DAG))
22474 Changed = true;
22475
22476 // Only consider element types that are pointer sized as smaller types can
22477 // be easily promoted.
22478 EVT IndexVT = Index.getValueType();
22479 if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
22480 return Changed;
22481
22482 // Can indices be trivially shrunk?
22483 EVT DataVT = N->getOperand(1).getValueType();
22484 // Don't attempt to shrink the index for fixed vectors of 64 bit data since it
22485 // will later be re-extended to 64 bits in legalization
22486 if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64)
22487 return Changed;
22488 if (ISD::isVectorShrinkable(Index.getNode(), 32, N->isIndexSigned())) {
22489 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
22490 Index = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NewIndexVT, Index);
22491 return true;
22492 }
22493
22494 // Match:
22495 // Index = step(const)
22496 int64_t Stride = 0;
22497 if (Index.getOpcode() == ISD::STEP_VECTOR) {
22498 Stride = cast<ConstantSDNode>(Index.getOperand(0))->getSExtValue();
22499 }
22500 // Match:
22501 // Index = step(const) << shift(const)
22502 else if (Index.getOpcode() == ISD::SHL &&
22503 Index.getOperand(0).getOpcode() == ISD::STEP_VECTOR) {
22504 SDValue RHS = Index.getOperand(1);
22505 if (auto *Shift =
22506 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(RHS))) {
22507 int64_t Step = (int64_t)Index.getOperand(0).getConstantOperandVal(1);
22508 Stride = Step << Shift->getZExtValue();
22509 }
22510 }
22511
22512 // Return early because no supported pattern is found.
22513 if (Stride == 0)
22514 return Changed;
22515
22516 if (Stride < std::numeric_limits<int32_t>::min() ||
22517 Stride > std::numeric_limits<int32_t>::max())
22518 return Changed;
22519
22520 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
22521  unsigned MaxVScale =
22522      Subtarget.getMaxSVEVectorSizeInBits() / AArch64::SVEBitsPerBlock;
22523 int64_t LastElementOffset =
22524 IndexVT.getVectorMinNumElements() * Stride * MaxVScale;
22525
22526 if (LastElementOffset < std::numeric_limits<int32_t>::min() ||
22527 LastElementOffset > std::numeric_limits<int32_t>::max())
22528 return Changed;
22529
22530 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
22531 // Stride does not scale explicitly by 'Scale', because it happens in
22532 // the gather/scatter addressing mode.
22533 Index = DAG.getStepVector(SDLoc(N), NewIndexVT, APInt(32, Stride));
22534 return true;
22535}
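// Note (editorial addition): e.g. an i64 step_vector index with a small
// constant stride is replaced above by an equivalent 32-bit step vector once
// the bounds check on LastElementOffset proves every offset fits in an i32,
// which keeps the index cheaper to legalise.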
22536
22537static SDValue performMaskedGatherScatterCombine(
22538    SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
22539  MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N);
22540 assert(MGS && "Can only combine gather load or scatter store nodes");
22541
22542 if (!DCI.isBeforeLegalize())
22543 return SDValue();
22544
22545 SDLoc DL(MGS);
22546 SDValue Chain = MGS->getChain();
22547 SDValue Scale = MGS->getScale();
22548 SDValue Index = MGS->getIndex();
22549 SDValue Mask = MGS->getMask();
22550 SDValue BasePtr = MGS->getBasePtr();
22551 ISD::MemIndexType IndexType = MGS->getIndexType();
22552
22553 if (!findMoreOptimalIndexType(MGS, BasePtr, Index, DAG))
22554 return SDValue();
22555
22556 // Here we catch such cases early and change MGATHER's IndexType to allow
22557 // the use of an Index that's more legalisation friendly.
22558 if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) {
22559 SDValue PassThru = MGT->getPassThru();
22560 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
22561 return DAG.getMaskedGather(
22562 DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
22563 Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
22564 }
22565 auto *MSC = cast<MaskedScatterSDNode>(MGS);
22566 SDValue Data = MSC->getValue();
22567 SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
22568 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(), DL,
22569 Ops, MSC->getMemOperand(), IndexType,
22570 MSC->isTruncatingStore());
22571}
22572
22573/// Target-specific DAG combine function for NEON load/store intrinsics
22574/// to merge base address updates.
22575static SDValue performNEONPostLDSTCombine(SDNode *N,
22576                                          TargetLowering::DAGCombinerInfo &DCI,
22577                                          SelectionDAG &DAG) {
22578 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
22579 return SDValue();
22580
22581 unsigned AddrOpIdx = N->getNumOperands() - 1;
22582 SDValue Addr = N->getOperand(AddrOpIdx);
22583
22584 // Search for a use of the address operand that is an increment.
22585 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
22586 UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
22587 SDNode *User = *UI;
22588 if (User->getOpcode() != ISD::ADD ||
22589 UI.getUse().getResNo() != Addr.getResNo())
22590 continue;
22591
22592 // Check that the add is independent of the load/store. Otherwise, folding
22593 // it would create a cycle.
22594    SmallPtrSet<const SDNode *, 32> Visited;
22595    SmallVector<const SDNode *, 16> Worklist;
22596    Visited.insert(Addr.getNode());
22597 Worklist.push_back(N);
22598 Worklist.push_back(User);
22599 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
22600 SDNode::hasPredecessorHelper(User, Visited, Worklist))
22601 continue;
22602
22603 // Find the new opcode for the updating load/store.
22604 bool IsStore = false;
22605 bool IsLaneOp = false;
22606 bool IsDupOp = false;
22607 unsigned NewOpc = 0;
22608 unsigned NumVecs = 0;
22609 unsigned IntNo = N->getConstantOperandVal(1);
22610 switch (IntNo) {
22611 default: llvm_unreachable("unexpected intrinsic for Neon base update");
22612 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
22613 NumVecs = 2; break;
22614 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
22615 NumVecs = 3; break;
22616 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
22617 NumVecs = 4; break;
22618 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
22619 NumVecs = 2; IsStore = true; break;
22620 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
22621 NumVecs = 3; IsStore = true; break;
22622 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
22623 NumVecs = 4; IsStore = true; break;
22624 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
22625 NumVecs = 2; break;
22626 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
22627 NumVecs = 3; break;
22628 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
22629 NumVecs = 4; break;
22630 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
22631 NumVecs = 2; IsStore = true; break;
22632 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
22633 NumVecs = 3; IsStore = true; break;
22634 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
22635 NumVecs = 4; IsStore = true; break;
22636 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
22637 NumVecs = 2; IsDupOp = true; break;
22638 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
22639 NumVecs = 3; IsDupOp = true; break;
22640 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
22641 NumVecs = 4; IsDupOp = true; break;
22642 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
22643 NumVecs = 2; IsLaneOp = true; break;
22644 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
22645 NumVecs = 3; IsLaneOp = true; break;
22646 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
22647 NumVecs = 4; IsLaneOp = true; break;
22648 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
22649 NumVecs = 2; IsStore = true; IsLaneOp = true; break;
22650 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
22651 NumVecs = 3; IsStore = true; IsLaneOp = true; break;
22652 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
22653 NumVecs = 4; IsStore = true; IsLaneOp = true; break;
22654 }
22655
22656 EVT VecTy;
22657 if (IsStore)
22658 VecTy = N->getOperand(2).getValueType();
22659 else
22660 VecTy = N->getValueType(0);
22661
22662 // If the increment is a constant, it must match the memory ref size.
22663 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
22664 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
22665 uint32_t IncVal = CInc->getZExtValue();
22666 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
22667 if (IsLaneOp || IsDupOp)
22668 NumBytes /= VecTy.getVectorNumElements();
22669 if (IncVal != NumBytes)
22670 continue;
22671 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
22672 }
22673    SmallVector<SDValue, 8> Ops;
22674    Ops.push_back(N->getOperand(0)); // Incoming chain
22675 // Load lane and store have vector list as input.
22676 if (IsLaneOp || IsStore)
22677 for (unsigned i = 2; i < AddrOpIdx; ++i)
22678 Ops.push_back(N->getOperand(i));
22679 Ops.push_back(Addr); // Base register
22680 Ops.push_back(Inc);
22681
22682 // Return Types.
22683 EVT Tys[6];
22684 unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
22685 unsigned n;
22686 for (n = 0; n < NumResultVecs; ++n)
22687 Tys[n] = VecTy;
22688 Tys[n++] = MVT::i64; // Type of write back register
22689 Tys[n] = MVT::Other; // Type of the chain
22690 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
22691
22692 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
22693 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
22694 MemInt->getMemoryVT(),
22695 MemInt->getMemOperand());
22696
22697 // Update the uses.
22698 std::vector<SDValue> NewResults;
22699 for (unsigned i = 0; i < NumResultVecs; ++i) {
22700 NewResults.push_back(SDValue(UpdN.getNode(), i));
22701 }
22702 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
22703 DCI.CombineTo(N, NewResults);
22704 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
22705
22706 break;
22707 }
22708 return SDValue();
22709}
22710
22711// Checks to see if the value is the prescribed width and returns information
22712// about its extension mode.
22713static
22714bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
22715 ExtType = ISD::NON_EXTLOAD;
22716 switch(V.getNode()->getOpcode()) {
22717 default:
22718 return false;
22719 case ISD::LOAD: {
22720 LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
22721 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
22722 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
22723 ExtType = LoadNode->getExtensionType();
22724 return true;
22725 }
22726 return false;
22727 }
22728 case ISD::AssertSext: {
22729 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
22730 if ((TypeNode->getVT() == MVT::i8 && width == 8)
22731 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
22732 ExtType = ISD::SEXTLOAD;
22733 return true;
22734 }
22735 return false;
22736 }
22737 case ISD::AssertZext: {
22738 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
22739 if ((TypeNode->getVT() == MVT::i8 && width == 8)
22740 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
22741 ExtType = ISD::ZEXTLOAD;
22742 return true;
22743 }
22744 return false;
22745 }
22746 case ISD::Constant:
22747 case ISD::TargetConstant: {
22748 return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
22749 1LL << (width - 1);
22750 }
22751 }
22752
22753 return true;
22754}
22755
22756// This function does a whole lot of voodoo to determine if the tests are
22757// equivalent without and with a mask. Essentially what happens is that given a
22758// DAG resembling:
22759//
22760// +-------------+ +-------------+ +-------------+ +-------------+
22761// | Input | | AddConstant | | CompConstant| | CC |
22762// +-------------+ +-------------+ +-------------+ +-------------+
22763// | | | |
22764// V V | +----------+
22765// +-------------+ +----+ | |
22766// | ADD | |0xff| | |
22767// +-------------+ +----+ | |
22768// | | | |
22769// V V | |
22770// +-------------+ | |
22771// | AND | | |
22772// +-------------+ | |
22773// | | |
22774// +-----+ | |
22775// | | |
22776// V V V
22777// +-------------+
22778// | CMP |
22779// +-------------+
22780//
22781// The AND node may be safely removed for some combinations of inputs. In
22782// particular we need to take into account the extension type of the Input,
22783// the exact values of AddConstant, CompConstant, and CC, along with the nominal
22784// width of the input (this can work for any width inputs; the above graph is
22785// specific to 8 bits).
22786//
22787// The specific equations were worked out by generating output tables for each
22788// AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
22789// problem was simplified by working with 4 bit inputs, which means we only
22790// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
22791// extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
22792// patterns present in both extensions (0,7). For every distinct set of
22793// AddConstant and CompConstants bit patterns we can consider the masked and
22794// unmasked versions to be equivalent if the result of this function is true for
22795// all 16 distinct bit patterns for the current extension type of Input (w0).
22796//
22797// sub w8, w0, w1
22798// and w10, w8, #0x0f
22799// cmp w8, w2
22800// cset w9, AArch64CC
22801// cmp w10, w2
22802// cset w11, AArch64CC
22803// cmp w9, w11
22804// cset w0, eq
22805// ret
22806//
22807// Since the above function shows when the outputs are equivalent it defines
22808// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
22809// would be expensive to run during compiles. The equations below were written
22810// in a test harness that confirmed they gave outputs equivalent to the above
22811// function for all inputs, so they can be used to determine if the removal is
22812// legal instead.
22813//
22814// isEquivalentMaskless() is the code for testing if the AND can be removed. It
22815// is factored out of the DAG recognition because the DAG can take several forms.
22816
22817static bool isEquivalentMaskless(unsigned CC, unsigned width,
22818 ISD::LoadExtType ExtType, int AddConstant,
22819 int CompConstant) {
22820 // By being careful about our equations and only writing them in terms of
22821 // symbolic values and well known constants (0, 1, -1, MaxUInt) we can
22822 // make them generally applicable to all bit widths.
22823 int MaxUInt = (1 << width);
22824
22825 // For the purposes of these comparisons sign extending the type is
22826 // equivalent to zero extending the add and displacing it by half the integer
22827 // width. Provided we are careful and make sure our equations are valid over
22828 // the whole range we can just adjust the input and avoid writing equations
22829 // for sign extended inputs.
22830 if (ExtType == ISD::SEXTLOAD)
22831 AddConstant -= (1 << (width-1));
22832
22833 switch(CC) {
22834 case AArch64CC::LE:
22835 case AArch64CC::GT:
22836 if ((AddConstant == 0) ||
22837 (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
22838 (AddConstant >= 0 && CompConstant < 0) ||
22839 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
22840 return true;
22841 break;
22842 case AArch64CC::LT:
22843 case AArch64CC::GE:
22844 if ((AddConstant == 0) ||
22845 (AddConstant >= 0 && CompConstant <= 0) ||
22846 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
22847 return true;
22848 break;
22849 case AArch64CC::HI:
22850 case AArch64CC::LS:
22851 if ((AddConstant >= 0 && CompConstant < 0) ||
22852 (AddConstant <= 0 && CompConstant >= -1 &&
22853 CompConstant < AddConstant + MaxUInt))
22854 return true;
22855 break;
22856 case AArch64CC::PL:
22857 case AArch64CC::MI:
22858 if ((AddConstant == 0) ||
22859 (AddConstant > 0 && CompConstant <= 0) ||
22860 (AddConstant < 0 && CompConstant <= AddConstant))
22861 return true;
22862 break;
22863 case AArch64CC::LO:
22864 case AArch64CC::HS:
22865 if ((AddConstant >= 0 && CompConstant <= 0) ||
22866 (AddConstant <= 0 && CompConstant >= 0 &&
22867 CompConstant <= AddConstant + MaxUInt))
22868 return true;
22869 break;
22870 case AArch64CC::EQ:
22871 case AArch64CC::NE:
22872 if ((AddConstant > 0 && CompConstant < 0) ||
22873 (AddConstant < 0 && CompConstant >= 0 &&
22874 CompConstant < AddConstant + MaxUInt) ||
22875 (AddConstant >= 0 && CompConstant >= 0 &&
22876 CompConstant >= AddConstant) ||
22877 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
22878 return true;
22879 break;
22880 case AArch64CC::VS:
22881 case AArch64CC::VC:
22882 case AArch64CC::AL:
22883 case AArch64CC::NV:
22884 return true;
22885 case AArch64CC::Invalid:
22886 break;
22887 }
22888
22889 return false;
22890}
22891
22892// (X & C) >u Mask --> ((X & (C & ~Mask)) != 0)
22893// (X & C) <u Pow2 --> ((X & (C & ~(Pow2-1))) == 0)
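//
// A concrete instance (for illustration): with C == 0xff and Mask == 0x0f,
// "(X & 0xff) >u 15" holds iff "(X & 0xf0) != 0"; with C == 0xff and
// Pow2 == 16, "(X & 0xff) <u 16" holds iff "(X & 0xf0) == 0".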
22894static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode,
22895 SDNode *AndNode, SelectionDAG &DAG,
22896 unsigned CCIndex, unsigned CmpIndex,
22897 unsigned CC) {
22898 ConstantSDNode *SubsC = dyn_cast<ConstantSDNode>(SubsNode->getOperand(1));
22899 if (!SubsC)
22900 return SDValue();
22901
22902 APInt SubsAP = SubsC->getAPIntValue();
22903 if (CC == AArch64CC::HI) {
22904 if (!SubsAP.isMask())
22905 return SDValue();
22906 } else if (CC == AArch64CC::LO) {
22907 if (!SubsAP.isPowerOf2())
22908 return SDValue();
22909 } else
22910 return SDValue();
22911
22912 ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(AndNode->getOperand(1));
22913 if (!AndC)
22914 return SDValue();
22915
22916 APInt MaskAP = CC == AArch64CC::HI ? SubsAP : (SubsAP - 1);
22917
22918 SDLoc DL(N);
22919 APInt AndSMask = (~MaskAP) & AndC->getAPIntValue();
22920 SDValue ANDS = DAG.getNode(
22921 AArch64ISD::ANDS, DL, SubsNode->getVTList(), AndNode->getOperand(0),
22922 DAG.getConstant(AndSMask, DL, SubsC->getValueType(0)));
22923 SDValue AArch64_CC =
22924 DAG.getConstant(CC == AArch64CC::HI ? AArch64CC::NE : AArch64CC::EQ, DL,
22925 N->getOperand(CCIndex)->getValueType(0));
22926
22927 // For now, only performCSELCombine and performBRCONDCombine call this
22928 // function. Both of them pass 2 for CCIndex and 3 for CmpIndex with 4
22929 // operands, so just initialize the operands directly to simplify the code.
22930 // If some other caller ever passes a different CCIndex or CmpIndex, this
22931 // will need to be rewritten with a loop over the operands.
22932 // TODO: Do we need to assert that the number of operands is 4 here?
22933 assert((CCIndex == 2 && CmpIndex == 3) &&
22934 "Expected CCIndex to be 2 and CmpIndex to be 3.");
22935 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), AArch64_CC,
22936 ANDS.getValue(1)};
22937 return DAG.getNode(N->getOpcode(), N, N->getVTList(), Ops);
22938}
22939
22940 static
22941SDValue performCONDCombine(SDNode *N,
22942 TargetLowering::DAGCombinerInfo &DCI,
22943 SelectionDAG &DAG, unsigned CCIndex,
22944 unsigned CmpIndex) {
22945 unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
22946 SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
22947 unsigned CondOpcode = SubsNode->getOpcode();
22948
22949 if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(0) ||
22950 !SubsNode->hasOneUse())
22951 return SDValue();
22952
22953 // There is a SUBS feeding this condition. Is it fed by a mask we can
22954 // use?
22955
22956 SDNode *AndNode = SubsNode->getOperand(0).getNode();
22957 unsigned MaskBits = 0;
22958
22959 if (AndNode->getOpcode() != ISD::AND)
22960 return SDValue();
22961
22962 if (SDValue Val = performSubsToAndsCombine(N, SubsNode, AndNode, DAG, CCIndex,
22963 CmpIndex, CC))
22964 return Val;
22965
22966 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
22967 uint32_t CNV = CN->getZExtValue();
22968 if (CNV == 255)
22969 MaskBits = 8;
22970 else if (CNV == 65535)
22971 MaskBits = 16;
22972 }
22973
22974 if (!MaskBits)
22975 return SDValue();
22976
22977 SDValue AddValue = AndNode->getOperand(0);
22978
22979 if (AddValue.getOpcode() != ISD::ADD)
22980 return SDValue();
22981
22982 // The basic dag structure is correct, grab the inputs and validate them.
22983
22984 SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
22985 SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
22986 SDValue SubsInputValue = SubsNode->getOperand(1);
22987
22988 // The mask is present and the provenance of all the values is a smaller type,
22989 // so let's see if the mask is superfluous.
22990
22991 if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
22992 !isa<ConstantSDNode>(SubsInputValue.getNode()))
22993 return SDValue();
22994
22995 ISD::LoadExtType ExtType;
22996
22997 if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
22998 !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
22999 !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
23000 return SDValue();
23001
23002 if(!isEquivalentMaskless(CC, MaskBits, ExtType,
23003 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
23004 cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
23005 return SDValue();
23006
23007 // The AND is not necessary, remove it.
23008
23009 SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
23010 SubsNode->getValueType(1));
23011 SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
23012
23013 SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
23014 DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
23015
23016 return SDValue(N, 0);
23017}
23018
23019// Optimize compare with zero and branch.
23020static SDValue performBRCONDCombine(SDNode *N,
23021 TargetLowering::DAGCombinerInfo &DCI,
23022 SelectionDAG &DAG) {
23023 MachineFunction &MF = DAG.getMachineFunction();
23024 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
23025 // will not be produced, as they are conditional branch instructions that do
23026 // not set flags.
23027 if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
23028 return SDValue();
23029
23030 if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
23031 N = NV.getNode();
23032 SDValue Chain = N->getOperand(0);
23033 SDValue Dest = N->getOperand(1);
23034 SDValue CCVal = N->getOperand(2);
23035 SDValue Cmp = N->getOperand(3);
23036
23037 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
23038 unsigned CC = CCVal->getAsZExtVal();
23039 if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
23040 return SDValue();
23041
23042 unsigned CmpOpc = Cmp.getOpcode();
23043 if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
23044 return SDValue();
23045
23046 // Only attempt folding if there is only one use of the flag and no use of the
23047 // value.
23048 if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
23049 return SDValue();
23050
23051 SDValue LHS = Cmp.getOperand(0);
23052 SDValue RHS = Cmp.getOperand(1);
23053
23054 assert(LHS.getValueType() == RHS.getValueType() &&
23055 "Expected the value type to be the same for both operands!");
23056 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
23057 return SDValue();
23058
23059 if (isNullConstant(LHS))
23060 std::swap(LHS, RHS);
23061
23062 if (!isNullConstant(RHS))
23063 return SDValue();
23064
23065 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
23066 LHS.getOpcode() == ISD::SRL)
23067 return SDValue();
23068
23069 // Fold the compare into the branch instruction.
23070 SDValue BR;
23071 if (CC == AArch64CC::EQ)
23072 BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
23073 else
23074 BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
23075
23076 // Do not add new nodes to DAG combiner worklist.
23077 DCI.CombineTo(N, BR, false);
23078
23079 return SDValue();
23080}
23081
23083 unsigned CC = N->getConstantOperandVal(2);
23084 SDValue SUBS = N->getOperand(3);
23085 SDValue Zero, CTTZ;
23086
23087 if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) {
23088 Zero = N->getOperand(0);
23089 CTTZ = N->getOperand(1);
23090 } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) {
23091 Zero = N->getOperand(1);
23092 CTTZ = N->getOperand(0);
23093 } else
23094 return SDValue();
23095
23096 if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) ||
23097 (CTTZ.getOpcode() == ISD::TRUNCATE &&
23098 CTTZ.getOperand(0).getOpcode() != ISD::CTTZ))
23099 return SDValue();
23100
23101 assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) &&
23102 "Illegal type in CTTZ folding");
23103
23104 if (!isNullConstant(Zero) || !isNullConstant(SUBS.getOperand(1)))
23105 return SDValue();
23106
23107 SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE
23108 ? CTTZ.getOperand(0).getOperand(0)
23109 : CTTZ.getOperand(0);
23110
23111 if (X != SUBS.getOperand(0))
23112 return SDValue();
23113
23114 unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE
23115 ? CTTZ.getOperand(0).getValueSizeInBits()
23116 : CTTZ.getValueSizeInBits();
23117 SDValue BitWidthMinusOne =
23118 DAG.getConstant(BitWidth - 1, SDLoc(N), CTTZ.getValueType());
23119 return DAG.getNode(ISD::AND, SDLoc(N), CTTZ.getValueType(), CTTZ,
23120 BitWidthMinusOne);
23121}
23122
23123// (CSEL l r EQ (CMP (CSEL x y cc2 cond) x)) => (CSEL l r cc2 cond)
23124// (CSEL l r EQ (CMP (CSEL x y cc2 cond) y)) => (CSEL l r !cc2 cond)
23125// Where x and y are constants and x != y
23126
23127// (CSEL l r NE (CMP (CSEL x y cc2 cond) x)) => (CSEL l r !cc2 cond)
23128// (CSEL l r NE (CMP (CSEL x y cc2 cond) y)) => (CSEL l r cc2 cond)
23129// Where x and y are constants and x != y
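//
// Sketch of the first pattern with made-up constants: if t0 = (CSEL 1, 2, cc2,
// cond), then testing (CSEL l, r, EQ, (CMP t0, 1)) selects l exactly when cc2
// held, so the outer select can simply use cc2 directly (and the inverted
// condition for NE, or when comparing against 2 instead of 1).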
23130static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG) {
23131 SDValue L = Op->getOperand(0);
23132 SDValue R = Op->getOperand(1);
23133 AArch64CC::CondCode OpCC =
23134 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2));
23135
23136 SDValue OpCmp = Op->getOperand(3);
23137 if (!isCMP(OpCmp))
23138 return SDValue();
23139
23140 SDValue CmpLHS = OpCmp.getOperand(0);
23141 SDValue CmpRHS = OpCmp.getOperand(1);
23142
23143 if (CmpRHS.getOpcode() == AArch64ISD::CSEL)
23144 std::swap(CmpLHS, CmpRHS);
23145 else if (CmpLHS.getOpcode() != AArch64ISD::CSEL)
23146 return SDValue();
23147
23148 SDValue X = CmpLHS->getOperand(0);
23149 SDValue Y = CmpLHS->getOperand(1);
23150 if (!isa<ConstantSDNode>(X) || !isa<ConstantSDNode>(Y) || X == Y) {
23151 return SDValue();
23152 }
23153
23154 // If one of the constants is an opaque constant, the x and y SDNodes can
23155 // still be different even though the underlying values are the same. So
23156 // check the APInt values here to make sure the code is correct.
23157 ConstantSDNode *CX = cast<ConstantSDNode>(X);
23158 ConstantSDNode *CY = cast<ConstantSDNode>(Y);
23159 if (CX->getAPIntValue() == CY->getAPIntValue())
23160 return SDValue();
23161
23162 AArch64CC::CondCode CC =
23163 static_cast<AArch64CC::CondCode>(CmpLHS->getConstantOperandVal(2));
23164 SDValue Cond = CmpLHS->getOperand(3);
23165
23166 if (CmpRHS == Y)
23167 CC = AArch64CC::getInvertedCondCode(CC);
23168 else if (CmpRHS != X)
23169 return SDValue();
23170
23171 if (OpCC == AArch64CC::NE)
23172 CC = AArch64CC::getInvertedCondCode(CC);
23173 else if (OpCC != AArch64CC::EQ)
23174 return SDValue();
23175
23176 SDLoc DL(Op);
23177 EVT VT = Op->getValueType(0);
23178
23179 SDValue CCValue = DAG.getConstant(CC, DL, MVT::i32);
23180 return DAG.getNode(AArch64ISD::CSEL, DL, VT, L, R, CCValue, Cond);
23181}
23182
23183// Optimize CSEL instructions
23184static SDValue performCSELCombine(SDNode *N,
23185 TargetLowering::DAGCombinerInfo &DCI,
23186 SelectionDAG &DAG) {
23187 // CSEL x, x, cc -> x
23188 if (N->getOperand(0) == N->getOperand(1))
23189 return N->getOperand(0);
23190
23191 if (SDValue R = foldCSELOfCSEL(N, DAG))
23192 return R;
23193
23194 // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
23195 // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
23196 if (SDValue Folded = foldCSELofCTTZ(N, DAG))
23197 return Folded;
23198
23199 return performCONDCombine(N, DCI, DAG, 2, 3);
23200}
23201
23202// Try to re-use an already extended operand of a vector SetCC feeding an
23203// extended select. Doing so avoids requiring another full extension of the
23204// SET_CC result when lowering the select.
23205static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) {
23206 EVT Op0MVT = Op->getOperand(0).getValueType();
23207 if (!Op0MVT.isVector() || Op->use_empty())
23208 return SDValue();
23209
23210 // Make sure that all uses of Op are VSELECTs with result matching types where
23211 // the result type has a larger element type than the SetCC operand.
23212 SDNode *FirstUse = *Op->use_begin();
23213 if (FirstUse->getOpcode() != ISD::VSELECT)
23214 return SDValue();
23215 EVT UseMVT = FirstUse->getValueType(0);
23216 if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits())
23217 return SDValue();
23218 if (any_of(Op->uses(), [&UseMVT](const SDNode *N) {
23219 return N->getOpcode() != ISD::VSELECT || N->getValueType(0) != UseMVT;
23220 }))
23221 return SDValue();
23222
23223 APInt V;
23224 if (!ISD::isConstantSplatVector(Op->getOperand(1).getNode(), V))
23225 return SDValue();
23226
23227 SDLoc DL(Op);
23228 SDValue Op0ExtV;
23229 SDValue Op1ExtV;
23230 ISD::CondCode CC = cast<CondCodeSDNode>(Op->getOperand(2))->get();
23231 // Check if the first operand of the SET_CC is already extended. If it is,
23232 // split the SET_CC and re-use the extended version of the operand.
23233 SDNode *Op0SExt = DAG.getNodeIfExists(ISD::SIGN_EXTEND, DAG.getVTList(UseMVT),
23234 Op->getOperand(0));
23235 SDNode *Op0ZExt = DAG.getNodeIfExists(ISD::ZERO_EXTEND, DAG.getVTList(UseMVT),
23236 Op->getOperand(0));
23237 if (Op0SExt && (isSignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
23238 Op0ExtV = SDValue(Op0SExt, 0);
23239 Op1ExtV = DAG.getNode(ISD::SIGN_EXTEND, DL, UseMVT, Op->getOperand(1));
23240 } else if (Op0ZExt && (isUnsignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
23241 Op0ExtV = SDValue(Op0ZExt, 0);
23242 Op1ExtV = DAG.getNode(ISD::ZERO_EXTEND, DL, UseMVT, Op->getOperand(1));
23243 } else
23244 return SDValue();
23245
23246 return DAG.getNode(ISD::SETCC, DL, UseMVT.changeVectorElementType(MVT::i1),
23247 Op0ExtV, Op1ExtV, Op->getOperand(2));
23248}
23249
23250static SDValue
23251performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
23252 SelectionDAG &DAG) {
23253 SDValue Vec = N->getOperand(0);
23254 if (DCI.isBeforeLegalize() &&
23255 Vec.getValueType().getVectorElementType() == MVT::i1 &&
23258 SDLoc DL(N);
23259 return getVectorBitwiseReduce(N->getOpcode(), Vec, N->getValueType(0), DL,
23260 DAG);
23261 }
23262
23263 return SDValue();
23264}
23265
23266static SDValue performSETCCCombine(SDNode *N,
23267 TargetLowering::DAGCombinerInfo &DCI,
23268 SelectionDAG &DAG) {
23269 assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
23270 SDValue LHS = N->getOperand(0);
23271 SDValue RHS = N->getOperand(1);
23272 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
23273 SDLoc DL(N);
23274 EVT VT = N->getValueType(0);
23275
23276 if (SDValue V = tryToWidenSetCCOperands(N, DAG))
23277 return V;
23278
23279 // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
23280 if (Cond == ISD::SETNE && isOneConstant(RHS) &&
23281 LHS->getOpcode() == AArch64ISD::CSEL &&
23282 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
23283 LHS->hasOneUse()) {
23284 // Invert CSEL's condition.
23285 auto OldCond =
23286 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
23287 auto NewCond = getInvertedCondCode(OldCond);
23288
23289 // csel 0, 1, !cond, X
23290 SDValue CSEL =
23291 DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(), LHS.getOperand(0),
23292 LHS.getOperand(1), DAG.getConstant(NewCond, DL, MVT::i32),
23293 LHS.getOperand(3));
23294 return DAG.getZExtOrTrunc(CSEL, DL, VT);
23295 }
23296
23297 // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne
23298 if (Cond == ISD::SETNE && isNullConstant(RHS) &&
23299 LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(LHS->getOperand(1)) &&
23300 LHS->getConstantOperandVal(1) < VT.getScalarSizeInBits() &&
23301 LHS->hasOneUse()) {
23302 EVT TstVT = LHS->getValueType(0);
23303 if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64) {
23304 // This pattern will be optimized better in emitComparison.
23305 uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(1);
23306 SDValue TST = DAG.getNode(ISD::AND, DL, TstVT, LHS->getOperand(0),
23307 DAG.getConstant(TstImm, DL, TstVT));
23308 return DAG.getNode(ISD::SETCC, DL, VT, TST, RHS, N->getOperand(2));
23309 }
23310 }
23311
23312 // setcc (iN (bitcast (vNi1 X))), 0, (eq|ne)
23313 // ==> setcc (iN (zext (i1 (vecreduce_or (vNi1 X))))), 0, (eq|ne)
23314 // setcc (iN (bitcast (vNi1 X))), -1, (eq|ne)
23315 // ==> setcc (iN (sext (i1 (vecreduce_and (vNi1 X))))), -1, (eq|ne)
23316 if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
23317 (Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
23318 (isNullConstant(RHS) || isAllOnesConstant(RHS)) &&
23319 LHS->getOpcode() == ISD::BITCAST) {
23320 EVT ToVT = LHS->getValueType(0);
23321 EVT FromVT = LHS->getOperand(0).getValueType();
23322 if (FromVT.isFixedLengthVector() &&
23323 FromVT.getVectorElementType() == MVT::i1) {
23324 bool IsNull = isNullConstant(RHS);
23325 LHS = DAG.getNode(IsNull ? ISD::VECREDUCE_OR : ISD::VECREDUCE_AND,
23326 DL, MVT::i1, LHS->getOperand(0));
23327 LHS = DAG.getNode(IsNull ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, ToVT,
23328 LHS);
23329 return DAG.getSetCC(DL, VT, LHS, RHS, Cond);
23330 }
23331 }
23332
23333 // Try to perform the memcmp when the result is tested for [in]equality with 0
23334 if (SDValue V = performOrXorChainCombine(N, DAG))
23335 return V;
23336
23337 return SDValue();
23338}
23339
23340// Replace a flag-setting operator (eg ANDS) with the generic version
23341// (eg AND) if the flag is unused.
23342static SDValue performFlagSettingCombine(SDNode *N,
23343 TargetLowering::DAGCombinerInfo &DCI,
23344 unsigned GenericOpcode) {
23345 SDLoc DL(N);
23346 SDValue LHS = N->getOperand(0);
23347 SDValue RHS = N->getOperand(1);
23348 EVT VT = N->getValueType(0);
23349
23350 // If the flag result isn't used, convert back to a generic opcode.
23351 if (!N->hasAnyUseOfValue(1)) {
23352 SDValue Res = DCI.DAG.getNode(GenericOpcode, DL, VT, N->ops());
23353 return DCI.DAG.getMergeValues({Res, DCI.DAG.getConstant(0, DL, MVT::i32)},
23354 DL);
23355 }
23356
23357 // Combine identical generic nodes into this node, re-using the result.
23358 if (SDNode *Generic = DCI.DAG.getNodeIfExists(
23359 GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS}))
23360 DCI.CombineTo(Generic, SDValue(N, 0));
23361
23362 return SDValue();
23363}
23364
23365static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) {
23366 // setcc_merge_zero pred
23367 // (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne
23368 // => extract_subvector (inner setcc_merge_zero)
23369 SDValue Pred = N->getOperand(0);
23370 SDValue LHS = N->getOperand(1);
23371 SDValue RHS = N->getOperand(2);
23372 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
23373
23374 if (Cond != ISD::SETNE || !isZerosVector(RHS.getNode()) ||
23375 LHS->getOpcode() != ISD::SIGN_EXTEND)
23376 return SDValue();
23377
23378 SDValue Extract = LHS->getOperand(0);
23379 if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
23380 Extract->getValueType(0) != N->getValueType(0) ||
23381 Extract->getConstantOperandVal(1) != 0)
23382 return SDValue();
23383
23384 SDValue InnerSetCC = Extract->getOperand(0);
23385 if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO)
23386 return SDValue();
23387
23388 // By this point we've effectively got
23389 // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive
23390 // lanes are already zero then the trunc(sext()) sequence is redundant and we
23391 // can operate on A directly.
23392 SDValue InnerPred = InnerSetCC.getOperand(0);
23393 if (Pred.getOpcode() == AArch64ISD::PTRUE &&
23394 InnerPred.getOpcode() == AArch64ISD::PTRUE &&
23395 Pred.getConstantOperandVal(0) == InnerPred.getConstantOperandVal(0) &&
23396 Pred->getConstantOperandVal(0) >= AArch64SVEPredPattern::vl1 &&
23397 Pred->getConstantOperandVal(0) <= AArch64SVEPredPattern::vl256)
23398 return Extract;
23399
23400 return SDValue();
23401}
23402
23403static SDValue
23404performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
23405 assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
23406 "Unexpected opcode!");
23407
23408 SelectionDAG &DAG = DCI.DAG;
23409 SDValue Pred = N->getOperand(0);
23410 SDValue LHS = N->getOperand(1);
23411 SDValue RHS = N->getOperand(2);
23412 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
23413
23414 if (SDValue V = performSetCCPunpkCombine(N, DAG))
23415 return V;
23416
23417 if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
23418 LHS->getOpcode() == ISD::SIGN_EXTEND &&
23419 LHS->getOperand(0)->getValueType(0) == N->getValueType(0)) {
23420 // setcc_merge_zero(
23421 // pred, extend(setcc_merge_zero(pred, ...)), != splat(0))
23422 // => setcc_merge_zero(pred, ...)
23423 if (LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
23424 LHS->getOperand(0)->getOperand(0) == Pred)
23425 return LHS->getOperand(0);
23426
23427 // setcc_merge_zero(
23428 // all_active, extend(nxvNi1 ...), != splat(0))
23429 // -> nxvNi1 ...
23430 if (isAllActivePredicate(DAG, Pred))
23431 return LHS->getOperand(0);
23432
23433 // setcc_merge_zero(
23434 // pred, extend(nxvNi1 ...), != splat(0))
23435 // -> nxvNi1 and(pred, ...)
23436 if (DCI.isAfterLegalizeDAG())
23437 // Do this after legalization to allow more folds on setcc_merge_zero
23438 // to be recognized.
23439 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0),
23440 LHS->getOperand(0), Pred);
23441 }
23442
23443 return SDValue();
23444}
23445
23446// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
23447// as well as whether the test should be inverted. This code is required to
23448// catch these cases (as opposed to standard dag combines) because
23449// AArch64ISD::TBZ is matched during legalization.
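//
// A couple of illustrative instances of the folds below (bit numbers chosen
// arbitrarily): (tbz (shl x, 2), 3) becomes (tbz x, 1), and
// (tbz (xor x, -1), 5) becomes (tbnz x, 5).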
23450static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
23451 SelectionDAG &DAG) {
23452
23453 if (!Op->hasOneUse())
23454 return Op;
23455
23456 // We don't handle undef/constant-fold cases below, as they should have
23457 // already been taken care of (e.g. and of 0, test of undefined shifted bits,
23458 // etc.)
23459
23460 // (tbz (trunc x), b) -> (tbz x, b)
23461 // This case is just here to enable more of the below cases to be caught.
23462 if (Op->getOpcode() == ISD::TRUNCATE &&
23463 Bit < Op->getValueType(0).getSizeInBits()) {
23464 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23465 }
23466
23467 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
23468 if (Op->getOpcode() == ISD::ANY_EXTEND &&
23469 Bit < Op->getOperand(0).getValueSizeInBits()) {
23470 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23471 }
23472
23473 if (Op->getNumOperands() != 2)
23474 return Op;
23475
23476 auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
23477 if (!C)
23478 return Op;
23479
23480 switch (Op->getOpcode()) {
23481 default:
23482 return Op;
23483
23484 // (tbz (and x, m), b) -> (tbz x, b)
23485 case ISD::AND:
23486 if ((C->getZExtValue() >> Bit) & 1)
23487 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23488 return Op;
23489
23490 // (tbz (shl x, c), b) -> (tbz x, b-c)
23491 case ISD::SHL:
23492 if (C->getZExtValue() <= Bit &&
23493 (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
23494 Bit = Bit - C->getZExtValue();
23495 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23496 }
23497 return Op;
23498
23499 // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
23500 case ISD::SRA:
23501 Bit = Bit + C->getZExtValue();
23502 if (Bit >= Op->getValueType(0).getSizeInBits())
23503 Bit = Op->getValueType(0).getSizeInBits() - 1;
23504 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23505
23506 // (tbz (srl x, c), b) -> (tbz x, b+c)
23507 case ISD::SRL:
23508 if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
23509 Bit = Bit + C->getZExtValue();
23510 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23511 }
23512 return Op;
23513
23514 // (tbz (xor x, -1), b) -> (tbnz x, b)
23515 case ISD::XOR:
23516 if ((C->getZExtValue() >> Bit) & 1)
23517 Invert = !Invert;
23518 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
23519 }
23520}
23521
23522// Optimize test single bit zero/non-zero and branch.
23523static SDValue performTBZCombine(SDNode *N,
23524 TargetLowering::DAGCombinerInfo &DCI,
23525 SelectionDAG &DAG) {
23526 unsigned Bit = N->getConstantOperandVal(2);
23527 bool Invert = false;
23528 SDValue TestSrc = N->getOperand(1);
23529 SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
23530
23531 if (TestSrc == NewTestSrc)
23532 return SDValue();
23533
23534 unsigned NewOpc = N->getOpcode();
23535 if (Invert) {
23536 if (NewOpc == AArch64ISD::TBZ)
23537 NewOpc = AArch64ISD::TBNZ;
23538 else {
23539 assert(NewOpc == AArch64ISD::TBNZ);
23540 NewOpc = AArch64ISD::TBZ;
23541 }
23542 }
23543
23544 SDLoc DL(N);
23545 return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
23546 DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
23547}
23548
23549// Swap vselect operands where it may allow a predicated operation to achieve
23550// the `sel`.
23551//
23552// (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b)))
23553// => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
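//
// Rough intent of the swap: with the operands exchanged, the floating-point op
// (fadd/fsub/fmul below) computing (op a b) can later be selected as a
// predicated SVE instruction whose inactive lanes already hold 'a', so no
// separate select is needed.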
23554static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) {
23555 auto SelectA = N->getOperand(1);
23556 auto SelectB = N->getOperand(2);
23557 auto NTy = N->getValueType(0);
23558
23559 if (!NTy.isScalableVector())
23560 return SDValue();
23561 SDValue SetCC = N->getOperand(0);
23562 if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse())
23563 return SDValue();
23564
23565 switch (SelectB.getOpcode()) {
23566 default:
23567 return SDValue();
23568 case ISD::FMUL:
23569 case ISD::FSUB:
23570 case ISD::FADD:
23571 break;
23572 }
23573 if (SelectA != SelectB.getOperand(0))
23574 return SDValue();
23575
23576 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
23577 ISD::CondCode InverseCC =
23578 ISD::getSetCCInverse(CC, SetCC.getOperand(0).getValueType());
23579 auto InverseSetCC =
23580 DAG.getSetCC(SDLoc(SetCC), SetCC.getValueType(), SetCC.getOperand(0),
23581 SetCC.getOperand(1), InverseCC);
23582
23583 return DAG.getNode(ISD::VSELECT, SDLoc(N), NTy,
23584 {InverseSetCC, SelectB, SelectA});
23585}
23586
23587// vselect (v1i1 setcc) ->
23588// vselect (v1iXX setcc) (XX is the size of the compared operand type)
23589// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
23590// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
23591// such VSELECT.
23592static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
23593 if (auto SwapResult = trySwapVSelectOperands(N, DAG))
23594 return SwapResult;
23595
23596 SDValue N0 = N->getOperand(0);
23597 EVT CCVT = N0.getValueType();
23598
23599 if (isAllActivePredicate(DAG, N0))
23600 return N->getOperand(1);
23601
23602 if (isAllInactivePredicate(N0))
23603 return N->getOperand(2);
23604
23605 // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform
23606 // into (OR (ASR lhs, N-1), 1), which requires less instructions for the
23607 // supported types.
23608 SDValue SetCC = N->getOperand(0);
23609 if (SetCC.getOpcode() == ISD::SETCC &&
23610 SetCC.getOperand(2) == DAG.getCondCode(ISD::SETGT)) {
23611 SDValue CmpLHS = SetCC.getOperand(0);
23612 EVT VT = CmpLHS.getValueType();
23613 SDNode *CmpRHS = SetCC.getOperand(1).getNode();
23614 SDNode *SplatLHS = N->getOperand(1).getNode();
23615 SDNode *SplatRHS = N->getOperand(2).getNode();
23616 APInt SplatLHSVal;
23617 if (CmpLHS.getValueType() == N->getOperand(1).getValueType() &&
23618 VT.isSimple() &&
23619 is_contained(ArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
23620 MVT::v2i32, MVT::v4i32, MVT::v2i64}),
23621 VT.getSimpleVT().SimpleTy) &&
23622 ISD::isConstantSplatVector(SplatLHS, SplatLHSVal) &&
23623 SplatLHSVal.isOne() && ISD::isConstantSplatVectorAllOnes(CmpRHS) &&
23624 ISD::isConstantSplatVectorAllOnes(SplatRHS)) {
23625 unsigned NumElts = VT.getVectorNumElements();
23626 SmallVector<SDValue> Ops(
23627 NumElts, DAG.getConstant(VT.getScalarSizeInBits() - 1, SDLoc(N),
23628 VT.getScalarType()));
23629 SDValue Val = DAG.getBuildVector(VT, SDLoc(N), Ops);
23630
23631 auto Shift = DAG.getNode(ISD::SRA, SDLoc(N), VT, CmpLHS, Val);
23632 auto Or = DAG.getNode(ISD::OR, SDLoc(N), VT, Shift, N->getOperand(1));
23633 return Or;
23634 }
23635 }
23636
23637 EVT CmpVT = N0.getOperand(0).getValueType();
23638 if (N0.getOpcode() != ISD::SETCC ||
23639 CCVT.getVectorElementCount() != ElementCount::getFixed(1) ||
23640 CCVT.getVectorElementType() != MVT::i1 ||
23641 CmpVT.getVectorElementType().isFloatingPoint())
23642 return SDValue();
23643
23644 EVT ResVT = N->getValueType(0);
23645 // Only combine when the result type is of the same size as the compared
23646 // operands.
23647 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
23648 return SDValue();
23649
23650 SDValue IfTrue = N->getOperand(1);
23651 SDValue IfFalse = N->getOperand(2);
23652 SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
23653 N0.getOperand(0), N0.getOperand(1),
23654 cast<CondCodeSDNode>(N0.getOperand(2))->get());
23655 return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
23656 IfTrue, IfFalse);
23657}
23658
23659/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
23660/// the compare-mask instructions rather than going via NZCV, even if LHS and
23661/// RHS are really scalar. This replaces any scalar setcc in the above pattern
23662/// with a vector one followed by a DUP shuffle on the result.
23663static SDValue performSelectCombine(SDNode *N,
23664 TargetLowering::DAGCombinerInfo &DCI) {
23665 SelectionDAG &DAG = DCI.DAG;
23666 SDValue N0 = N->getOperand(0);
23667 EVT ResVT = N->getValueType(0);
23668
23669 if (N0.getOpcode() != ISD::SETCC)
23670 return SDValue();
23671
23672 if (ResVT.isScalableVT())
23673 return SDValue();
23674
23675 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
23676 // scalar SetCCResultType. We also don't expect vectors, because we assume
23677 // that selects fed by vector SETCCs are canonicalized to VSELECT.
23678 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
23679 "Scalar-SETCC feeding SELECT has unexpected result type!");
23680
23681 // If NumMaskElts == 0, the comparison is larger than select result. The
23682 // largest real NEON comparison is 64-bits per lane, which means the result is
23683 // at most 32-bits and an illegal vector. Just bail out for now.
23684 EVT SrcVT = N0.getOperand(0).getValueType();
23685
23686 // Don't try to do this optimization when the setcc itself has i1 operands.
23687 // There are no legal vectors of i1, so this would be pointless. v1f16 is
23688 // ruled out to prevent the creation of setcc that need to be scalarized.
23689 if (SrcVT == MVT::i1 ||
23690 (SrcVT.isFloatingPoint() && SrcVT.getSizeInBits() <= 16))
23691 return SDValue();
23692
23693 int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
23694 if (!ResVT.isVector() || NumMaskElts == 0)
23695 return SDValue();
23696
23697 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
23698 EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
23699
23700 // Also bail out if the vector CCVT isn't the same size as ResVT.
23701 // This can happen if the SETCC operand size doesn't divide the ResVT size
23702 // (e.g., f64 vs v3f32).
23703 if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
23704 return SDValue();
23705
23706 // Make sure we didn't create illegal types, if we're not supposed to.
23707 assert(DCI.isBeforeLegalize() ||
23708 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
23709
23710 // First perform a vector comparison, where lane 0 is the one we're interested
23711 // in.
23712 SDLoc DL(N0);
23713 SDValue LHS =
23714 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
23715 SDValue RHS =
23716 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
23717 SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
23718
23719 // Now duplicate the comparison mask we want across all other lanes.
23720 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
23721 SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
23722 Mask = DAG.getNode(ISD::BITCAST, DL,
23723 ResVT.changeVectorElementTypeToInteger(), Mask);
23724
23725 return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
23726}
23727
23728static SDValue performDUPCombine(SDNode *N,
23729 TargetLowering::DAGCombinerInfo &DCI) {
23730 EVT VT = N->getValueType(0);
23731 SDLoc DL(N);
23732 // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
23733 // 128bit vector version.
23734 if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
23735 EVT LVT = VT.getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
23736 SmallVector<SDValue> Ops(N->ops());
23737 if (SDNode *LN = DCI.DAG.getNodeIfExists(N->getOpcode(),
23738 DCI.DAG.getVTList(LVT), Ops)) {
23739 return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
23740 DCI.DAG.getConstant(0, DL, MVT::i64));
23741 }
23742 }
23743
23744 if (N->getOpcode() == AArch64ISD::DUP) {
23745 if (DCI.isAfterLegalizeDAG()) {
23746 // If scalar dup's operand is extract_vector_elt, try to combine them into
23747 // duplane. For example,
23748 //
23749 // t21: i32 = extract_vector_elt t19, Constant:i64<0>
23750 // t18: v4i32 = AArch64ISD::DUP t21
23751 // ==>
23752 // t22: v4i32 = AArch64ISD::DUPLANE32 t19, Constant:i64<0>
23753 SDValue EXTRACT_VEC_ELT = N->getOperand(0);
23754 if (EXTRACT_VEC_ELT.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
23755 if (VT == EXTRACT_VEC_ELT.getOperand(0).getValueType()) {
23756 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
23757 return DCI.DAG.getNode(Opcode, DL, VT, EXTRACT_VEC_ELT.getOperand(0),
23758 EXTRACT_VEC_ELT.getOperand(1));
23759 }
23760 }
23761 }
23762
23763 return performPostLD1Combine(N, DCI, false);
23764 }
23765
23766 return SDValue();
23767}
23768
23769/// Get rid of unnecessary NVCASTs (that don't change the type).
23770static SDValue performNVCASTCombine(SDNode *N, SelectionDAG &DAG) {
23771 if (N->getValueType(0) == N->getOperand(0).getValueType())
23772 return N->getOperand(0);
23773 if (N->getOperand(0).getOpcode() == AArch64ISD::NVCAST)
23774 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(N), N->getValueType(0),
23775 N->getOperand(0).getOperand(0));
23776
23777 return SDValue();
23778}
23779
23780// If all users of the globaladdr are of the form (globaladdr + constant), find
23781// the smallest constant, fold it into the globaladdr's offset and rewrite the
23782// globaladdr as (globaladdr + constant) - constant.
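//
// A rough example with invented offsets: if the only uses are (G + 8) and
// (G + 12), G is rewritten as ((G + 8) - 8), after which the uses fold to
// (G + 8) and ((G + 8) + 4), letting the +8 be folded into the relocation
// addend once.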
23783static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
23784 const AArch64Subtarget *Subtarget,
23785 const TargetMachine &TM) {
23786 auto *GN = cast<GlobalAddressSDNode>(N);
23787 if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
23788 AArch64II::MO_NO_FLAG)
23789 return SDValue();
23790
23791 uint64_t MinOffset = -1ull;
23792 for (SDNode *N : GN->uses()) {
23793 if (N->getOpcode() != ISD::ADD)
23794 return SDValue();
23795 auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
23796 if (!C)
23797 C = dyn_cast<ConstantSDNode>(N->getOperand(1));
23798 if (!C)
23799 return SDValue();
23800 MinOffset = std::min(MinOffset, C->getZExtValue());
23801 }
23802 uint64_t Offset = MinOffset + GN->getOffset();
23803
23804 // Require that the new offset is larger than the existing one. Otherwise, we
23805 // can end up oscillating between two possible DAGs, for example,
23806 // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
23807 if (Offset <= uint64_t(GN->getOffset()))
23808 return SDValue();
23809
23810 // Check whether folding this offset is legal. It must not go out of bounds of
23811 // the referenced object to avoid violating the code model, and must be
23812 // smaller than 2^20 because this is the largest offset expressible in all
23813 // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
23814 // stores an immediate signed 21 bit offset.)
23815 //
23816 // This check also prevents us from folding negative offsets, which will end
23817 // up being treated in the same way as large positive ones. They could also
23818 // cause code model violations, and aren't really common enough to matter.
23819 if (Offset >= (1 << 20))
23820 return SDValue();
23821
23822 const GlobalValue *GV = GN->getGlobal();
23823 Type *T = GV->getValueType();
23824 if (!T->isSized() ||
23826 return SDValue();
23827
23828 SDLoc DL(GN);
23829 SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
23830 return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
23831 DAG.getConstant(MinOffset, DL, MVT::i64));
23832}
23833
23834static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG,
23835 const AArch64Subtarget *Subtarget) {
23836 SDValue BR = N->getOperand(0);
23837 if (!Subtarget->hasCSSC() || BR.getOpcode() != ISD::BITREVERSE ||
23838 !BR.getValueType().isScalarInteger())
23839 return SDValue();
23840
23841 SDLoc DL(N);
23842 return DAG.getNode(ISD::CTTZ, DL, BR.getValueType(), BR.getOperand(0));
23843}
23844
23845// Turns the vector of indices into a vector of byte offsets by scaling Offset
23846// by (BitWidth / 8).
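//
// For example (values picked for illustration), with 32-bit elements the
// indices are shifted left by log2(32 / 8) == 2, so an index of 3 becomes a
// byte offset of 12.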
23847static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
23848 SDLoc DL, unsigned BitWidth) {
23849 assert(Offset.getValueType().isScalableVector() &&
23850 "This method is only for scalable vectors of offsets");
23851
23852 SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
23853 SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
23854
23855 return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
23856}
23857
23858/// Check if the value of \p OffsetInBytes can be used as an immediate for
23859/// the gather load/prefetch and scatter store instructions with vector base and
23860/// immediate offset addressing mode:
23861///
23862/// [<Zn>.[S|D]{, #<imm>}]
23863///
23864/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
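///
/// For example (purely illustrative), for doubleword elements
/// (ScalarSizeInBytes == 8) the accepted immediates are 0, 8, 16, ..., 248.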
23865inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
23866 unsigned ScalarSizeInBytes) {
23867 // The immediate is not a multiple of the scalar size.
23868 if (OffsetInBytes % ScalarSizeInBytes)
23869 return false;
23870
23871 // The immediate is out of range.
23872 if (OffsetInBytes / ScalarSizeInBytes > 31)
23873 return false;
23874
23875 return true;
23876}
23877
23878/// Check if the value of \p Offset represents a valid immediate for the SVE
23879/// gather load/prefetch and scatter store instructions with vector base and
23880/// immediate offset addressing mode:
23881///
23882/// [<Zn>.[S|D]{, #<imm>}]
23883///
23884/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
23885static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
23886 unsigned ScalarSizeInBytes) {
23887 ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
23888 return OffsetConst && isValidImmForSVEVecImmAddrMode(
23889 OffsetConst->getZExtValue(), ScalarSizeInBytes);
23890}
23891
23892static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
23893 unsigned Opcode,
23894 bool OnlyPackedOffsets = true) {
23895 const SDValue Src = N->getOperand(2);
23896 const EVT SrcVT = Src->getValueType(0);
23897 assert(SrcVT.isScalableVector() &&
23898 "Scatter stores are only possible for SVE vectors");
23899
23900 SDLoc DL(N);
23901 MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
23902
23903 // Make sure that source data will fit into an SVE register
23905 return SDValue();
23906
23907 // For FPs, ACLE only supports _packed_ single and double precision types.
23908 // SST1Q_[INDEX_]PRED is the ST1Q for sve2p1 and should allow all sizes.
23909 if (SrcElVT.isFloatingPoint())
23910 if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64) &&
23911 ((Opcode != AArch64ISD::SST1Q_PRED &&
23912 Opcode != AArch64ISD::SST1Q_INDEX_PRED) ||
23913 ((SrcVT != MVT::nxv8f16) && (SrcVT != MVT::nxv8bf16))))
23914 return SDValue();
23915
23916 // Depending on the addressing mode, this is either a pointer or a vector of
23917 // pointers (that fits into one register)
23918 SDValue Base = N->getOperand(4);
23919 // Depending on the addressing mode, this is either a single offset or a
23920 // vector of offsets (that fits into one register)
23921 SDValue Offset = N->getOperand(5);
23922
23923 // For "scalar + vector of indices", just scale the indices. This only
23924 // applies to non-temporal scatters because there's no instruction that takes
23925 // indices.
23926 if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
23927 Offset =
23929 Opcode = AArch64ISD::SSTNT1_PRED;
23930 } else if (Opcode == AArch64ISD::SST1Q_INDEX_PRED) {
23931 Offset =
23933 Opcode = AArch64ISD::SST1Q_PRED;
23934 }
23935
23936 // In the case of non-temporal scatter stores there's only one SVE instruction
23937 // per data-size: "scalar + vector", i.e.
23938 // * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
23939 // Since we do have intrinsics that allow the arguments to be in a different
23940 // order, we may need to swap them to match the spec.
23941 if ((Opcode == AArch64ISD::SSTNT1_PRED || Opcode == AArch64ISD::SST1Q_PRED) &&
23942 Offset.getValueType().isVector())
23943 std::swap(Base, Offset);
23944
23945 // SST1_IMM requires that the offset is an immediate that is:
23946 // * a multiple of #SizeInBytes,
23947 // * in the range [0, 31 x #SizeInBytes],
23948 // where #SizeInBytes is the size in bytes of the stored items. For
23949 // immediates outside that range and non-immediate scalar offsets use SST1 or
23950 // SST1_UXTW instead.
23951 if (Opcode == AArch64ISD::SST1_IMM_PRED) {
23952 if (!isValidImmForSVEVecImmAddrMode(Offset,
23953 SrcVT.getScalarSizeInBits() / 8)) {
23954 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
23955 Opcode = AArch64ISD::SST1_UXTW_PRED;
23956 else
23957 Opcode = AArch64ISD::SST1_PRED;
23958
23959 std::swap(Base, Offset);
23960 }
23961 }
23962
23963 auto &TLI = DAG.getTargetLoweringInfo();
23964 if (!TLI.isTypeLegal(Base.getValueType()))
23965 return SDValue();
23966
23967 // Some scatter store variants allow unpacked offsets, but only as nxv2i32
23968 // vectors. These are implicitly sign (sxtw) or zero (zxtw) extend to
23969 // nxv2i64. Legalize accordingly.
23970 if (!OnlyPackedOffsets &&
23971 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
23972 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
23973
23974 if (!TLI.isTypeLegal(Offset.getValueType()))
23975 return SDValue();
23976
23977 // Source value type that is representable in hardware
23978 EVT HwSrcVt = getSVEContainerType(SrcVT);
23979
23980 // Keep the original type of the input data to store - this is needed to be
23981 // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
23982 // FP values we want the integer equivalent, so just use HwSrcVt.
23983 SDValue InputVT = DAG.getValueType(SrcVT);
23984 if (SrcVT.isFloatingPoint())
23985 InputVT = DAG.getValueType(HwSrcVt);
23986
23987 SDVTList VTs = DAG.getVTList(MVT::Other);
23988 SDValue SrcNew;
23989
23990 if (Src.getValueType().isFloatingPoint())
23991 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
23992 else
23993 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);
23994
23995 SDValue Ops[] = {N->getOperand(0), // Chain
23996 SrcNew,
23997 N->getOperand(3), // Pg
23998 Base,
23999 Offset,
24000 InputVT};
24001
24002 return DAG.getNode(Opcode, DL, VTs, Ops);
24003}
24004
24006 unsigned Opcode,
24007 bool OnlyPackedOffsets = true) {
24008 const EVT RetVT = N->getValueType(0);
24009 assert(RetVT.isScalableVector() &&
24010 "Gather loads are only possible for SVE vectors");
24011
24012 SDLoc DL(N);
24013
24014 // Make sure that the loaded data will fit into an SVE register
24016 return SDValue();
24017
24018 // Depending on the addressing mode, this is either a pointer or a vector of
24019 // pointers (that fits into one register)
24020 SDValue Base = N->getOperand(3);
24021 // Depending on the addressing mode, this is either a single offset or a
24022 // vector of offsets (that fits into one register)
24023 SDValue Offset = N->getOperand(4);
24024
24025 // For "scalar + vector of indices", scale the indices to obtain unscaled
24026 // offsets. This applies to non-temporal and quadword gathers, which do not
24027 // have an addressing mode with scaled offset.
24030 RetVT.getScalarSizeInBits());
24032 } else if (Opcode == AArch64ISD::GLD1Q_INDEX_MERGE_ZERO) {
24034 RetVT.getScalarSizeInBits());
24036 }
24037
24038 // In the case of non-temporal gather loads and quadword gather loads there's
24039 // only one addressing mode: "vector + scalar", e.g.
24040 // ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
24041 // Since we do have intrinsics that allow the arguments to be in a different
24042 // order, we may need to swap them to match the spec.
24043 if ((Opcode == AArch64ISD::GLDNT1_MERGE_ZERO ||
24044 Opcode == AArch64ISD::GLD1Q_MERGE_ZERO) &&
24045 Offset.getValueType().isVector())
24046 std::swap(Base, Offset);
24047
24048 // GLD{FF}1_IMM requires that the offset is an immediate that is:
24049 // * a multiple of #SizeInBytes,
24050 // * in the range [0, 31 x #SizeInBytes],
24051 // where #SizeInBytes is the size in bytes of the loaded items. For
24052 // immediates outside that range and non-immediate scalar offsets use
24053 // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
24054 if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
24055 Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) {
24056 if (!isValidImmForSVEVecImmAddrMode(Offset,
24057 RetVT.getScalarSizeInBits() / 8)) {
24058 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
24059 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
24060 ? AArch64ISD::GLD1_UXTW_MERGE_ZERO
24061 : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO;
24062 else
24063 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
24064 ? AArch64ISD::GLD1_MERGE_ZERO
24065 : AArch64ISD::GLDFF1_MERGE_ZERO;
24066
24067 std::swap(Base, Offset);
24068 }
24069 }
24070
24071 auto &TLI = DAG.getTargetLoweringInfo();
24072 if (!TLI.isTypeLegal(Base.getValueType()))
24073 return SDValue();
24074
24075 // Some gather load variants allow unpacked offsets, but only as nxv2i32
24076 // vectors. These are implicitly sign (sxtw) or zero (zxtw) extend to
24077 // nxv2i64. Legalize accordingly.
24078 if (!OnlyPackedOffsets &&
24079 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
24080 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
24081
24082 // Return value type that is representable in hardware
24083 EVT HwRetVt = getSVEContainerType(RetVT);
24084
24085 // Keep the original output value type around - this is needed to be able to
24086 // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
24087 // values we want the integer equivalent, so just use HwRetVT.
24088 SDValue OutVT = DAG.getValueType(RetVT);
24089 if (RetVT.isFloatingPoint())
24090 OutVT = DAG.getValueType(HwRetVt);
24091
24092 SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
24093 SDValue Ops[] = {N->getOperand(0), // Chain
24094 N->getOperand(2), // Pg
24095 Base, Offset, OutVT};
24096
24097 SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
24098 SDValue LoadChain = SDValue(Load.getNode(), 1);
24099
24100 if (RetVT.isInteger() && (RetVT != HwRetVt))
24101 Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
24102
24103 // If the original return value was FP, bitcast accordingly. Doing it here
24104 // means that we can avoid adding TableGen patterns for FPs.
24105 if (RetVT.isFloatingPoint())
24106 Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
24107
24108 return DAG.getMergeValues({Load, LoadChain}, DL);
24109}
24110
24111static SDValue
24112performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
24113 SelectionDAG &DAG) {
24114 SDLoc DL(N);
24115 SDValue Src = N->getOperand(0);
24116 unsigned Opc = Src->getOpcode();
24117
24118 // Sign extend of an unsigned unpack -> signed unpack
24119 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
24120
24121 unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
24123
24124 // Push the sign extend to the operand of the unpack
24125 // This is necessary where, for example, the operand of the unpack
24126 // is another unpack:
24127 // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
24128 // ->
24129 // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
24130 // ->
24131 // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
24132 SDValue ExtOp = Src->getOperand(0);
24133 auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
24134 EVT EltTy = VT.getVectorElementType();
24135 (void)EltTy;
24136
24137 assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
24138 "Sign extending from an invalid type");
24139
24140 EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
24141
24143 ExtOp, DAG.getValueType(ExtVT));
24144
24145 return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
24146 }
24147
24148 if (DCI.isBeforeLegalizeOps())
24149 return SDValue();
24150
24151 if (!EnableCombineMGatherIntrinsics)
24152 return SDValue();
24153
24154 // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
24155 // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
24156 unsigned NewOpc;
24157 unsigned MemVTOpNum = 4;
24158 switch (Opc) {
24161 MemVTOpNum = 3;
24162 break;
24165 MemVTOpNum = 3;
24166 break;
24169 MemVTOpNum = 3;
24170 break;
24173 break;
24176 break;
24179 break;
24182 break;
24185 break;
24188 break;
24191 break;
24194 break;
24197 break;
24200 break;
24203 break;
24206 break;
24209 break;
24212 break;
24215 break;
24216 default:
24217 return SDValue();
24218 }
24219
24220 EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
24221 EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();
24222
24223 if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
24224 return SDValue();
24225
24226 EVT DstVT = N->getValueType(0);
24227 SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
24228
24230 for (unsigned I = 0; I < Src->getNumOperands(); ++I)
24231 Ops.push_back(Src->getOperand(I));
24232
24233 SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
24234 DCI.CombineTo(N, ExtLoad);
24235 DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));
24236
24237 // Return N so it doesn't get rechecked
24238 return SDValue(N, 0);
24239}
24240
24241/// Legalize the gather prefetch (scalar + vector addressing mode) when the
24242/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
24243/// != nxv2i32) do not need legalization.
24245 const unsigned OffsetPos = 4;
24246 SDValue Offset = N->getOperand(OffsetPos);
24247
24248 // Not an unpacked vector, bail out.
24249 if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
24250 return SDValue();
24251
24252 // Extend the unpacked offset vector to 64-bit lanes.
24253 SDLoc DL(N);
24254 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
24255 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
24256 // Replace the offset operand with the 64-bit one.
24257 Ops[OffsetPos] = Offset;
24258
24259 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
24260}
24261
24262/// Combines a node carrying the intrinsic
24263/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
24264/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
24265/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
24266/// sve gather prefetch instruction with vector plus immediate addressing mode.
24268 unsigned ScalarSizeInBytes) {
24269 const unsigned ImmPos = 4, OffsetPos = 3;
24270 // No need to combine the node if the immediate is valid...
24271 if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
24272 return SDValue();
24273
24274 // ...otherwise swap the offset base with the offset...
24275 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
24276 std::swap(Ops[ImmPos], Ops[OffsetPos]);
24277 // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
24278 // `aarch64_sve_prfb_gather_uxtw_index`.
24279 SDLoc DL(N);
24280 Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
24281 MVT::i64);
24282
24283 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
24284}
24285
24286// Return true if the vector operation can guarantee only the first lane of its
24287// result contains data, with all bits in other lanes set to zero.
24289 switch (Op.getOpcode()) {
24290 default:
24291 return false;
24307 return true;
24308 }
24309}
24310
24311static SDValue removeRedundantInsertVectorElt(SDNode *N) {
24312 assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
24313 SDValue InsertVec = N->getOperand(0);
24314 SDValue InsertElt = N->getOperand(1);
24315 SDValue InsertIdx = N->getOperand(2);
24316
24317 // We only care about inserts into the first element...
24318 if (!isNullConstant(InsertIdx))
24319 return SDValue();
24320 // ...of a zero'd vector...
24322 return SDValue();
24323 // ...where the inserted data was previously extracted...
24324 if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
24325 return SDValue();
24326
24327 SDValue ExtractVec = InsertElt.getOperand(0);
24328 SDValue ExtractIdx = InsertElt.getOperand(1);
24329
24330 // ...from the first element of a vector.
24331 if (!isNullConstant(ExtractIdx))
24332 return SDValue();
24333
24334 // If we get here we are effectively trying to zero lanes 1-N of a vector.
24335
24336 // Ensure there's no type conversion going on.
24337 if (N->getValueType(0) != ExtractVec.getValueType())
24338 return SDValue();
24339
24340 if (!isLanes1toNKnownZero(ExtractVec))
24341 return SDValue();
24342
24343 // The explicit zeroing is redundant.
24344 return ExtractVec;
24345}
24346
24347static SDValue
24348performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
24349 if (SDValue Res = removeRedundantInsertVectorElt(N))
24350 return Res;
24351
24352 return performPostLD1Combine(N, DCI, true);
24353}
24354
24355static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG,
24356 TargetLowering::DAGCombinerInfo &DCI,
24357 const AArch64Subtarget *Subtarget) {
24358 SDValue N0 = N->getOperand(0);
24359 EVT VT = N->getValueType(0);
24360
24361 // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded.
24362 if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::FP_ROUND)
24363 return SDValue();
24364
24365 auto hasValidElementTypeForFPExtLoad = [](EVT VT) {
24366 EVT EltVT = VT.getVectorElementType();
24367 return EltVT == MVT::f32 || EltVT == MVT::f64;
24368 };
24369
24370 // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
24371 // We purposefully don't care about legality of the nodes here as we know
24372 // they can be split down into something legal.
24373 if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) &&
24374 N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
24375 VT.isFixedLengthVector() && hasValidElementTypeForFPExtLoad(VT) &&
24376 VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
24377 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
24378 SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
24379 LN0->getChain(), LN0->getBasePtr(),
24380 N0.getValueType(), LN0->getMemOperand());
24381 DCI.CombineTo(N, ExtLoad);
24382 DCI.CombineTo(
24383 N0.getNode(),
24384 DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), ExtLoad,
24385 DAG.getIntPtrConstant(1, SDLoc(N0), /*isTarget=*/true)),
24386 ExtLoad.getValue(1));
24387 return SDValue(N, 0); // Return N so it doesn't get rechecked!
24388 }
24389
24390 return SDValue();
24391}
24392
24393static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG,
24394 const AArch64Subtarget *Subtarget) {
24395 EVT VT = N->getValueType(0);
24396
24397 // Don't expand for NEON, SVE2 or SME
24398 if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME())
24399 return SDValue();
24400
24401 SDLoc DL(N);
24402
24403 SDValue Mask = N->getOperand(0);
24404 SDValue In1 = N->getOperand(1);
24405 SDValue In2 = N->getOperand(2);
24406
24407 SDValue InvMask = DAG.getNOT(DL, Mask, VT);
24408 SDValue Sel = DAG.getNode(ISD::AND, DL, VT, Mask, In1);
24409 SDValue SelInv = DAG.getNode(ISD::AND, DL, VT, InvMask, In2);
24410 return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
24411}
24412
24413static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG) {
24414 EVT VT = N->getValueType(0);
24415
24416 SDValue Insert = N->getOperand(0);
24417 if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
24418 return SDValue();
24419
24420 if (!Insert.getOperand(0).isUndef())
24421 return SDValue();
24422
24423 uint64_t IdxInsert = Insert.getConstantOperandVal(2);
24424 uint64_t IdxDupLane = N->getConstantOperandVal(1);
24425 if (IdxInsert != 0 || IdxDupLane != 0)
24426 return SDValue();
24427
24428 SDValue Bitcast = Insert.getOperand(1);
24429 if (Bitcast.getOpcode() != ISD::BITCAST)
24430 return SDValue();
24431
24432 SDValue Subvec = Bitcast.getOperand(0);
24433 EVT SubvecVT = Subvec.getValueType();
24434 if (!SubvecVT.is128BitVector())
24435 return SDValue();
24436 EVT NewSubvecVT =
24438
24439 SDLoc DL(N);
24440 SDValue NewInsert =
24441 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT,
24442 DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2));
24443 SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT,
24444 NewInsert, N->getOperand(1));
24445 return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
24446}
24447
24448// Try to combine mull with uzp1.
24449 static SDValue tryCombineMULLWithUZP1(SDNode *N,
24450                                       TargetLowering::DAGCombinerInfo &DCI,
24451                                       SelectionDAG &DAG) {
24452 if (DCI.isBeforeLegalizeOps())
24453 return SDValue();
24454
24455 SDValue LHS = N->getOperand(0);
24456 SDValue RHS = N->getOperand(1);
24457
24458 SDValue ExtractHigh;
24459 SDValue ExtractLow;
24460 SDValue TruncHigh;
24461 SDValue TruncLow;
24462 SDLoc DL(N);
24463
24464   // Check the operands are trunc and extract_high.
24465   if (isEssentiallyExtractHighSubvector(LHS) &&
24466       RHS.getOpcode() == ISD::TRUNCATE) {
24467     TruncHigh = RHS;
24468     if (LHS.getOpcode() == ISD::BITCAST)
24469       ExtractHigh = LHS.getOperand(0);
24470     else
24471       ExtractHigh = LHS;
24472   } else if (isEssentiallyExtractHighSubvector(RHS) &&
24473              LHS.getOpcode() == ISD::TRUNCATE) {
24474     TruncHigh = LHS;
24475     if (RHS.getOpcode() == ISD::BITCAST)
24476       ExtractHigh = RHS.getOperand(0);
24477     else
24478       ExtractHigh = RHS;
24479 } else
24480 return SDValue();
24481
24482 // If the truncate's operand is BUILD_VECTOR with DUP, do not combine the op
24483 // with uzp1.
24484 // You can see the regressions on test/CodeGen/AArch64/aarch64-smull.ll
24485 SDValue TruncHighOp = TruncHigh.getOperand(0);
24486 EVT TruncHighOpVT = TruncHighOp.getValueType();
24487 if (TruncHighOp.getOpcode() == AArch64ISD::DUP ||
24488 DAG.isSplatValue(TruncHighOp, false))
24489 return SDValue();
24490
24491 // Check there is other extract_high with same source vector.
24492 // For example,
24493 //
24494 // t18: v4i16 = extract_subvector t2, Constant:i64<0>
24495 // t12: v4i16 = truncate t11
24496 // t31: v4i32 = AArch64ISD::SMULL t18, t12
24497 // t23: v4i16 = extract_subvector t2, Constant:i64<4>
24498 // t16: v4i16 = truncate t15
24499 // t30: v4i32 = AArch64ISD::SMULL t23, t1
24500 //
24501   // This dagcombine assumes the two extract_high nodes use the same source
24502   // vector in order to detect the pair of MULLs. If they have different
24503   // source vectors, this code will not work.
24504 bool HasFoundMULLow = true;
24505 SDValue ExtractHighSrcVec = ExtractHigh.getOperand(0);
24506 if (ExtractHighSrcVec->use_size() != 2)
24507 HasFoundMULLow = false;
24508
24509 // Find ExtractLow.
24510 for (SDNode *User : ExtractHighSrcVec.getNode()->uses()) {
24511 if (User == ExtractHigh.getNode())
24512 continue;
24513
24514     if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
24515         !isNullConstant(User->getOperand(1))) {
24516       HasFoundMULLow = false;
24517 break;
24518 }
24519
24520 ExtractLow.setNode(User);
24521 }
24522
24523 if (!ExtractLow || !ExtractLow->hasOneUse())
24524 HasFoundMULLow = false;
24525
24526 // Check ExtractLow's user.
24527 if (HasFoundMULLow) {
24528 SDNode *ExtractLowUser = *ExtractLow.getNode()->use_begin();
24529 if (ExtractLowUser->getOpcode() != N->getOpcode()) {
24530 HasFoundMULLow = false;
24531 } else {
24532 if (ExtractLowUser->getOperand(0) == ExtractLow) {
24533 if (ExtractLowUser->getOperand(1).getOpcode() == ISD::TRUNCATE)
24534 TruncLow = ExtractLowUser->getOperand(1);
24535 else
24536 HasFoundMULLow = false;
24537 } else {
24538 if (ExtractLowUser->getOperand(0).getOpcode() == ISD::TRUNCATE)
24539 TruncLow = ExtractLowUser->getOperand(0);
24540 else
24541 HasFoundMULLow = false;
24542 }
24543 }
24544 }
24545
24546 // If the truncate's operand is BUILD_VECTOR with DUP, do not combine the op
24547 // with uzp1.
24548 // You can see the regressions on test/CodeGen/AArch64/aarch64-smull.ll
24549 EVT TruncHighVT = TruncHigh.getValueType();
24550 EVT UZP1VT = TruncHighVT.getDoubleNumVectorElementsVT(*DAG.getContext());
24551 SDValue TruncLowOp =
24552 HasFoundMULLow ? TruncLow.getOperand(0) : DAG.getUNDEF(UZP1VT);
24553 EVT TruncLowOpVT = TruncLowOp.getValueType();
24554 if (HasFoundMULLow && (TruncLowOp.getOpcode() == AArch64ISD::DUP ||
24555 DAG.isSplatValue(TruncLowOp, false)))
24556 return SDValue();
24557
24558 // Create uzp1, extract_high and extract_low.
24559 if (TruncHighOpVT != UZP1VT)
24560 TruncHighOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncHighOp);
24561 if (TruncLowOpVT != UZP1VT)
24562 TruncLowOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncLowOp);
24563
24564 SDValue UZP1 =
24565 DAG.getNode(AArch64ISD::UZP1, DL, UZP1VT, TruncLowOp, TruncHighOp);
24566 SDValue HighIdxCst =
24567 DAG.getConstant(TruncHighVT.getVectorNumElements(), DL, MVT::i64);
24568 SDValue NewTruncHigh =
24569 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncHighVT, UZP1, HighIdxCst);
24570 DAG.ReplaceAllUsesWith(TruncHigh, NewTruncHigh);
24571
24572 if (HasFoundMULLow) {
24573 EVT TruncLowVT = TruncLow.getValueType();
24574 SDValue NewTruncLow = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncLowVT,
24575 UZP1, ExtractLow.getOperand(1));
24576 DAG.ReplaceAllUsesWith(TruncLow, NewTruncLow);
24577 }
24578
24579 return SDValue(N, 0);
24580}
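// NOTE: illustrative sketch only, not upstream code. At the instruction level
// the rewrite above aims to turn the pair of truncates feeding a long multiply,
// e.g.
//   xtn  v2.4h, v0.4s        // TruncLow
//   xtn2 v2.8h, v1.4s        // TruncHigh
// into a single
//   uzp1 v2.8h, v0.8h, v1.8h
// whose low and high halves then feed the smull/smull2 (or umull/pmull) pair.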
24581
24582 static SDValue performMULLCombine(SDNode *N,
24583                                   TargetLowering::DAGCombinerInfo &DCI,
24584                                   SelectionDAG &DAG) {
24585   if (SDValue Val =
24586           tryCombineLongOpWithDup(Intrinsic::not_intrinsic, N, DCI, DAG))
24587     return Val;
24588
24589 if (SDValue Val = tryCombineMULLWithUZP1(N, DCI, DAG))
24590 return Val;
24591
24592 return SDValue();
24593}
24594
24595static SDValue
24596 performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
24597                              SelectionDAG &DAG) {
24598 // Let's do below transform.
24599 //
24600 // t34: v4i32 = AArch64ISD::UADDLV t2
24601 // t35: i32 = extract_vector_elt t34, Constant:i64<0>
24602 // t7: i64 = zero_extend t35
24603 // t20: v1i64 = scalar_to_vector t7
24604 // ==>
24605 // t34: v4i32 = AArch64ISD::UADDLV t2
24606 // t39: v2i32 = extract_subvector t34, Constant:i64<0>
24607 // t40: v1i64 = AArch64ISD::NVCAST t39
24608 if (DCI.isBeforeLegalizeOps())
24609 return SDValue();
24610
24611 EVT VT = N->getValueType(0);
24612 if (VT != MVT::v1i64)
24613 return SDValue();
24614
24615 SDValue ZEXT = N->getOperand(0);
24616 if (ZEXT.getOpcode() != ISD::ZERO_EXTEND || ZEXT.getValueType() != MVT::i64)
24617 return SDValue();
24618
24619 SDValue EXTRACT_VEC_ELT = ZEXT.getOperand(0);
24620 if (EXTRACT_VEC_ELT.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
24621 EXTRACT_VEC_ELT.getValueType() != MVT::i32)
24622 return SDValue();
24623
24624 if (!isNullConstant(EXTRACT_VEC_ELT.getOperand(1)))
24625 return SDValue();
24626
24627 SDValue UADDLV = EXTRACT_VEC_ELT.getOperand(0);
24628 if (UADDLV.getOpcode() != AArch64ISD::UADDLV ||
24629 UADDLV.getValueType() != MVT::v4i32 ||
24630 UADDLV.getOperand(0).getValueType() != MVT::v8i8)
24631 return SDValue();
24632
24633 // Let's generate new sequence with AArch64ISD::NVCAST.
24634 SDLoc DL(N);
24635 SDValue EXTRACT_SUBVEC =
24636 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, UADDLV,
24637 DAG.getConstant(0, DL, MVT::i64));
24638 SDValue NVCAST =
24639 DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, EXTRACT_SUBVEC);
24640
24641 return NVCAST;
24642}
24643
24644 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
24645                                                  DAGCombinerInfo &DCI) const {
24646 SelectionDAG &DAG = DCI.DAG;
24647 switch (N->getOpcode()) {
24648 default:
24649 LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
24650 break;
24651 case ISD::VECREDUCE_AND:
24652 case ISD::VECREDUCE_OR:
24653 case ISD::VECREDUCE_XOR:
24654 return performVecReduceBitwiseCombine(N, DCI, DAG);
24655 case ISD::ADD:
24656 case ISD::SUB:
24657 return performAddSubCombine(N, DCI);
24658 case ISD::BUILD_VECTOR:
24659 return performBuildVectorCombine(N, DCI, DAG);
24660 case ISD::TRUNCATE:
24661 return performTruncateCombine(N, DAG);
24662 case AArch64ISD::ANDS:
24663 return performFlagSettingCombine(N, DCI, ISD::AND);
24664 case AArch64ISD::ADC:
24665 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
24666 return R;
24667 return foldADCToCINC(N, DAG);
24668 case AArch64ISD::SBC:
24669 return foldOverflowCheck(N, DAG, /* IsAdd */ false);
24670 case AArch64ISD::ADCS:
24671 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
24672       return R;
24673     return performFlagSettingCombine(N, DCI, AArch64ISD::ADC);
24674   case AArch64ISD::SBCS:
24675     if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false))
24676       return R;
24677     return performFlagSettingCombine(N, DCI, AArch64ISD::SBC);
24678   case AArch64ISD::BICi: {
24679     APInt DemandedBits =
24680         APInt::getAllOnes(N->getValueType(0).getScalarSizeInBits());
24681     APInt DemandedElts =
24682         APInt::getAllOnes(N->getValueType(0).getVectorNumElements());
24683
24684     if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(
24685             SDValue(N, 0), DemandedBits, DemandedElts, DCI))
24686 return SDValue();
24687
24688 break;
24689 }
24690 case ISD::XOR:
24691 return performXorCombine(N, DAG, DCI, Subtarget);
24692 case ISD::MUL:
24693 return performMulCombine(N, DAG, DCI, Subtarget);
24694 case ISD::SINT_TO_FP:
24695 case ISD::UINT_TO_FP:
24696 return performIntToFpCombine(N, DAG, Subtarget);
24697 case ISD::FP_TO_SINT:
24698   case ISD::FP_TO_UINT:
24699   case ISD::FP_TO_SINT_SAT:
24700   case ISD::FP_TO_UINT_SAT:
24701     return performFpToIntCombine(N, DAG, DCI, Subtarget);
24702 case ISD::FDIV:
24703 return performFDivCombine(N, DAG, DCI, Subtarget);
24704 case ISD::OR:
24705 return performORCombine(N, DCI, Subtarget, *this);
24706 case ISD::AND:
24707 return performANDCombine(N, DCI);
24708 case ISD::FADD:
24709 return performFADDCombine(N, DCI);
24710   case ISD::INTRINSIC_WO_CHAIN:
24711     return performIntrinsicCombine(N, DCI, Subtarget);
24712 case ISD::ANY_EXTEND:
24713 case ISD::ZERO_EXTEND:
24714 case ISD::SIGN_EXTEND:
24715 return performExtendCombine(N, DCI, DAG);
24716   case ISD::SIGN_EXTEND_INREG:
24717     return performSignExtendInRegCombine(N, DCI, DAG);
24718   case ISD::CONCAT_VECTORS:
24719     return performConcatVectorsCombine(N, DCI, DAG);
24720   case ISD::EXTRACT_SUBVECTOR:
24721     return performExtractSubvectorCombine(N, DCI, DAG);
24722   case ISD::INSERT_SUBVECTOR:
24723     return performInsertSubvectorCombine(N, DCI, DAG);
24724 case ISD::SELECT:
24725 return performSelectCombine(N, DCI);
24726 case ISD::VSELECT:
24727 return performVSelectCombine(N, DCI.DAG);
24728 case ISD::SETCC:
24729 return performSETCCCombine(N, DCI, DAG);
24730 case ISD::LOAD:
24731 return performLOADCombine(N, DCI, DAG, Subtarget);
24732 case ISD::STORE:
24733 return performSTORECombine(N, DCI, DAG, Subtarget);
24734 case ISD::MSTORE:
24735 return performMSTORECombine(N, DCI, DAG, Subtarget);
24736 case ISD::MGATHER:
24737 case ISD::MSCATTER:
24738 return performMaskedGatherScatterCombine(N, DCI, DAG);
24739 case ISD::FP_EXTEND:
24740 return performFPExtendCombine(N, DAG, DCI, Subtarget);
24741 case AArch64ISD::BRCOND:
24742 return performBRCONDCombine(N, DCI, DAG);
24743 case AArch64ISD::TBNZ:
24744 case AArch64ISD::TBZ:
24745 return performTBZCombine(N, DCI, DAG);
24746 case AArch64ISD::CSEL:
24747 return performCSELCombine(N, DCI, DAG);
24748 case AArch64ISD::DUP:
24753 return performDUPCombine(N, DCI);
24755 return performDupLane128Combine(N, DAG);
24756 case AArch64ISD::NVCAST:
24757 return performNVCASTCombine(N, DAG);
24758 case AArch64ISD::SPLICE:
24759 return performSpliceCombine(N, DAG);
24760   case AArch64ISD::UUNPKLO:
24761   case AArch64ISD::UUNPKHI:
24762     return performUnpackCombine(N, DAG, Subtarget);
24763 case AArch64ISD::UZP1:
24764 case AArch64ISD::UZP2:
24765 return performUzpCombine(N, DAG, Subtarget);
24766   case AArch64ISD::SETCC_MERGE_ZERO:
24767     return performSetccMergeZeroCombine(N, DCI);
24784 return performGLD1Combine(N, DAG);
24785 case AArch64ISD::VASHR:
24786 case AArch64ISD::VLSHR:
24787 return performVectorShiftCombine(N, *this, DCI);
24788   case AArch64ISD::SUNPKLO:
24789     return performSunpkloCombine(N, DAG);
24790 case AArch64ISD::BSP:
24791 return performBSPExpandForSVE(N, DAG, Subtarget);
24792   case ISD::INSERT_VECTOR_ELT:
24793     return performInsertVectorEltCombine(N, DCI);
24794   case ISD::EXTRACT_VECTOR_ELT:
24795     return performExtractVectorEltCombine(N, DCI, Subtarget);
24796 case ISD::VECREDUCE_ADD:
24797 return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
24798 case AArch64ISD::UADDV:
24799 return performUADDVCombine(N, DAG);
24800 case AArch64ISD::SMULL:
24801 case AArch64ISD::UMULL:
24802 case AArch64ISD::PMULL:
24803 return performMULLCombine(N, DCI, DAG);
24804   case ISD::INTRINSIC_VOID:
24805   case ISD::INTRINSIC_W_CHAIN:
24806     switch (N->getConstantOperandVal(1)) {
24807 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
24808 return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
24809 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
24810 return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
24811 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
24812 return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
24813 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
24814 return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
24815 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
24816 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
24817 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
24818 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
24819 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
24820 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
24821 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
24822 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
24824 case Intrinsic::aarch64_neon_ld2:
24825 case Intrinsic::aarch64_neon_ld3:
24826 case Intrinsic::aarch64_neon_ld4:
24827 case Intrinsic::aarch64_neon_ld1x2:
24828 case Intrinsic::aarch64_neon_ld1x3:
24829 case Intrinsic::aarch64_neon_ld1x4:
24830 case Intrinsic::aarch64_neon_ld2lane:
24831 case Intrinsic::aarch64_neon_ld3lane:
24832 case Intrinsic::aarch64_neon_ld4lane:
24833 case Intrinsic::aarch64_neon_ld2r:
24834 case Intrinsic::aarch64_neon_ld3r:
24835 case Intrinsic::aarch64_neon_ld4r:
24836 case Intrinsic::aarch64_neon_st2:
24837 case Intrinsic::aarch64_neon_st3:
24838 case Intrinsic::aarch64_neon_st4:
24839 case Intrinsic::aarch64_neon_st1x2:
24840 case Intrinsic::aarch64_neon_st1x3:
24841 case Intrinsic::aarch64_neon_st1x4:
24842 case Intrinsic::aarch64_neon_st2lane:
24843 case Intrinsic::aarch64_neon_st3lane:
24844 case Intrinsic::aarch64_neon_st4lane:
24845 return performNEONPostLDSTCombine(N, DCI, DAG);
24846 case Intrinsic::aarch64_sve_ldnt1:
24847 return performLDNT1Combine(N, DAG);
24848 case Intrinsic::aarch64_sve_ld1rq:
24849 return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
24850 case Intrinsic::aarch64_sve_ld1ro:
24851 return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
24852 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
24854 case Intrinsic::aarch64_sve_ldnt1_gather:
24856 case Intrinsic::aarch64_sve_ldnt1_gather_index:
24857 return performGatherLoadCombine(N, DAG,
24859 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
24861 case Intrinsic::aarch64_sve_ld1:
24863 case Intrinsic::aarch64_sve_ldnf1:
24865 case Intrinsic::aarch64_sve_ldff1:
24867 case Intrinsic::aarch64_sve_st1:
24868 return performST1Combine(N, DAG);
24869 case Intrinsic::aarch64_sve_stnt1:
24870 return performSTNT1Combine(N, DAG);
24871 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
24873 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
24875 case Intrinsic::aarch64_sve_stnt1_scatter:
24877 case Intrinsic::aarch64_sve_stnt1_scatter_index:
24879 case Intrinsic::aarch64_sve_ld1_gather:
24881 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
24882 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
24884 case Intrinsic::aarch64_sve_ld1q_gather_index:
24885 return performGatherLoadCombine(N, DAG,
24887 case Intrinsic::aarch64_sve_ld1_gather_index:
24888 return performGatherLoadCombine(N, DAG,
24890 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
24892 /*OnlyPackedOffsets=*/false);
24893 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
24895 /*OnlyPackedOffsets=*/false);
24896 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
24897 return performGatherLoadCombine(N, DAG,
24899 /*OnlyPackedOffsets=*/false);
24900 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
24901 return performGatherLoadCombine(N, DAG,
24903 /*OnlyPackedOffsets=*/false);
24904 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
24906 case Intrinsic::aarch64_sve_ldff1_gather:
24908 case Intrinsic::aarch64_sve_ldff1_gather_index:
24909 return performGatherLoadCombine(N, DAG,
24911 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
24912 return performGatherLoadCombine(N, DAG,
24914 /*OnlyPackedOffsets=*/false);
24915 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
24916 return performGatherLoadCombine(N, DAG,
24918 /*OnlyPackedOffsets=*/false);
24919 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
24920 return performGatherLoadCombine(N, DAG,
24922 /*OnlyPackedOffsets=*/false);
24923 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
24924 return performGatherLoadCombine(N, DAG,
24926 /*OnlyPackedOffsets=*/false);
24927 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
24928 return performGatherLoadCombine(N, DAG,
24930 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
24931 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
24933 case Intrinsic::aarch64_sve_st1q_scatter_index:
24935 case Intrinsic::aarch64_sve_st1_scatter:
24937 case Intrinsic::aarch64_sve_st1_scatter_index:
24939 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
24941 /*OnlyPackedOffsets=*/false);
24942 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
24944 /*OnlyPackedOffsets=*/false);
24945 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
24946 return performScatterStoreCombine(N, DAG,
24948 /*OnlyPackedOffsets=*/false);
24949 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
24950 return performScatterStoreCombine(N, DAG,
24952 /*OnlyPackedOffsets=*/false);
24953 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
24955 case Intrinsic::aarch64_rndr:
24956 case Intrinsic::aarch64_rndrrs: {
24957 unsigned IntrinsicID = N->getConstantOperandVal(1);
24958 auto Register =
24959 (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
24960 : AArch64SysReg::RNDRRS);
24961 SDLoc DL(N);
24962 SDValue A = DAG.getNode(
24963 AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::Glue, MVT::Other),
24964 N->getOperand(0), DAG.getConstant(Register, DL, MVT::i64));
24965 SDValue B = DAG.getNode(
24966 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
24967 DAG.getConstant(0, DL, MVT::i32),
24968 DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1));
24969 return DAG.getMergeValues(
24970 {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
24971 }
24972 case Intrinsic::aarch64_sme_ldr_zt:
24973       return DAG.getNode(AArch64ISD::RESTORE_ZT, SDLoc(N),
24974                          DAG.getVTList(MVT::Other), N->getOperand(0),
24975 N->getOperand(2), N->getOperand(3));
24976 case Intrinsic::aarch64_sme_str_zt:
24977 return DAG.getNode(AArch64ISD::SAVE_ZT, SDLoc(N),
24978 DAG.getVTList(MVT::Other), N->getOperand(0),
24979 N->getOperand(2), N->getOperand(3));
24980 default:
24981 break;
24982 }
24983 break;
24984 case ISD::GlobalAddress:
24985 return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
24986 case ISD::CTLZ:
24987 return performCTLZCombine(N, DAG, Subtarget);
24988   case ISD::SCALAR_TO_VECTOR:
24989     return performScalarToVectorCombine(N, DCI, DAG);
24990 }
24991 return SDValue();
24992}
24993
24994// Check if the return value is used as only a return value, as otherwise
24995// we can't perform a tail-call. In particular, we need to check for
24996// target ISD nodes that are returns and any other "odd" constructs
24997// that the generic analysis code won't necessarily catch.
24998bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
24999 SDValue &Chain) const {
25000 if (N->getNumValues() != 1)
25001 return false;
25002 if (!N->hasNUsesOfValue(1, 0))
25003 return false;
25004
25005 SDValue TCChain = Chain;
25006 SDNode *Copy = *N->use_begin();
25007 if (Copy->getOpcode() == ISD::CopyToReg) {
25008 // If the copy has a glue operand, we conservatively assume it isn't safe to
25009 // perform a tail call.
25010 if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
25011 MVT::Glue)
25012 return false;
25013 TCChain = Copy->getOperand(0);
25014 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
25015 return false;
25016
25017 bool HasRet = false;
25018 for (SDNode *Node : Copy->uses()) {
25019 if (Node->getOpcode() != AArch64ISD::RET_GLUE)
25020 return false;
25021 HasRet = true;
25022 }
25023
25024 if (!HasRet)
25025 return false;
25026
25027 Chain = TCChain;
25028 return true;
25029}
25030
25031// Return whether an instruction can potentially be optimized to a tail
25032// call. This will cause the optimizers to attempt to move, or duplicate,
25033// return instructions to help enable tail call optimizations for this
25034// instruction.
25035bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
25036 return CI->isTailCall();
25037}
25038
25039bool AArch64TargetLowering::isIndexingLegal(MachineInstr &MI, Register Base,
25040 Register Offset, bool IsPre,
25041 MachineRegisterInfo &MRI) const {
25042 auto CstOffset = getIConstantVRegVal(Offset, MRI);
25043 if (!CstOffset || CstOffset->isZero())
25044 return false;
25045
25046 // All of the indexed addressing mode instructions take a signed 9 bit
25047 // immediate offset. Our CstOffset is a G_PTR_ADD offset so it already
25048 // encodes the sign/indexing direction.
25049 return isInt<9>(CstOffset->getSExtValue());
25050}
25051
25052bool AArch64TargetLowering::getIndexedAddressParts(SDNode *N, SDNode *Op,
25053 SDValue &Base,
25054 SDValue &Offset,
25055 SelectionDAG &DAG) const {
25056 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
25057 return false;
25058
25059 // Non-null if there is exactly one user of the loaded value (ignoring chain).
25060 SDNode *ValOnlyUser = nullptr;
25061 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE;
25062 ++UI) {
25063 if (UI.getUse().getResNo() == 1)
25064 continue; // Ignore chain.
25065 if (ValOnlyUser == nullptr)
25066 ValOnlyUser = *UI;
25067 else {
25068 ValOnlyUser = nullptr; // Multiple non-chain uses, bail out.
25069 break;
25070 }
25071 }
25072
25073 auto IsUndefOrZero = [](SDValue V) {
25074 return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
25075 };
25076
25077 // If the only user of the value is a scalable vector splat, it is
25078 // preferable to do a replicating load (ld1r*).
25079 if (ValOnlyUser && ValOnlyUser->getValueType(0).isScalableVector() &&
25080 (ValOnlyUser->getOpcode() == ISD::SPLAT_VECTOR ||
25081 (ValOnlyUser->getOpcode() == AArch64ISD::DUP_MERGE_PASSTHRU &&
25082 IsUndefOrZero(ValOnlyUser->getOperand(2)))))
25083 return false;
25084
25085 Base = Op->getOperand(0);
25086 // All of the indexed addressing mode instructions take a signed
25087 // 9 bit immediate offset.
25088 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
25089 int64_t RHSC = RHS->getSExtValue();
25090 if (Op->getOpcode() == ISD::SUB)
25091 RHSC = -(uint64_t)RHSC;
25092 if (!isInt<9>(RHSC))
25093 return false;
25094 // Always emit pre-inc/post-inc addressing mode. Use negated constant offset
25095 // when dealing with subtraction.
25096 Offset = DAG.getConstant(RHSC, SDLoc(N), RHS->getValueType(0));
25097 return true;
25098 }
25099 return false;
25100}
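// NOTE: illustrative only, not part of the original file. The signed 9-bit
// range checked above corresponds to the immediate of the pre/post-indexed
// addressing forms, e.g.
//   ldr x0, [x1, #16]!   // pre-index:  x1 += 16, then load
//   ldr x0, [x1], #16    // post-index: load, then x1 += 16
// where the offset must lie in [-256, 255]; larger offsets keep the plain
// addressing mode instead of forming an indexed load/store.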
25101
25102bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
25103 SDValue &Offset,
25105 SelectionDAG &DAG) const {
25106 EVT VT;
25107 SDValue Ptr;
25108 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
25109 VT = LD->getMemoryVT();
25110 Ptr = LD->getBasePtr();
25111 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
25112 VT = ST->getMemoryVT();
25113 Ptr = ST->getBasePtr();
25114 } else
25115 return false;
25116
25117 if (!getIndexedAddressParts(N, Ptr.getNode(), Base, Offset, DAG))
25118 return false;
25119 AM = ISD::PRE_INC;
25120 return true;
25121}
25122
25123bool AArch64TargetLowering::getPostIndexedAddressParts(
25125 ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
25126 EVT VT;
25127 SDValue Ptr;
25128 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
25129 VT = LD->getMemoryVT();
25130 Ptr = LD->getBasePtr();
25131 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
25132 VT = ST->getMemoryVT();
25133 Ptr = ST->getBasePtr();
25134 } else
25135 return false;
25136
25137 if (!getIndexedAddressParts(N, Op, Base, Offset, DAG))
25138 return false;
25139 // Post-indexing updates the base, so it's not a valid transform
25140 // if that's not the same as the load's pointer.
25141 if (Ptr != Base)
25142 return false;
25143 AM = ISD::POST_INC;
25144 return true;
25145}
25146
25147 static void replaceBoolVectorBitcast(SDNode *N,
25148                                      SmallVectorImpl<SDValue> &Results,
25149                                      SelectionDAG &DAG) {
25150 SDLoc DL(N);
25151 SDValue Op = N->getOperand(0);
25152 EVT VT = N->getValueType(0);
25153 [[maybe_unused]] EVT SrcVT = Op.getValueType();
25154 assert(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
25155 "Must be bool vector.");
25156
25157 // Special handling for Clang's __builtin_convertvector. For vectors with <8
25158 // elements, it adds a vector concatenation with undef(s). If we encounter
25159 // this here, we can skip the concat.
25160 if (Op.getOpcode() == ISD::CONCAT_VECTORS && !Op.getOperand(0).isUndef()) {
25161 bool AllUndef = true;
25162 for (unsigned I = 1; I < Op.getNumOperands(); ++I)
25163 AllUndef &= Op.getOperand(I).isUndef();
25164
25165 if (AllUndef)
25166 Op = Op.getOperand(0);
25167 }
25168
25169 SDValue VectorBits = vectorToScalarBitmask(Op.getNode(), DAG);
25170 if (VectorBits)
25171 Results.push_back(DAG.getZExtOrTrunc(VectorBits, DL, VT));
25172}
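// NOTE: illustrative sketch only, not upstream code. This path handles Clang
// patterns such as
//   %b = bitcast <8 x i1> %m to i8
// by materializing a scalar bitmask from the predicate vector
// (vectorToScalarBitmask) and then zero-extending or truncating it to the
// requested integer width.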
25173
25174 static void CustomNonLegalBITCASTResults(SDNode *N,
25175                                          SmallVectorImpl<SDValue> &Results,
25176                                          SelectionDAG &DAG, EVT ExtendVT,
25177 EVT CastVT) {
25178 SDLoc DL(N);
25179 SDValue Op = N->getOperand(0);
25180 EVT VT = N->getValueType(0);
25181
25182 // Use SCALAR_TO_VECTOR for lane zero
25183 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtendVT, Op);
25184 SDValue CastVal = DAG.getNode(ISD::BITCAST, DL, CastVT, Vec);
25185 SDValue IdxZero = DAG.getVectorIdxConstant(0, DL);
25186 Results.push_back(
25187 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, CastVal, IdxZero));
25188}
25189
25190void AArch64TargetLowering::ReplaceBITCASTResults(
25191     SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
25192   SDLoc DL(N);
25193 SDValue Op = N->getOperand(0);
25194 EVT VT = N->getValueType(0);
25195 EVT SrcVT = Op.getValueType();
25196
25197 if (VT == MVT::v2i16 && SrcVT == MVT::i32) {
25198 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v4i16);
25199 return;
25200 }
25201
25202 if (VT == MVT::v4i8 && SrcVT == MVT::i32) {
25203 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v8i8);
25204 return;
25205 }
25206
25207 if (VT == MVT::v2i8 && SrcVT == MVT::i16) {
25208 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v4i16, MVT::v8i8);
25209 return;
25210 }
25211
25212 if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
25213 assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
25214 "Expected fp->int bitcast!");
25215
25216 // Bitcasting between unpacked vector types of different element counts is
25217 // not a NOP because the live elements are laid out differently.
25218 // 01234567
25219 // e.g. nxv2i32 = XX??XX??
25220 // nxv4f16 = X?X?X?X?
25221 if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
25222 return;
25223
25224 SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
25225 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
25226 return;
25227 }
25228
25229 if (SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
25230 !VT.isVector())
25231 return replaceBoolVectorBitcast(N, Results, DAG);
25232
25233 if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
25234 return;
25235
25236 Op = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
25237 DAG.getUNDEF(MVT::i32), Op);
25238 Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
25239 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
25240}
25241
25242 static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl<SDValue> &Results,
25243                                SelectionDAG &DAG,
25244                                const AArch64Subtarget *Subtarget) {
25245   EVT VT = N->getValueType(0);
25246   if (!VT.is256BitVector() ||
25247       (VT.getScalarType().isFloatingPoint() &&
25248        !N->getFlags().hasAllowReassociation()) ||
25249 (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
25250 VT.getScalarType() == MVT::bf16)
25251 return;
25252
25253 SDValue X = N->getOperand(0);
25254 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(1));
25255 if (!Shuf) {
25256 Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
25257 X = N->getOperand(1);
25258 if (!Shuf)
25259 return;
25260 }
25261
25262 if (Shuf->getOperand(0) != X || !Shuf->getOperand(1)->isUndef())
25263 return;
25264
25265 // Check the mask is 1,0,3,2,5,4,...
25266 ArrayRef<int> Mask = Shuf->getMask();
25267 for (int I = 0, E = Mask.size(); I < E; I++)
25268 if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
25269 return;
25270
25271 SDLoc DL(N);
25272 auto LoHi = DAG.SplitVector(X, DL);
25273 assert(LoHi.first.getValueType() == LoHi.second.getValueType());
25274 SDValue Addp = DAG.getNode(AArch64ISD::ADDP, N, LoHi.first.getValueType(),
25275 LoHi.first, LoHi.second);
25276
25277 // Shuffle the elements back into order.
25278 SmallVector<int> NMask;
25279 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
25280 NMask.push_back(I);
25281 NMask.push_back(I);
25282 }
25283 Results.push_back(
25284 DAG.getVectorShuffle(VT, DL,
25285 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Addp,
25286 DAG.getUNDEF(LoHi.first.getValueType())),
25287 DAG.getUNDEF(VT), NMask));
25288}
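// NOTE: illustrative only, not part of the original file. For a reassociable
// v8f32 fadd whose second operand is the <1,0,3,2,5,4,7,6> shuffle of the
// first, the code above splits the 256-bit value and emits roughly
//   faddp v0.4s, vlo.4s, vhi.4s
// since every output lane is the sum of an adjacent pair; the final shuffle
// then duplicates each pair sum back into both of its original lanes.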
25289
25292 SelectionDAG &DAG, unsigned InterOp,
25293 unsigned AcrossOp) {
25294 EVT LoVT, HiVT;
25295 SDValue Lo, Hi;
25296 SDLoc dl(N);
25297 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
25298 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
25299 SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi);
25300 SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal);
25301 Results.push_back(SplitVal);
25302}
25303
25304void AArch64TargetLowering::ReplaceExtractSubVectorResults(
25305     SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
25306   SDValue In = N->getOperand(0);
25307 EVT InVT = In.getValueType();
25308
25309 // Common code will handle these just fine.
25310 if (!InVT.isScalableVector() || !InVT.isInteger())
25311 return;
25312
25313 SDLoc DL(N);
25314 EVT VT = N->getValueType(0);
25315
25316 // The following checks bail if this is not a halving operation.
25317
25318   ElementCount ResEC = VT.getVectorElementCount();
25319
25320 if (InVT.getVectorElementCount() != (ResEC * 2))
25321 return;
25322
25323 auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
25324 if (!CIndex)
25325 return;
25326
25327 unsigned Index = CIndex->getZExtValue();
25328 if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
25329 return;
25330
25331 unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
25332 EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());
25333
25334 SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
25335 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
25336}
25337
25338// Create an even/odd pair of X registers holding integer value V.
25339 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
25340   SDLoc dl(V.getNode());
25341 auto [VLo, VHi] = DAG.SplitScalar(V, dl, MVT::i64, MVT::i64);
25342 if (DAG.getDataLayout().isBigEndian())
25343 std::swap (VLo, VHi);
25344 SDValue RegClass =
25345 DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
25346 SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
25347 SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);
25348 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
25349 return SDValue(
25350 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
25351}
25352
25353 static void ReplaceCMP_SWAP_128Results(SDNode *N,
25354                                        SmallVectorImpl<SDValue> &Results,
25355                                        SelectionDAG &DAG,
25356 const AArch64Subtarget *Subtarget) {
25357 assert(N->getValueType(0) == MVT::i128 &&
25358 "AtomicCmpSwap on types less than 128 should be legal");
25359
25360 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
25361 if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
25362 // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
25363 // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
25364 SDValue Ops[] = {
25365 createGPRPairNode(DAG, N->getOperand(2)), // Compare value
25366 createGPRPairNode(DAG, N->getOperand(3)), // Store value
25367 N->getOperand(1), // Ptr
25368 N->getOperand(0), // Chain in
25369 };
25370
25371 unsigned Opcode;
25372 switch (MemOp->getMergedOrdering()) {
25374 Opcode = AArch64::CASPX;
25375 break;
25377 Opcode = AArch64::CASPAX;
25378 break;
25380 Opcode = AArch64::CASPLX;
25381 break;
25384 Opcode = AArch64::CASPALX;
25385 break;
25386 default:
25387 llvm_unreachable("Unexpected ordering!");
25388 }
25389
25390 MachineSDNode *CmpSwap = DAG.getMachineNode(
25391 Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
25392 DAG.setNodeMemRefs(CmpSwap, {MemOp});
25393
25394 unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
25395 if (DAG.getDataLayout().isBigEndian())
25396 std::swap(SubReg1, SubReg2);
25397 SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
25398 SDValue(CmpSwap, 0));
25399 SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
25400 SDValue(CmpSwap, 0));
25401 Results.push_back(
25402 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
25403 Results.push_back(SDValue(CmpSwap, 1)); // Chain out
25404 return;
25405 }
25406
25407 unsigned Opcode;
25408 switch (MemOp->getMergedOrdering()) {
25410 Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
25411 break;
25413 Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
25414 break;
25416 Opcode = AArch64::CMP_SWAP_128_RELEASE;
25417 break;
25420 Opcode = AArch64::CMP_SWAP_128;
25421 break;
25422 default:
25423 llvm_unreachable("Unexpected ordering!");
25424 }
25425
25426 SDLoc DL(N);
25427 auto Desired = DAG.SplitScalar(N->getOperand(2), DL, MVT::i64, MVT::i64);
25428 auto New = DAG.SplitScalar(N->getOperand(3), DL, MVT::i64, MVT::i64);
25429 SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
25430 New.first, New.second, N->getOperand(0)};
25431 SDNode *CmpSwap = DAG.getMachineNode(
25432 Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other),
25433 Ops);
25434 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
25435
25436 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
25437 SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
25438 Results.push_back(SDValue(CmpSwap, 3));
25439}
25440
25441static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode,
25442 AtomicOrdering Ordering) {
25443 // ATOMIC_LOAD_CLR only appears when lowering ATOMIC_LOAD_AND (see
25444 // LowerATOMIC_LOAD_AND). We can't take that approach with 128-bit, because
25445 // the type is not legal. Therefore we shouldn't expect to see a 128-bit
25446 // ATOMIC_LOAD_CLR at any point.
25447 assert(ISDOpcode != ISD::ATOMIC_LOAD_CLR &&
25448 "ATOMIC_LOAD_AND should be lowered to LDCLRP directly");
25449 assert(ISDOpcode != ISD::ATOMIC_LOAD_ADD && "There is no 128 bit LDADD");
25450 assert(ISDOpcode != ISD::ATOMIC_LOAD_SUB && "There is no 128 bit LDSUB");
25451
25452 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
25453 // The operand will need to be XORed in a separate step.
25454 switch (Ordering) {
25456 return AArch64::LDCLRP;
25457 break;
25459 return AArch64::LDCLRPA;
25460 break;
25462 return AArch64::LDCLRPL;
25463 break;
25466 return AArch64::LDCLRPAL;
25467 break;
25468 default:
25469 llvm_unreachable("Unexpected ordering!");
25470 }
25471 }
25472
25473 if (ISDOpcode == ISD::ATOMIC_LOAD_OR) {
25474 switch (Ordering) {
25476 return AArch64::LDSETP;
25477 break;
25479 return AArch64::LDSETPA;
25480 break;
25482 return AArch64::LDSETPL;
25483 break;
25486 return AArch64::LDSETPAL;
25487 break;
25488 default:
25489 llvm_unreachable("Unexpected ordering!");
25490 }
25491 }
25492
25493 if (ISDOpcode == ISD::ATOMIC_SWAP) {
25494 switch (Ordering) {
25496 return AArch64::SWPP;
25497 break;
25499 return AArch64::SWPPA;
25500 break;
25502 return AArch64::SWPPL;
25503 break;
25506 return AArch64::SWPPAL;
25507 break;
25508 default:
25509 llvm_unreachable("Unexpected ordering!");
25510 }
25511 }
25512
25513 llvm_unreachable("Unexpected ISDOpcode!");
25514}
25515
25516 static void ReplaceATOMIC_LOAD_128Results(SDNode *N,
25517                                           SmallVectorImpl<SDValue> &Results,
25518                                           SelectionDAG &DAG,
25519 const AArch64Subtarget *Subtarget) {
25520   // LSE128 has 128-bit RMW ops, but i128 is not a legal type, so lower it
25521 // here. This follows the approach of the CMP_SWAP_XXX pseudo instructions
25522 // rather than the CASP instructions, because CASP has register classes for
25523 // the pairs of registers and therefore uses REG_SEQUENCE and EXTRACT_SUBREG
25524 // to present them as single operands. LSE128 instructions use the GPR64
25525 // register class (because the pair does not have to be sequential), like
25526 // CMP_SWAP_XXX, and therefore we use TRUNCATE and BUILD_PAIR.
25527
25528 assert(N->getValueType(0) == MVT::i128 &&
25529 "AtomicLoadXXX on types less than 128 should be legal");
25530
25531 if (!Subtarget->hasLSE128())
25532 return;
25533
25534 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
25535 const SDValue &Chain = N->getOperand(0);
25536 const SDValue &Ptr = N->getOperand(1);
25537 const SDValue &Val128 = N->getOperand(2);
25538 std::pair<SDValue, SDValue> Val2x64 =
25539 DAG.SplitScalar(Val128, SDLoc(Val128), MVT::i64, MVT::i64);
25540
25541 const unsigned ISDOpcode = N->getOpcode();
25542 const unsigned MachineOpcode =
25543 getAtomicLoad128Opcode(ISDOpcode, MemOp->getMergedOrdering());
25544
25545 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
25546 SDLoc dl(Val128);
25547 Val2x64.first =
25548 DAG.getNode(ISD::XOR, dl, MVT::i64,
25549 DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.first);
25550 Val2x64.second =
25551 DAG.getNode(ISD::XOR, dl, MVT::i64,
25552 DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.second);
25553 }
25554
25555 SDValue Ops[] = {Val2x64.first, Val2x64.second, Ptr, Chain};
25556 if (DAG.getDataLayout().isBigEndian())
25557 std::swap(Ops[0], Ops[1]);
25558
25559 MachineSDNode *AtomicInst =
25560 DAG.getMachineNode(MachineOpcode, SDLoc(N),
25561 DAG.getVTList(MVT::i64, MVT::i64, MVT::Other), Ops);
25562
25563 DAG.setNodeMemRefs(AtomicInst, {MemOp});
25564
25565 SDValue Lo = SDValue(AtomicInst, 0), Hi = SDValue(AtomicInst, 1);
25566 if (DAG.getDataLayout().isBigEndian())
25567 std::swap(Lo, Hi);
25568
25569 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
25570 Results.push_back(SDValue(AtomicInst, 2)); // Chain out
25571}
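// NOTE: illustrative sketch only, not upstream code. LSE128 only provides a
// bit-clear form for AND, so the lowering above complements the operand first:
//   x & v  ==  x & ~(~v)   =>  LDCLRP(addr, ~v_lo, ~v_hi)
// while OR maps directly to LDSETP and exchange to SWPP, in the acquire/release
// variants chosen by getAtomicLoad128Opcode from the memory ordering.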
25572
25573void AArch64TargetLowering::ReplaceNodeResults(
25574     SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
25575   switch (N->getOpcode()) {
25576 default:
25577 llvm_unreachable("Don't know how to custom expand this");
25578 case ISD::BITCAST:
25579 ReplaceBITCASTResults(N, Results, DAG);
25580 return;
25581   case ISD::VECREDUCE_ADD:
25582   case ISD::VECREDUCE_SMAX:
25583   case ISD::VECREDUCE_SMIN:
25584   case ISD::VECREDUCE_UMAX:
25585   case ISD::VECREDUCE_UMIN:
25586     Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
25587 return;
25588 case ISD::ADD:
25589 case ISD::FADD:
25590 ReplaceAddWithADDP(N, Results, DAG, Subtarget);
25591 return;
25592
25593 case ISD::CTPOP:
25594 case ISD::PARITY:
25595 if (SDValue Result = LowerCTPOP_PARITY(SDValue(N, 0), DAG))
25596 Results.push_back(Result);
25597 return;
25598   case AArch64ISD::SADDV:
25599     ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
25600     return;
25601   case AArch64ISD::UADDV:
25602     ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
25603     return;
25604   case AArch64ISD::SMINV:
25605     ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
25606     return;
25607   case AArch64ISD::UMINV:
25608     ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
25609     return;
25610   case AArch64ISD::SMAXV:
25611     ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
25612     return;
25613   case AArch64ISD::UMAXV:
25614     ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
25615     return;
25616 case ISD::MULHS:
25618 Results.push_back(
25619 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHS_PRED));
25620 return;
25621 case ISD::MULHU:
25623 Results.push_back(
25624 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHU_PRED));
25625 return;
25626 case ISD::FP_TO_UINT:
25627 case ISD::FP_TO_SINT:
25630 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
25631 // Let normal code take care of it by not adding anything to Results.
25632 return;
25633   case ISD::ATOMIC_CMP_SWAP:
25634     ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
25635 return;
25637 assert(N->getValueType(0) != MVT::i128 &&
25638 "128-bit ATOMIC_LOAD_AND should be lowered directly to LDCLRP");
25639 break;
25642 case ISD::ATOMIC_SWAP: {
25643 assert(cast<AtomicSDNode>(N)->getVal().getValueType() == MVT::i128 &&
25644 "Expected 128-bit atomicrmw.");
25645 // These need custom type legalisation so we go directly to instruction.
25646 ReplaceATOMIC_LOAD_128Results(N, Results, DAG, Subtarget);
25647 return;
25648 }
25649 case ISD::ATOMIC_LOAD:
25650 case ISD::LOAD: {
25651 MemSDNode *LoadNode = cast<MemSDNode>(N);
25652 EVT MemVT = LoadNode->getMemoryVT();
25653 // Handle lowering 256 bit non temporal loads into LDNP for little-endian
25654 // targets.
25655 if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() &&
25656 MemVT.getSizeInBits() == 256u &&
25657 (MemVT.getScalarSizeInBits() == 8u ||
25658 MemVT.getScalarSizeInBits() == 16u ||
25659 MemVT.getScalarSizeInBits() == 32u ||
25660 MemVT.getScalarSizeInBits() == 64u)) {
25661
25662       SDValue Result = DAG.getMemIntrinsicNode(
25663           AArch64ISD::LDNP, SDLoc(N),
25664           DAG.getVTList({MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
25665 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
25666 MVT::Other}),
25667 {LoadNode->getChain(), LoadNode->getBasePtr()},
25668 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
25669
25670 SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT,
25671 Result.getValue(0), Result.getValue(1));
25672 Results.append({Pair, Result.getValue(2) /* Chain */});
25673 return;
25674 }
25675
25676 if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
25677 LoadNode->getMemoryVT() != MVT::i128) {
25678       // Non-volatile, non-atomic loads are optimized later in AArch64's
25679       // load/store optimizer.
25680 return;
25681 }
25682
25683 if (SDValue(N, 0).getValueType() == MVT::i128) {
25684 auto *AN = dyn_cast<AtomicSDNode>(LoadNode);
25685 bool isLoadAcquire =
25687 unsigned Opcode = isLoadAcquire ? AArch64ISD::LDIAPP : AArch64ISD::LDP;
25688
25689 if (isLoadAcquire)
25690 assert(Subtarget->hasFeature(AArch64::FeatureRCPC3));
25691
25692     SDValue Result = DAG.getMemIntrinsicNode(
25693         Opcode, SDLoc(N), DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
25694 {LoadNode->getChain(), LoadNode->getBasePtr()},
25695 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
25696
25697 unsigned FirstRes = DAG.getDataLayout().isBigEndian() ? 1 : 0;
25698
25699 SDValue Pair =
25700 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
25701 Result.getValue(FirstRes), Result.getValue(1 - FirstRes));
25702 Results.append({Pair, Result.getValue(2) /* Chain */});
25703 }
25704 return;
25705 }
25706   case ISD::EXTRACT_SUBVECTOR:
25707     ReplaceExtractSubVectorResults(N, Results, DAG);
25708 return;
25709   case ISD::INSERT_SUBVECTOR:
25710   case ISD::CONCAT_VECTORS:
25711     // Custom lowering has been requested for INSERT_SUBVECTOR and
25712 // CONCAT_VECTORS -- but delegate to common code for result type
25713 // legalisation
25714 return;
25715   case ISD::INTRINSIC_WO_CHAIN: {
25716     EVT VT = N->getValueType(0);
25717
25718 Intrinsic::ID IntID =
25719 static_cast<Intrinsic::ID>(N->getConstantOperandVal(0));
25720 switch (IntID) {
25721 default:
25722 return;
25723 case Intrinsic::aarch64_sve_clasta_n: {
25724 assert((VT == MVT::i8 || VT == MVT::i16) &&
25725 "custom lowering for unexpected type");
25726 SDLoc DL(N);
25727 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
25728 auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
25729 N->getOperand(1), Op2, N->getOperand(3));
25730 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25731 return;
25732 }
25733 case Intrinsic::aarch64_sve_clastb_n: {
25734 assert((VT == MVT::i8 || VT == MVT::i16) &&
25735 "custom lowering for unexpected type");
25736 SDLoc DL(N);
25737 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
25738 auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
25739 N->getOperand(1), Op2, N->getOperand(3));
25740 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25741 return;
25742 }
25743 case Intrinsic::aarch64_sve_lasta: {
25744 assert((VT == MVT::i8 || VT == MVT::i16) &&
25745 "custom lowering for unexpected type");
25746 SDLoc DL(N);
25747 auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
25748 N->getOperand(1), N->getOperand(2));
25749 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25750 return;
25751 }
25752 case Intrinsic::aarch64_sve_lastb: {
25753 assert((VT == MVT::i8 || VT == MVT::i16) &&
25754 "custom lowering for unexpected type");
25755 SDLoc DL(N);
25756 auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
25757 N->getOperand(1), N->getOperand(2));
25758 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25759 return;
25760 }
25761 case Intrinsic::get_active_lane_mask: {
25762 if (!VT.isFixedLengthVector() || VT.getVectorElementType() != MVT::i1)
25763 return;
25764
25765 // NOTE: Only trivial type promotion is supported.
25766 EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
25767 if (NewVT.getVectorNumElements() != VT.getVectorNumElements())
25768 return;
25769
25770 SDLoc DL(N);
25771 auto V = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, NewVT, N->ops());
25772 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
25773 return;
25774 }
25775 }
25776 }
25777 case ISD::READ_REGISTER: {
25778 SDLoc DL(N);
25779 assert(N->getValueType(0) == MVT::i128 &&
25780 "READ_REGISTER custom lowering is only for 128-bit sysregs");
25781 SDValue Chain = N->getOperand(0);
25782 SDValue SysRegName = N->getOperand(1);
25783
25784 SDValue Result = DAG.getNode(
25785 AArch64ISD::MRRS, DL, DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
25786 Chain, SysRegName);
25787
25788 // Sysregs are not endian. Result.getValue(0) always contains the lower half
25789 // of the 128-bit System Register value.
25790 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
25791 Result.getValue(0), Result.getValue(1));
25792 Results.push_back(Pair);
25793 Results.push_back(Result.getValue(2)); // Chain
25794 return;
25795 }
25796 }
25797}
25798
25799 bool AArch64TargetLowering::useLoadStackGuardNode() const {
25800   if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
25801     return TargetLowering::useLoadStackGuardNode();
25802   return true;
25803}
25804
25805unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
25806 // Combine multiple FDIVs with the same divisor into multiple FMULs by the
25807 // reciprocal if there are three or more FDIVs.
25808 return 3;
25809}
25810
25811 TargetLoweringBase::LegalizeTypeAction
25812 AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
25813   // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
25814 // v4i16, v2i32 instead of to promote.
25815 if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
25816 VT == MVT::v1f32)
25817 return TypeWidenVector;
25818
25819   return TargetLoweringBase::getPreferredVectorAction(VT);
25820}
25821
25822// In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
25823// provided the address is 16-byte aligned.
25824 bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const {
25825   if (!Subtarget->hasLSE2())
25826 return false;
25827
25828 if (auto LI = dyn_cast<LoadInst>(I))
25829 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
25830 LI->getAlign() >= Align(16);
25831
25832 if (auto SI = dyn_cast<StoreInst>(I))
25833 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25834 SI->getAlign() >= Align(16);
25835
25836 return false;
25837}
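// NOTE: illustrative sketch only, not upstream code. Under these checks an
// aligned 128-bit access such as
//   %v = load atomic i128, ptr %p monotonic, align 16
// can be emitted as a plain LDP/STP pair on an LSE2 target, since v8.4-A
// guarantees the pair is single-copy atomic at 16-byte alignment.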
25838
25839 bool AArch64TargetLowering::isOpSuitableForLSE128(const Instruction *I) const {
25840   if (!Subtarget->hasLSE128())
25841 return false;
25842
25843 // Only use SWPP for stores where LSE2 would require a fence. Unlike STP, SWPP
25844 // will clobber the two registers.
25845 if (const auto *SI = dyn_cast<StoreInst>(I))
25846 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25847 SI->getAlign() >= Align(16) &&
25848 (SI->getOrdering() == AtomicOrdering::Release ||
25849 SI->getOrdering() == AtomicOrdering::SequentiallyConsistent);
25850
25851 if (const auto *RMW = dyn_cast<AtomicRMWInst>(I))
25852 return RMW->getValOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25853 RMW->getAlign() >= Align(16) &&
25854 (RMW->getOperation() == AtomicRMWInst::Xchg ||
25855 RMW->getOperation() == AtomicRMWInst::And ||
25856 RMW->getOperation() == AtomicRMWInst::Or);
25857
25858 return false;
25859}
25860
25861 bool AArch64TargetLowering::isOpSuitableForRCPC3(const Instruction *I) const {
25862   if (!Subtarget->hasLSE2() || !Subtarget->hasRCPC3())
25863 return false;
25864
25865 if (auto LI = dyn_cast<LoadInst>(I))
25866 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
25867 LI->getAlign() >= Align(16) &&
25868 LI->getOrdering() == AtomicOrdering::Acquire;
25869
25870 if (auto SI = dyn_cast<StoreInst>(I))
25871 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
25872 SI->getAlign() >= Align(16) &&
25873 SI->getOrdering() == AtomicOrdering::Release;
25874
25875 return false;
25876}
25877
25878 bool AArch64TargetLowering::shouldInsertFencesForAtomic(
25879     const Instruction *I) const {
25880   if (isOpSuitableForRCPC3(I))
25881     return false;
25882   if (isOpSuitableForLSE128(I))
25883     return false;
25884   if (isOpSuitableForLDPSTP(I))
25885     return true;
25886 return false;
25887}
25888
25890 const Instruction *I) const {
25891 // Store-Release instructions only provide seq_cst guarantees when paired with
25892 // Load-Acquire instructions. MSVC CRT does not use these instructions to
25893 // implement seq_cst loads and stores, so we need additional explicit fences
25894 // after memory writes.
25895 if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
25896 return false;
25897
25898 switch (I->getOpcode()) {
25899 default:
25900 return false;
25901 case Instruction::AtomicCmpXchg:
25902 return cast<AtomicCmpXchgInst>(I)->getSuccessOrdering() ==
25904 case Instruction::AtomicRMW:
25905 return cast<AtomicRMWInst>(I)->getOrdering() ==
25907 case Instruction::Store:
25908 return cast<StoreInst>(I)->getOrdering() ==
25910 }
25911}
25912
25913// Loads and stores less than 128 bits are already atomic; ones above that
25914// are doomed anyway, so defer to the default libcall and blame the OS when
25915// things go wrong.
25918 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
25919 if (Size != 128)
25921 if (isOpSuitableForRCPC3(SI))
25923 if (isOpSuitableForLSE128(SI))
25925 if (isOpSuitableForLDPSTP(SI))
25928}
25929
25930// Loads and stores less than 128 bits are already atomic; ones above that
25931// are doomed anyway, so defer to the default libcall and blame the OS when
25932// things go wrong.
25935 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
25936
25937 if (Size != 128)
25939 if (isOpSuitableForRCPC3(LI))
25941 // No LSE128 loads
25942 if (isOpSuitableForLDPSTP(LI))
25944
25945 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
25946 // implement atomicrmw without spilling. If the target address is also on the
25947 // stack and close enough to the spill slot, this can lead to a situation
25948 // where the monitor always gets cleared and the atomic operation can never
25949 // succeed. So at -O0 lower this operation to a CAS loop.
25950 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
25952
25953 // Using CAS for an atomic load has a better chance of succeeding under high
25954 // contention situations. So use it if available.
25955 return Subtarget->hasLSE() ? AtomicExpansionKind::CmpXChg
25957}
25958
25959// The "default" for integer RMW operations is to expand to an LL/SC loop.
25960// However, with the LSE instructions (or outline-atomics mode, which provides
25961// library routines in place of the LSE-instructions), we can directly emit many
25962// operations instead.
25963//
25964// Floating-point operations are always emitted to a cmpxchg loop, because they
25965// may trigger a trap which aborts an LLSC sequence.
25968 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
25969 assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes.");
25970
25971 if (AI->isFloatingPointOperation())
25973
25974 bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 &&
25978 if (CanUseLSE128)
25980
25981 // Nand is not supported in LSE.
25982 // Leave 128 bits to LLSC or CmpXChg.
25983 if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) {
25984 if (Subtarget->hasLSE())
25986 if (Subtarget->outlineAtomics()) {
25987 // [U]Min/[U]Max RWM atomics are used in __sync_fetch_ libcalls so far.
25988 // Don't outline them unless
25989 // (1) high level <atomic> support approved:
25990 // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
25991 // (2) low level libgcc and compiler-rt support implemented by:
25992 // min/max outline atomics helpers
25993 if (AI->getOperation() != AtomicRMWInst::Min &&
25998 }
25999 }
26000 }
26001
26002 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
26003 // implement atomicrmw without spilling. If the target address is also on the
26004 // stack and close enough to the spill slot, this can lead to a situation
26005 // where the monitor always gets cleared and the atomic operation can never
26006 // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if
26007 // we have a single CAS instruction that can replace the loop.
26009 Subtarget->hasLSE())
26011
26013}
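// NOTE: illustrative only, not part of the original file. With LSE available,
// an IR operation such as
//   %old = atomicrmw add ptr %p, i64 %v seq_cst
// is selected directly to a single instruction (here LDADDAL), whereas without
// LSE or outline-atomics it becomes an LDAXR/STLXR retry loop built via
// emitLoadLinked/emitStoreConditional below.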
26014
26015 TargetLowering::AtomicExpansionKind
26016 AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
26017     AtomicCmpXchgInst *AI) const {
26018 // If subtarget has LSE, leave cmpxchg intact for codegen.
26019 if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
26021 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
26022 // implement cmpxchg without spilling. If the address being exchanged is also
26023 // on the stack and close enough to the spill slot, this can lead to a
26024 // situation where the monitor always gets cleared and the atomic operation
26025 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
26026 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
26028
26029 // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
26030 // it.
26032 if (Size > 64)
26034
26036}
26037
26038 Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
26039                                              Type *ValueTy, Value *Addr,
26040 AtomicOrdering Ord) const {
26041 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
26042 bool IsAcquire = isAcquireOrStronger(Ord);
26043
26044 // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd
26045 // intrinsic must return {i64, i64} and we have to recombine them into a
26046 // single i128 here.
26047 if (ValueTy->getPrimitiveSizeInBits() == 128) {
26048     Intrinsic::ID Int =
26049         IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
26050     Function *Ldxr = Intrinsic::getDeclaration(M, Int);
26051
26052 Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");
26053
26054 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
26055 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
26056 Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
26057 Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
26058 return Builder.CreateOr(
26059 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 64)), "val64");
26060 }
26061
26062 Type *Tys[] = { Addr->getType() };
26063   Intrinsic::ID Int =
26064       IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
26065 Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);
26066
26067 const DataLayout &DL = M->getDataLayout();
26068 IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
26069 CallInst *CI = Builder.CreateCall(Ldxr, Addr);
26070 CI->addParamAttr(
26071 0, Attribute::get(Builder.getContext(), Attribute::ElementType, ValueTy));
26072 Value *Trunc = Builder.CreateTrunc(CI, IntEltTy);
26073
26074 return Builder.CreateBitCast(Trunc, ValueTy);
26075}
26076
26077 void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
26078     IRBuilderBase &Builder) const {
26079 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
26080 Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
26081}
26082
26083 Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder,
26084                                                    Value *Val, Value *Addr,
26085 AtomicOrdering Ord) const {
26086 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
26087 bool IsRelease = isReleaseOrStronger(Ord);
26088
26089 // Since the intrinsics must have legal type, the i128 intrinsics take two
26090 // parameters: "i64, i64". We must marshal Val into the appropriate form
26091 // before the call.
26092 if (Val->getType()->getPrimitiveSizeInBits() == 128) {
26093     Intrinsic::ID Int =
26094         IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
26095     Function *Stxr = Intrinsic::getDeclaration(M, Int);
26096     Type *Int64Ty = Type::getInt64Ty(M->getContext());
26097
26098 Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
26099 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
26100 return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
26101 }
26102
26103   Intrinsic::ID Int =
26104       IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
26105 Type *Tys[] = { Addr->getType() };
26106 Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
26107
26108 const DataLayout &DL = M->getDataLayout();
26109 IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
26110 Val = Builder.CreateBitCast(Val, IntValTy);
26111
26112 CallInst *CI = Builder.CreateCall(
26113 Stxr, {Builder.CreateZExtOrBitCast(
26114 Val, Stxr->getFunctionType()->getParamType(0)),
26115 Addr});
26116 CI->addParamAttr(1, Attribute::get(Builder.getContext(),
26117 Attribute::ElementType, Val->getType()));
26118 return CI;
26119}
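// NOTE: illustrative sketch only, not upstream code. For a 64-bit cmpxchg
// expanded through the two helpers above, the generated retry loop has the
// familiar shape (clrex on the early-exit path omitted for brevity):
//   retry:
//     ldaxr  x8, [x0]        // emitLoadLinked
//     cmp    x8, x1
//     b.ne   done
//     stlxr  w9, x2, [x0]    // emitStoreConditional
//     cbnz   w9, retry
//   done: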
26120
26121 bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
26122     Type *Ty, CallingConv::ID CallConv, bool isVarArg,
26123 const DataLayout &DL) const {
26124 if (!Ty->isArrayTy()) {
26125 const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
26126 return TySize.isScalable() && TySize.getKnownMinValue() > 128;
26127 }
26128
26129 // All non aggregate members of the type must have the same type
26130 SmallVector<EVT> ValueVTs;
26131 ComputeValueVTs(*this, DL, Ty, ValueVTs);
26132 return all_equal(ValueVTs);
26133}
26134
26135bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
26136 EVT) const {
26137 return false;
26138}
26139
26140static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
26141 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
26142 Function *ThreadPointerFunc =
26143 Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
26144 return IRB.CreatePointerCast(
26145 IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
26146 Offset),
26147 IRB.getPtrTy(0));
26148}
26149
26150 Value *AArch64TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
26151   // Android provides a fixed TLS slot for the stack cookie. See the definition
26152 // of TLS_SLOT_STACK_GUARD in
26153 // https://android.googlesource.com/platform/bionic/+/main/libc/platform/bionic/tls_defines.h
26154 if (Subtarget->isTargetAndroid())
26155 return UseTlsOffset(IRB, 0x28);
26156
26157 // Fuchsia is similar.
26158 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
26159 if (Subtarget->isTargetFuchsia())
26160 return UseTlsOffset(IRB, -0x10);
26161
26162   return TargetLowering::getIRStackGuard(IRB);
26163}
26164
26165 void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
26166   // MSVC CRT provides functionalities for stack protection.
26167 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
26168 // MSVC CRT has a global variable holding security cookie.
26169 M.getOrInsertGlobal("__security_cookie",
26170 PointerType::getUnqual(M.getContext()));
26171
26172 // MSVC CRT has a function to validate security cookie.
26173 FunctionCallee SecurityCheckCookie =
26174 M.getOrInsertFunction(Subtarget->getSecurityCheckCookieName(),
26175 Type::getVoidTy(M.getContext()),
26176 PointerType::getUnqual(M.getContext()));
26177 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
26178 F->setCallingConv(CallingConv::Win64);
26179 F->addParamAttr(0, Attribute::AttrKind::InReg);
26180 }
26181 return;
26182 }
26183  TargetLowering::insertSSPDeclarations(M);
26184}
26185
26186Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const {
26187  // MSVC CRT has a global variable holding security cookie.
26188 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
26189 return M.getGlobalVariable("__security_cookie");
26190  return TargetLowering::getSDagStackGuard(M);
26191}
26192
26193Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
26194  // MSVC CRT has a function to validate security cookie.
26195 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
26196 return M.getFunction(Subtarget->getSecurityCheckCookieName());
26197  return TargetLowering::getSSPStackGuardCheck(M);
26198}
26199
26200Value *
26201AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
26202  // Android provides a fixed TLS slot for the SafeStack pointer. See the
26203 // definition of TLS_SLOT_SAFESTACK in
26204 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
26205 if (Subtarget->isTargetAndroid())
26206 return UseTlsOffset(IRB, 0x48);
26207
26208 // Fuchsia is similar.
26209 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
26210 if (Subtarget->isTargetFuchsia())
26211 return UseTlsOffset(IRB, -0x8);
26212
26213  return TargetLowering::getSafeStackPointerLocation(IRB);
26214}
26215
26216bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
26217    const Instruction &AndI) const {
26218 // Only sink 'and' mask to cmp use block if it is masking a single bit, since
26219  // this is likely to fold the and/cmp/br into a single tbz instruction. It
26220 // may be beneficial to sink in other cases, but we would have to check that
26221 // the cmp would not get folded into the br to form a cbz for these to be
26222 // beneficial.
26223 ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
26224 if (!Mask)
26225 return false;
26226 return Mask->getValue().isPowerOf2();
26227}
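// Example of the single-bit case this hook targets (illustrative):
//   %m = and i64 %x, 16            ; mask is a power of two (bit 4)
//   %c = icmp eq i64 %m, 0
//   br i1 %c, label %bb1, label %bb2
// which can be selected as a single "tbz x0, #4, .LBB_bb1", whereas a
// multi-bit mask would need a separate and/cmp/branch sequence.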
26228
26229bool AArch64TargetLowering::
26230    shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
26231        SDValue X, ConstantSDNode *XC, ISD::CondCode CC, SDValue Y,
26232        unsigned OldShiftOpcode, unsigned NewShiftOpcode,
26233 SelectionDAG &DAG) const {
26234 // Does baseline recommend not to perform the fold by default?
26235  if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
26236          X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
26237 return false;
26238 // Else, if this is a vector shift, prefer 'shl'.
26239 return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
26240}
26241
26242TargetLowering::ShiftLegalizationStrategy
26243AArch64TargetLowering::preferredShiftLegalizationStrategy(
26244    SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const {
26245  if (DAG.getMachineFunction().getFunction().hasMinSize() &&
26246      !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
26247    return ShiftLegalizationStrategy::LowerToLibcall;
26248  return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
26249                                                            ExpansionFactor);
26250}
26251
26252void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
26253  // Update IsSplitCSR in AArch64FunctionInfo.
26254 AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
26255 AFI->setIsSplitCSR(true);
26256}
26257
26258void AArch64TargetLowering::insertCopiesSplitCSR(
26259    MachineBasicBlock *Entry,
26260 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
26261 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
26262 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
26263 if (!IStart)
26264 return;
26265
26266 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
26267 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
26268 MachineBasicBlock::iterator MBBI = Entry->begin();
26269 for (const MCPhysReg *I = IStart; *I; ++I) {
26270 const TargetRegisterClass *RC = nullptr;
26271 if (AArch64::GPR64RegClass.contains(*I))
26272 RC = &AArch64::GPR64RegClass;
26273 else if (AArch64::FPR64RegClass.contains(*I))
26274 RC = &AArch64::FPR64RegClass;
26275 else
26276 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
26277
26278 Register NewVR = MRI->createVirtualRegister(RC);
26279 // Create copy from CSR to a virtual register.
26280 // FIXME: this currently does not emit CFI pseudo-instructions, it works
26281 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
26282 // nounwind. If we want to generalize this later, we may need to emit
26283 // CFI pseudo-instructions.
26284 assert(Entry->getParent()->getFunction().hasFnAttribute(
26285 Attribute::NoUnwind) &&
26286 "Function should be nounwind in insertCopiesSplitCSR!");
26287 Entry->addLiveIn(*I);
26288 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
26289 .addReg(*I);
26290
26291 // Insert the copy-back instructions right before the terminator.
26292 for (auto *Exit : Exits)
26293 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
26294 TII->get(TargetOpcode::COPY), *I)
26295 .addReg(NewVR);
26296 }
26297}
26298
26299bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
26300  // Integer division on AArch64 is expensive. However, when aggressively
26301 // optimizing for code size, we prefer to use a div instruction, as it is
26302 // usually smaller than the alternative sequence.
26303 // The exception to this is vector division. Since AArch64 doesn't have vector
26304 // integer division, leaving the division as-is is a loss even in terms of
26305 // size, because it will have to be scalarized, while the alternative code
26306 // sequence can be performed in vector form.
26307 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
26308 return OptSize && !VT.isVector();
26309}
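// For example (illustrative): at minsize a scalar "sdiv i32" stays as one
// "sdiv w0, w0, w1" instruction, while "sdiv <4 x i32>" has no AArch64 vector
// instruction and would be scalarized into four divides plus element
// inserts/extracts, so the expanded multiply/shift sequence can still be
// smaller there.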
26310
26311bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
26312  // We want inc-of-add for scalars and sub-of-not for vectors.
26313 return VT.isScalarInteger();
26314}
26315
26316bool AArch64TargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
26317                                                 EVT VT) const {
26318 // v8f16 without fp16 need to be extended to v8f32, which is more difficult to
26319 // legalize.
26320 if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16())
26321 return false;
26322 if (FPVT == MVT::v8bf16)
26323 return false;
26324 return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
26325}
26326
26327MachineInstr *
26328AArch64TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
26329                                     MachineBasicBlock::instr_iterator &MBBI,
26330                                     const TargetInstrInfo *TII) const {
26331 assert(MBBI->isCall() && MBBI->getCFIType() &&
26332 "Invalid call instruction for a KCFI check");
26333
26334 switch (MBBI->getOpcode()) {
26335 case AArch64::BLR:
26336 case AArch64::BLRNoIP:
26337 case AArch64::TCRETURNri:
26338 case AArch64::TCRETURNrix16x17:
26339 case AArch64::TCRETURNrix17:
26340 case AArch64::TCRETURNrinotx16:
26341 break;
26342 default:
26343 llvm_unreachable("Unexpected CFI call opcode");
26344 }
26345
26346 MachineOperand &Target = MBBI->getOperand(0);
26347 assert(Target.isReg() && "Invalid target operand for an indirect call");
26348 Target.setIsRenamable(false);
26349
26350 return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(AArch64::KCFI_CHECK))
26351 .addReg(Target.getReg())
26352 .addImm(MBBI->getCFIType())
26353 .getInstr();
26354}
26355
26356bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
26357  return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
26358}
26359
26360unsigned
26361AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
26362  if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
26363 return getPointerTy(DL).getSizeInBits();
26364
26365 return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
26366}
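// The 3 * pointer + 2 * i32 figure above corresponds to the AAPCS64 va_list
// struct (__stack, __gr_top and __vr_top pointers plus the __gr_offs and
// __vr_offs i32 fields), i.e. 256 bits, whereas Darwin and Windows use a
// single pointer-sized va_list.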
26367
26368void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
26369 MachineFrameInfo &MFI = MF.getFrameInfo();
26370 // If we have any vulnerable SVE stack objects then the stack protector
26371 // needs to be placed at the top of the SVE stack area, as the SVE locals
26372 // are placed above the other locals, so we allocate it as if it were a
26373 // scalable vector.
26374 // FIXME: It may be worthwhile having a specific interface for this rather
26375 // than doing it here in finalizeLowering.
26376 if (MFI.hasStackProtectorIndex()) {
26377 for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
26378      if (MFI.getStackID(i) == TargetStackID::ScalableVector &&
26379          MFI.getObjectSSPLayout(i) != MachineFrameInfo::SSPLK_None) {
26380        MFI.setStackID(MFI.getStackProtectorIndex(),
26381                       TargetStackID::ScalableVector);
26382        MFI.setObjectAlignment(MFI.getStackProtectorIndex(), Align(16));
26383        break;
26384 }
26385 }
26386 }
26387
26388  TargetLoweringBase::finalizeLowering(MF);
26389}
26390
26391// Unlike X86, we let frame lowering assign offsets to all catch objects.
26392bool AArch64TargetLowering::needsFixedCatchObjects() const {
26393  return false;
26394}
26395
26396bool AArch64TargetLowering::shouldLocalize(
26397 const MachineInstr &MI, const TargetTransformInfo *TTI) const {
26398 auto &MF = *MI.getMF();
26399 auto &MRI = MF.getRegInfo();
26400 auto maxUses = [](unsigned RematCost) {
26401 // A cost of 1 means remats are basically free.
26402 if (RematCost == 1)
26403 return std::numeric_limits<unsigned>::max();
26404 if (RematCost == 2)
26405 return 2U;
26406
26407 // Remat is too expensive, only sink if there's one user.
26408 if (RematCost > 2)
26409 return 1U;
26410 llvm_unreachable("Unexpected remat cost");
26411 };
26412
26413 unsigned Opc = MI.getOpcode();
26414 switch (Opc) {
26415 case TargetOpcode::G_GLOBAL_VALUE: {
26416 // On Darwin, TLS global vars get selected into function calls, which
26417    // we don't want localized, as they can get moved into the middle of
26418 // another call sequence.
26419 const GlobalValue &GV = *MI.getOperand(1).getGlobal();
26420 if (GV.isThreadLocal() && Subtarget->isTargetMachO())
26421 return false;
26422 return true; // Always localize G_GLOBAL_VALUE to avoid high reg pressure.
26423 }
26424 case TargetOpcode::G_FCONSTANT:
26425 case TargetOpcode::G_CONSTANT: {
26426 const ConstantInt *CI;
26427 unsigned AdditionalCost = 0;
26428
26429 if (Opc == TargetOpcode::G_CONSTANT)
26430 CI = MI.getOperand(1).getCImm();
26431 else {
26432 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
26433 // We try to estimate cost of 32/64b fpimms, as they'll likely be
26434 // materialized as integers.
26435 if (Ty.getScalarSizeInBits() != 32 && Ty.getScalarSizeInBits() != 64)
26436 break;
26437 auto APF = MI.getOperand(1).getFPImm()->getValueAPF();
26438 bool OptForSize =
26441 OptForSize))
26442 return true; // Constant should be cheap.
26443 CI =
26444 ConstantInt::get(MF.getFunction().getContext(), APF.bitcastToAPInt());
26445 // FP materialization also costs an extra move, from gpr to fpr.
26446 AdditionalCost = 1;
26447 }
26448 APInt Imm = CI->getValue();
26451 assert(Cost.isValid() && "Expected a valid imm cost");
26452
26453 unsigned RematCost = *Cost.getValue();
26454 RematCost += AdditionalCost;
26455 Register Reg = MI.getOperand(0).getReg();
26456 unsigned MaxUses = maxUses(RematCost);
26457 // Don't pass UINT_MAX sentinel value to hasAtMostUserInstrs().
26458 if (MaxUses == std::numeric_limits<unsigned>::max())
26459 --MaxUses;
26460 return MRI.hasAtMostUserInstrs(Reg, MaxUses);
26461 }
26462 // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
26463 // localizable.
26464 case AArch64::ADRP:
26465 case AArch64::G_ADD_LOW:
26466 // Need to localize G_PTR_ADD so that G_GLOBAL_VALUE can be localized too.
26467 case TargetOpcode::G_PTR_ADD:
26468 return true;
26469 default:
26470 break;
26471 }
26472  return TargetLoweringBase::shouldLocalize(MI, TTI);
26473}
26474
26475bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
26476  if (Inst.getType()->isScalableTy())
26477 return true;
26478
26479 for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
26480 if (Inst.getOperand(i)->getType()->isScalableTy())
26481 return true;
26482
26483 if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
26484 if (AI->getAllocatedType()->isScalableTy())
26485 return true;
26486 }
26487
26488 // Checks to allow the use of SME instructions
26489 if (auto *Base = dyn_cast<CallBase>(&Inst)) {
26490 auto CallerAttrs = SMEAttrs(*Inst.getFunction());
26491 auto CalleeAttrs = SMEAttrs(*Base);
26492 if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
26493 CallerAttrs.requiresLazySave(CalleeAttrs) ||
26494 CallerAttrs.requiresPreservingZT0(CalleeAttrs))
26495 return true;
26496 }
26497 return false;
26498}
26499
26500// Return the largest legal scalable vector type that matches VT's element type.
26504 "Expected legal fixed length vector!");
26505 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
26506 default:
26507 llvm_unreachable("unexpected element type for SVE container");
26508 case MVT::i8:
26509 return EVT(MVT::nxv16i8);
26510 case MVT::i16:
26511 return EVT(MVT::nxv8i16);
26512 case MVT::i32:
26513 return EVT(MVT::nxv4i32);
26514 case MVT::i64:
26515 return EVT(MVT::nxv2i64);
26516 case MVT::bf16:
26517 return EVT(MVT::nxv8bf16);
26518 case MVT::f16:
26519 return EVT(MVT::nxv8f16);
26520 case MVT::f32:
26521 return EVT(MVT::nxv4f32);
26522 case MVT::f64:
26523 return EVT(MVT::nxv2f64);
26524 }
26525}
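// For example, a legal fixed-length v8i16 maps to the nxv8i16 container and a
// v4f32 to nxv4f32; the fixed-length data occupies the low bits of the
// scalable register and the remaining lanes are simply ignored.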
26526
26527// Return a PTRUE with active lanes corresponding to the extent of VT.
26528static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
26529                                                EVT VT) {
26530  assert(VT.isFixedLengthVector() &&
26531         DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
26532         "Expected legal fixed length vector!");
26533
26534 std::optional<unsigned> PgPattern =
26535      getSVEPredPatternFromNumElements(VT.getVectorNumElements());
26536  assert(PgPattern && "Unexpected element count for SVE predicate");
26537
26538 // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
26539 // AArch64SVEPredPattern::all, which can enable the use of unpredicated
26540 // variants of instructions when available.
26541 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
26542 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
26543 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
26544 if (MaxSVESize && MinSVESize == MaxSVESize &&
26545 MaxSVESize == VT.getSizeInBits())
26546 PgPattern = AArch64SVEPredPattern::all;
26547
26548 MVT MaskVT;
26549 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
26550 default:
26551 llvm_unreachable("unexpected element type for SVE predicate");
26552 case MVT::i8:
26553 MaskVT = MVT::nxv16i1;
26554 break;
26555 case MVT::i16:
26556 case MVT::f16:
26557 case MVT::bf16:
26558 MaskVT = MVT::nxv8i1;
26559 break;
26560 case MVT::i32:
26561 case MVT::f32:
26562 MaskVT = MVT::nxv4i1;
26563 break;
26564 case MVT::i64:
26565 case MVT::f64:
26566 MaskVT = MVT::nxv2i1;
26567 break;
26568 }
26569
26570 return getPTrue(DAG, DL, MaskVT, *PgPattern);
26571}
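// For example (illustrative): a fixed-length v4i32 yields an nxv4i1 predicate
// equivalent to "ptrue p0.s, vl4", and when the SVE register size is known to
// match VT exactly this becomes "ptrue p0.s, all", allowing unpredicated
// instruction forms to be used.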
26572
26573static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
26574                                             EVT VT) {
26575  assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
26576         "Expected legal scalable vector!");
26577 auto PredTy = VT.changeVectorElementType(MVT::i1);
26578 return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
26579}
26580
26581static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) {
26582  if (VT.isFixedLengthVector())
26583 return getPredicateForFixedLengthVector(DAG, DL, VT);
26584
26585 return getPredicateForScalableVector(DAG, DL, VT);
26586}
26587
26588// Grow V to consume an entire SVE register.
26589static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
26590  assert(VT.isScalableVector() &&
26591 "Expected to convert into a scalable vector!");
26592 assert(V.getValueType().isFixedLengthVector() &&
26593 "Expected a fixed length vector operand!");
26594 SDLoc DL(V);
26595 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
26596 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
26597}
26598
26599// Shrink V so it's just big enough to maintain a VT's worth of data.
26602 "Expected to convert into a fixed length vector!");
26603 assert(V.getValueType().isScalableVector() &&
26604 "Expected a scalable vector operand!");
26605 SDLoc DL(V);
26606 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
26607 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
26608}
26609
26610// Convert all fixed length vector loads larger than NEON to masked_loads.
26611SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
26612 SDValue Op, SelectionDAG &DAG) const {
26613 auto Load = cast<LoadSDNode>(Op);
26614
26615 SDLoc DL(Op);
26616 EVT VT = Op.getValueType();
26617 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26618 EVT LoadVT = ContainerVT;
26619 EVT MemVT = Load->getMemoryVT();
26620
26621 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
26622
26623 if (VT.isFloatingPoint()) {
26624 LoadVT = ContainerVT.changeTypeToInteger();
26625 MemVT = MemVT.changeTypeToInteger();
26626 }
26627
26628 SDValue NewLoad = DAG.getMaskedLoad(
26629 LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
26630 DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(),
26631 Load->getAddressingMode(), Load->getExtensionType());
26632
26633 SDValue Result = NewLoad;
26634 if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
26635 EVT ExtendVT = ContainerVT.changeVectorElementType(
26636 Load->getMemoryVT().getVectorElementType());
26637
26638 Result = getSVESafeBitCast(ExtendVT, Result, DAG);
26639    Result = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
26640                         Pg, Result, DAG.getUNDEF(ContainerVT));
26641 } else if (VT.isFloatingPoint()) {
26642 Result = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Result);
26643 }
26644
26645 Result = convertFromScalableVector(DAG, VT, Result);
26646 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
26647 return DAG.getMergeValues(MergedValues, DL);
26648}
26649
26650static SDValue convertFixedMaskToScalableVector(SDValue Mask,
26651                                                SelectionDAG &DAG) {
26652 SDLoc DL(Mask);
26653 EVT InVT = Mask.getValueType();
26654 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
26655
26656 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
26657
26658 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
26659 return Pg;
26660
26661 auto Op1 = convertToScalableVector(DAG, ContainerVT, Mask);
26662 auto Op2 = DAG.getConstant(0, DL, ContainerVT);
26663
26664  return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, Pg.getValueType(),
26665                     {Pg, Op1, Op2, DAG.getCondCode(ISD::SETNE)});
26666}
26667
26668// Convert all fixed length vector loads larger than NEON to masked_loads.
26669SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
26670 SDValue Op, SelectionDAG &DAG) const {
26671 auto Load = cast<MaskedLoadSDNode>(Op);
26672
26673 SDLoc DL(Op);
26674 EVT VT = Op.getValueType();
26675 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26676
26677 SDValue Mask = Load->getMask();
26678 // If this is an extending load and the mask type is not the same as
26679  // the load's type then we have to extend the mask type.
26680 if (VT.getScalarSizeInBits() > Mask.getValueType().getScalarSizeInBits()) {
26681 assert(Load->getExtensionType() != ISD::NON_EXTLOAD &&
26682 "Incorrect mask type");
26683 Mask = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Mask);
26684 }
26685  Mask = convertFixedMaskToScalableVector(Mask, DAG);
26686
26687 SDValue PassThru;
26688 bool IsPassThruZeroOrUndef = false;
26689
26690 if (Load->getPassThru()->isUndef()) {
26691 PassThru = DAG.getUNDEF(ContainerVT);
26692 IsPassThruZeroOrUndef = true;
26693 } else {
26694 if (ContainerVT.isInteger())
26695 PassThru = DAG.getConstant(0, DL, ContainerVT);
26696 else
26697 PassThru = DAG.getConstantFP(0, DL, ContainerVT);
26698 if (isZerosVector(Load->getPassThru().getNode()))
26699 IsPassThruZeroOrUndef = true;
26700 }
26701
26702 SDValue NewLoad = DAG.getMaskedLoad(
26703 ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
26704 Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(),
26705 Load->getAddressingMode(), Load->getExtensionType());
26706
26707 SDValue Result = NewLoad;
26708 if (!IsPassThruZeroOrUndef) {
26709 SDValue OldPassThru =
26710 convertToScalableVector(DAG, ContainerVT, Load->getPassThru());
26711 Result = DAG.getSelect(DL, ContainerVT, Mask, Result, OldPassThru);
26712 }
26713
26714 Result = convertFromScalableVector(DAG, VT, Result);
26715 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
26716 return DAG.getMergeValues(MergedValues, DL);
26717}
26718
26719// Convert all fixed length vector stores larger than NEON to masked_stores.
26720SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
26721 SDValue Op, SelectionDAG &DAG) const {
26722 auto Store = cast<StoreSDNode>(Op);
26723
26724 SDLoc DL(Op);
26725 EVT VT = Store->getValue().getValueType();
26726 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26727 EVT MemVT = Store->getMemoryVT();
26728
26729 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
26730 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
26731
26732 if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
26733 EVT TruncVT = ContainerVT.changeVectorElementType(
26734 Store->getMemoryVT().getVectorElementType());
26735 MemVT = MemVT.changeTypeToInteger();
26736 NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
26737 NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
26738 DAG.getUNDEF(TruncVT));
26739 NewValue =
26740 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
26741 } else if (VT.isFloatingPoint()) {
26742 MemVT = MemVT.changeTypeToInteger();
26743 NewValue =
26744 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
26745 }
26746
26747 return DAG.getMaskedStore(Store->getChain(), DL, NewValue,
26748 Store->getBasePtr(), Store->getOffset(), Pg, MemVT,
26749 Store->getMemOperand(), Store->getAddressingMode(),
26750 Store->isTruncatingStore());
26751}
26752
26753SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
26754 SDValue Op, SelectionDAG &DAG) const {
26755 auto *Store = cast<MaskedStoreSDNode>(Op);
26756
26757 SDLoc DL(Op);
26758 EVT VT = Store->getValue().getValueType();
26759 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26760
26761 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
26762  SDValue Mask = convertFixedMaskToScalableVector(Store->getMask(), DAG);
26763
26764 return DAG.getMaskedStore(
26765 Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
26766 Mask, Store->getMemoryVT(), Store->getMemOperand(),
26767 Store->getAddressingMode(), Store->isTruncatingStore());
26768}
26769
26770SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
26771 SDValue Op, SelectionDAG &DAG) const {
26772 SDLoc dl(Op);
26773 EVT VT = Op.getValueType();
26774 EVT EltVT = VT.getVectorElementType();
26775
26776 bool Signed = Op.getOpcode() == ISD::SDIV;
26777 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
26778
26779 bool Negated;
26780 uint64_t SplatVal;
26781 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
26782 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26783 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
26784 SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32);
26785
26786 SDValue Pg = getPredicateForFixedLengthVector(DAG, dl, VT);
26787 SDValue Res =
26788 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, ContainerVT, Pg, Op1, Op2);
26789 if (Negated)
26790 Res = DAG.getNode(ISD::SUB, dl, ContainerVT,
26791 DAG.getConstant(0, dl, ContainerVT), Res);
26792
26793 return convertFromScalableVector(DAG, VT, Res);
26794 }
26795
26796 // Scalable vector i32/i64 DIV is supported.
26797 if (EltVT == MVT::i32 || EltVT == MVT::i64)
26798 return LowerToPredicatedOp(Op, DAG, PredOpcode);
26799
26800 // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
26801 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
26802 EVT PromVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
26803 unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
26804
26805 // If the wider type is legal: extend, op, and truncate.
26806 EVT WideVT = VT.widenIntegerVectorElementType(*DAG.getContext());
26807 if (DAG.getTargetLoweringInfo().isTypeLegal(WideVT)) {
26808 SDValue Op0 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(0));
26809 SDValue Op1 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(1));
26810 SDValue Div = DAG.getNode(Op.getOpcode(), dl, WideVT, Op0, Op1);
26811 return DAG.getNode(ISD::TRUNCATE, dl, VT, Div);
26812 }
26813
26814 auto HalveAndExtendVector = [&DAG, &dl, &HalfVT, &PromVT,
26815 &ExtendOpcode](SDValue Op) {
26816 SDValue IdxZero = DAG.getConstant(0, dl, MVT::i64);
26817 SDValue IdxHalf =
26818 DAG.getConstant(HalfVT.getVectorNumElements(), dl, MVT::i64);
26819 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxZero);
26820 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxHalf);
26821 return std::pair<SDValue, SDValue>(
26822 {DAG.getNode(ExtendOpcode, dl, PromVT, Lo),
26823 DAG.getNode(ExtendOpcode, dl, PromVT, Hi)});
26824 };
26825
26826 // If wider type is not legal: split, extend, op, trunc and concat.
26827 auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(0));
26828 auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(1));
26829 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0LoExt, Op1LoExt);
26830 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0HiExt, Op1HiExt);
26831 SDValue LoTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Lo);
26832 SDValue HiTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Hi);
26833 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, {LoTrunc, HiTrunc});
26834}
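// Worked example (illustrative): for a fixed-length v16i8 sdiv, if v16i16 is
// legal both operands are sign-extended to v16i16, divided (recursively
// promoted again until an i32/i64 element type is reached) and truncated back;
// otherwise each operand is split into two v8i8 halves, extended to v8i16,
// divided, truncated and concatenated back into a v16i8 result.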
26835
26836SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
26837 SDValue Op, SelectionDAG &DAG) const {
26838 EVT VT = Op.getValueType();
26839 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
26840
26841 SDLoc DL(Op);
26842 SDValue Val = Op.getOperand(0);
26843 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
26844 Val = convertToScalableVector(DAG, ContainerVT, Val);
26845
26846 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
26847 unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
26848
26849 // Repeatedly unpack Val until the result is of the desired element type.
26850 switch (ContainerVT.getSimpleVT().SimpleTy) {
26851 default:
26852 llvm_unreachable("unimplemented container type");
26853 case MVT::nxv16i8:
26854 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
26855 if (VT.getVectorElementType() == MVT::i16)
26856 break;
26857 [[fallthrough]];
26858 case MVT::nxv8i16:
26859 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
26860 if (VT.getVectorElementType() == MVT::i32)
26861 break;
26862 [[fallthrough]];
26863 case MVT::nxv4i32:
26864 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
26865 assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
26866 break;
26867 }
26868
26869 return convertFromScalableVector(DAG, VT, Val);
26870}
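// For example (illustrative): sign-extending i8 data held in an nxv16i8
// container up to i64 elements walks the chain
//   nxv16i8 --SUNPKLO--> nxv8i16 --SUNPKLO--> nxv4i32 --SUNPKLO--> nxv2i64
// taking the low half of the lanes and widening them at each step.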
26871
26872SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
26873 SDValue Op, SelectionDAG &DAG) const {
26874 EVT VT = Op.getValueType();
26875 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
26876
26877 SDLoc DL(Op);
26878 SDValue Val = Op.getOperand(0);
26879 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
26880 Val = convertToScalableVector(DAG, ContainerVT, Val);
26881
26882 // Repeatedly truncate Val until the result is of the desired element type.
26883 switch (ContainerVT.getSimpleVT().SimpleTy) {
26884 default:
26885 llvm_unreachable("unimplemented container type");
26886 case MVT::nxv2i64:
26887 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
26888 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
26889 if (VT.getVectorElementType() == MVT::i32)
26890 break;
26891 [[fallthrough]];
26892 case MVT::nxv4i32:
26893 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
26894 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
26895 if (VT.getVectorElementType() == MVT::i16)
26896 break;
26897 [[fallthrough]];
26898 case MVT::nxv8i16:
26899 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
26900 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
26901 assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
26902 break;
26903 }
26904
26905 return convertFromScalableVector(DAG, VT, Val);
26906}
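// For example (illustrative): truncating nxv2i64 data down to i8 elements
// repeats bitcast + UZP1 with itself,
//   nxv2i64 -> nxv4i32 -> nxv8i16 -> nxv16i8,
// keeping the even (low-half) sub-elements at each step.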
26907
26908SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
26909 SDValue Op, SelectionDAG &DAG) const {
26910 EVT VT = Op.getValueType();
26911 EVT InVT = Op.getOperand(0).getValueType();
26912 assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");
26913
26914 SDLoc DL(Op);
26915 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
26916 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
26917
26918 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1));
26919}
26920
26921SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
26922 SDValue Op, SelectionDAG &DAG) const {
26923 EVT VT = Op.getValueType();
26924 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
26925
26926 SDLoc DL(Op);
26927 EVT InVT = Op.getOperand(0).getValueType();
26928 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
26929 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
26930
26931 auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0,
26932 Op.getOperand(1), Op.getOperand(2));
26933
26934 return convertFromScalableVector(DAG, VT, ScalableRes);
26935}
26936
26937// Convert vector operation 'Op' to an equivalent predicated operation whereby
26938// the original operation's type is used to construct a suitable predicate.
26939// NOTE: The results for inactive lanes are undefined.
26940SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
26941 SelectionDAG &DAG,
26942 unsigned NewOp) const {
26943 EVT VT = Op.getValueType();
26944 SDLoc DL(Op);
26945 auto Pg = getPredicateForVector(DAG, DL, VT);
26946
26947 if (VT.isFixedLengthVector()) {
26948 assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
26949 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
26950
26951 // Create list of operands by converting existing ones to scalable types.
26952    SmallVector<SDValue, 4> Operands;
26953    for (const SDValue &V : Op->op_values()) {
26954 if (isa<CondCodeSDNode>(V)) {
26955 Operands.push_back(V);
26956 continue;
26957 }
26958
26959 if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
26960 EVT VTArg = VTNode->getVT().getVectorElementType();
26961 EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg);
26962 Operands.push_back(DAG.getValueType(NewVTArg));
26963 continue;
26964 }
26965
26966 assert(isTypeLegal(V.getValueType()) &&
26967 "Expected only legal fixed-width types");
26968 Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
26969 }
26970
26971 if (isMergePassthruOpcode(NewOp))
26972 Operands.push_back(DAG.getUNDEF(ContainerVT));
26973
26974 auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
26975 return convertFromScalableVector(DAG, VT, ScalableRes);
26976 }
26977
26978 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
26979
26981 for (const SDValue &V : Op->op_values()) {
26982 assert((!V.getValueType().isVector() ||
26983 V.getValueType().isScalableVector()) &&
26984 "Only scalable vectors are supported!");
26985 Operands.push_back(V);
26986 }
26987
26988 if (isMergePassthruOpcode(NewOp))
26989 Operands.push_back(DAG.getUNDEF(VT));
26990
26991 return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
26992}
26993
26994// If a fixed length vector operation has no side effects when applied to
26995// undefined elements, we can safely use scalable vectors to perform the same
26996// operation without needing to worry about predication.
26997SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
26998 SelectionDAG &DAG) const {
26999 EVT VT = Op.getValueType();
27001 "Only expected to lower fixed length vector operation!");
27002 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27003
27004 // Create list of operands by converting existing ones to scalable types.
27005  SmallVector<SDValue, 4> Ops;
27006  for (const SDValue &V : Op->op_values()) {
27007 assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
27008
27009 // Pass through non-vector operands.
27010 if (!V.getValueType().isVector()) {
27011 Ops.push_back(V);
27012 continue;
27013 }
27014
27015 // "cast" fixed length vector to a scalable vector.
27016 assert(V.getValueType().isFixedLengthVector() &&
27017 isTypeLegal(V.getValueType()) &&
27018 "Only fixed length vectors are supported!");
27019 Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
27020 }
27021
27022 auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
27023 return convertFromScalableVector(DAG, VT, ScalableRes);
27024}
27025
27026SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
27027 SelectionDAG &DAG) const {
27028 SDLoc DL(ScalarOp);
27029 SDValue AccOp = ScalarOp.getOperand(0);
27030 SDValue VecOp = ScalarOp.getOperand(1);
27031 EVT SrcVT = VecOp.getValueType();
27032 EVT ResVT = SrcVT.getVectorElementType();
27033
27034 EVT ContainerVT = SrcVT;
27035 if (SrcVT.isFixedLengthVector()) {
27036 ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
27037 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
27038 }
27039
27040 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
27041 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
27042
27043 // Convert operands to Scalable.
27044 AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
27045 DAG.getUNDEF(ContainerVT), AccOp, Zero);
27046
27047 // Perform reduction.
27048 SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
27049 Pg, AccOp, VecOp);
27050
27051 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
27052}
27053
27054SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
27055 SelectionDAG &DAG) const {
27056 SDLoc DL(ReduceOp);
27057 SDValue Op = ReduceOp.getOperand(0);
27058 EVT OpVT = Op.getValueType();
27059 EVT VT = ReduceOp.getValueType();
27060
27061 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
27062 return SDValue();
27063
27064 SDValue Pg = getPredicateForVector(DAG, DL, OpVT);
27065
27066 switch (ReduceOp.getOpcode()) {
27067 default:
27068 return SDValue();
27069 case ISD::VECREDUCE_OR:
27070 if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1)
27071 // The predicate can be 'Op' because
27072 // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
27073 return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE);
27074 else
27075 return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
27076 case ISD::VECREDUCE_AND: {
27077 Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
27078 return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
27079 }
27080 case ISD::VECREDUCE_XOR: {
27081 SDValue ID =
27082 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
27083 if (OpVT == MVT::nxv1i1) {
27084 // Emulate a CNTP on .Q using .D and a different governing predicate.
27085 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg);
27086 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op);
27087 }
27088 SDValue Cntp =
27089 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
27090 return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
27091 }
27092 }
27093
27094 return SDValue();
27095}
27096
27097SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
27098 SDValue ScalarOp,
27099 SelectionDAG &DAG) const {
27100 SDLoc DL(ScalarOp);
27101 SDValue VecOp = ScalarOp.getOperand(0);
27102 EVT SrcVT = VecOp.getValueType();
27103
27105 SrcVT,
27106 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
27107 EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
27108 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
27109 }
27110
27111 // UADDV always returns an i64 result.
27112 EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
27113 SrcVT.getVectorElementType();
27114 EVT RdxVT = SrcVT;
27115 if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
27116 RdxVT = getPackedSVEVectorVT(ResVT);
27117
27118 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
27119 SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
27120 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
27121 Rdx, DAG.getConstant(0, DL, MVT::i64));
27122
27123 // The VEC_REDUCE nodes expect an element size result.
27124 if (ResVT != ScalarOp.getValueType())
27125 Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());
27126
27127 return Res;
27128}
27129
27130SDValue
27131AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
27132 SelectionDAG &DAG) const {
27133 EVT VT = Op.getValueType();
27134 SDLoc DL(Op);
27135
27136 EVT InVT = Op.getOperand(1).getValueType();
27137 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
27138 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
27139 SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));
27140
27141  // Convert the mask to a predicate (NOTE: We don't need to worry about
27142 // inactive lanes since VSELECT is safe when given undefined elements).
27143 EVT MaskVT = Op.getOperand(0).getValueType();
27144 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
27145 auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
27146  Mask = DAG.getNode(ISD::TRUNCATE, DL,
27147                     MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
27148
27149 auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
27150 Mask, Op1, Op2);
27151
27152 return convertFromScalableVector(DAG, VT, ScalableRes);
27153}
27154
27155SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
27156 SDValue Op, SelectionDAG &DAG) const {
27157 SDLoc DL(Op);
27158 EVT InVT = Op.getOperand(0).getValueType();
27159 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
27160
27161 assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) &&
27162 "Only expected to lower fixed length vector operation!");
27163 assert(Op.getValueType() == InVT.changeTypeToInteger() &&
27164 "Expected integer result of the same bit length as the inputs!");
27165
27166 auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
27167 auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
27168 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
27169
27170 EVT CmpVT = Pg.getValueType();
27171 auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
27172 {Pg, Op1, Op2, Op.getOperand(2)});
27173
27174 EVT PromoteVT = ContainerVT.changeTypeToInteger();
27175 auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
27176 return convertFromScalableVector(DAG, Op.getValueType(), Promote);
27177}
27178
27179SDValue
27180AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
27181 SelectionDAG &DAG) const {
27182 SDLoc DL(Op);
27183 auto SrcOp = Op.getOperand(0);
27184 EVT VT = Op.getValueType();
27185 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
27186 EVT ContainerSrcVT =
27187 getContainerForFixedLengthVector(DAG, SrcOp.getValueType());
27188
27189 SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
27190 Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
27191 return convertFromScalableVector(DAG, VT, Op);
27192}
27193
27194SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
27195 SDValue Op, SelectionDAG &DAG) const {
27196 SDLoc DL(Op);
27197 unsigned NumOperands = Op->getNumOperands();
27198
27199 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
27200 "Unexpected number of operands in CONCAT_VECTORS");
27201
27202 auto SrcOp1 = Op.getOperand(0);
27203 auto SrcOp2 = Op.getOperand(1);
27204 EVT VT = Op.getValueType();
27205 EVT SrcVT = SrcOp1.getValueType();
27206
27207 if (NumOperands > 2) {
27208    SmallVector<SDValue, 4> Ops;
27209    EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
27210 for (unsigned I = 0; I < NumOperands; I += 2)
27211 Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT,
27212 Op->getOperand(I), Op->getOperand(I + 1)));
27213
27214 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
27215 }
27216
27217 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27218
27220 SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1);
27221 SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2);
27222
27223 Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2);
27224
27225 return convertFromScalableVector(DAG, VT, Op);
27226}
27227
27228SDValue
27229AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
27230 SelectionDAG &DAG) const {
27231 EVT VT = Op.getValueType();
27232 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27233
27234 SDLoc DL(Op);
27235 SDValue Val = Op.getOperand(0);
27236 SDValue Pg = getPredicateForVector(DAG, DL, VT);
27237 EVT SrcVT = Val.getValueType();
27238 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27239 EVT ExtendVT = ContainerVT.changeVectorElementType(
27240 SrcVT.getVectorElementType());
27241
27242 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
27243 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);
27244
27245 Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
27246 Val = getSVESafeBitCast(ExtendVT, Val, DAG);
27247 Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
27248 Pg, Val, DAG.getUNDEF(ContainerVT));
27249
27250 return convertFromScalableVector(DAG, VT, Val);
27251}
27252
27253SDValue
27254AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
27255 SelectionDAG &DAG) const {
27256 EVT VT = Op.getValueType();
27257 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27258
27259 SDLoc DL(Op);
27260 SDValue Val = Op.getOperand(0);
27261 EVT SrcVT = Val.getValueType();
27262 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
27263 EVT RoundVT = ContainerSrcVT.changeVectorElementType(
27264      VT.getVectorElementType());
27265  SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);
27266
27267 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
27268 Val = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val,
27269 Op.getOperand(1), DAG.getUNDEF(RoundVT));
27270 Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG);
27271 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
27272
27273 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
27274 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
27275}
27276
27277SDValue
27278AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
27279 SelectionDAG &DAG) const {
27280 EVT VT = Op.getValueType();
27281 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27282
27283 bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
27284 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
27285                               : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
27286
27287 SDLoc DL(Op);
27288 SDValue Val = Op.getOperand(0);
27289 EVT SrcVT = Val.getValueType();
27290 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
27291 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
27292
27293 if (VT.bitsGE(SrcVT)) {
27295
27296 Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
27297 VT.changeTypeToInteger(), Val);
27298
27299 // Safe to use a larger than specified operand because by promoting the
27300 // value nothing has changed from an arithmetic point of view.
27301 Val =
27302 convertToScalableVector(DAG, ContainerDstVT.changeTypeToInteger(), Val);
27303 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
27304 DAG.getUNDEF(ContainerDstVT));
27305 return convertFromScalableVector(DAG, VT, Val);
27306 } else {
27307 EVT CvtVT = ContainerSrcVT.changeVectorElementType(
27308 ContainerDstVT.getVectorElementType());
27310
27311 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
27312 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
27313 Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG);
27314 Val = convertFromScalableVector(DAG, SrcVT, Val);
27315
27316 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
27317 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
27318 }
27319}
27320
27321SDValue
27322AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
27323 SelectionDAG &DAG) const {
27324 SDLoc DL(Op);
27325 EVT OpVT = Op.getValueType();
27326 assert(OpVT.isScalableVector() &&
27327 "Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
27328 SDValue Even = DAG.getNode(AArch64ISD::UZP1, DL, OpVT, Op.getOperand(0),
27329 Op.getOperand(1));
27330 SDValue Odd = DAG.getNode(AArch64ISD::UZP2, DL, OpVT, Op.getOperand(0),
27331 Op.getOperand(1));
27332 return DAG.getMergeValues({Even, Odd}, DL);
27333}
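// Conceptually (illustrative): if the two operands hold the interleaved
// sequence <x0, x1, x2, x3> and <x4, x5, x6, x7>, UZP1 selects the
// even-indexed elements <x0, x2, x4, x6> and UZP2 the odd-indexed elements
// <x1, x3, x5, x7>, which are the two de-interleaved results.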
27334
27335SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
27336 SelectionDAG &DAG) const {
27337 SDLoc DL(Op);
27338 EVT OpVT = Op.getValueType();
27339 assert(OpVT.isScalableVector() &&
27340 "Expected scalable vector in LowerVECTOR_INTERLEAVE.");
27341
27342 SDValue Lo = DAG.getNode(AArch64ISD::ZIP1, DL, OpVT, Op.getOperand(0),
27343 Op.getOperand(1));
27344 SDValue Hi = DAG.getNode(AArch64ISD::ZIP2, DL, OpVT, Op.getOperand(0),
27345 Op.getOperand(1));
27346 return DAG.getMergeValues({Lo, Hi}, DL);
27347}
27348
27349SDValue
27350AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
27351 SelectionDAG &DAG) const {
27352 EVT VT = Op.getValueType();
27353 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27354
27355 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
27356 unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
27357                               : AArch64ISD::FCVTZU_MERGE_PASSTHRU;
27358
27359 SDLoc DL(Op);
27360 SDValue Val = Op.getOperand(0);
27361 EVT SrcVT = Val.getValueType();
27362 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
27363 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
27364
27365 if (VT.bitsGT(SrcVT)) {
27366 EVT CvtVT = ContainerDstVT.changeVectorElementType(
27367 ContainerSrcVT.getVectorElementType());
27369
27370 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
27371 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val);
27372
27373 Val = convertToScalableVector(DAG, ContainerDstVT, Val);
27374 Val = getSVESafeBitCast(CvtVT, Val, DAG);
27375 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
27376 DAG.getUNDEF(ContainerDstVT));
27377 return convertFromScalableVector(DAG, VT, Val);
27378 } else {
27379 EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
27381
27382 // Safe to use a larger than specified result since an fp_to_int where the
27383 // result doesn't fit into the destination is undefined.
27384 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
27385 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
27386 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
27387
27388 return DAG.getNode(ISD::TRUNCATE, DL, VT, Val);
27389 }
27390}
27391
27392static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2,
27393                                         ArrayRef<int> ShuffleMask, EVT VT,
27394 EVT ContainerVT, SelectionDAG &DAG) {
27395 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
27396 SDLoc DL(Op);
27397 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
27398 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
27399 bool IsSingleOp =
27400 ShuffleVectorInst::isSingleSourceMask(ShuffleMask, ShuffleMask.size());
27401
27402 if (!Subtarget.isNeonAvailable() && !MinSVESize)
27403 MinSVESize = 128;
27404
27405  // Bail out on two-operand shuffles if SVE2 is unavailable or if not all
27406  // index values can be represented.
27407 if (!IsSingleOp && !Subtarget.hasSVE2())
27408 return SDValue();
27409
27410 EVT VTOp1 = Op.getOperand(0).getValueType();
27411 unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits();
27412 unsigned IndexLen = MinSVESize / BitsPerElt;
27413 unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
27414 uint64_t MaxOffset = APInt(BitsPerElt, -1, false).getZExtValue();
27415 EVT MaskEltType = VTOp1.getVectorElementType().changeTypeToInteger();
27416 EVT MaskType = EVT::getVectorVT(*DAG.getContext(), MaskEltType, IndexLen);
27417 bool MinMaxEqual = (MinSVESize == MaxSVESize);
27418 assert(ElementsPerVectorReg <= IndexLen && ShuffleMask.size() <= IndexLen &&
27419 "Incorrectly legalised shuffle operation");
27420
27422 // If MinSVESize is not equal to MaxSVESize then we need to know which
27423 // TBL mask element needs adjustment.
27424 SmallVector<SDValue, 8> AddRuntimeVLMask;
27425
27426  // Bail out for 8-bit element types, because with a 2048-bit SVE register
27427  // size 8 bits are only sufficient to index into the first source vector.
27428 if (!IsSingleOp && !MinMaxEqual && BitsPerElt == 8)
27429 return SDValue();
27430
27431 for (int Index : ShuffleMask) {
27432 // Handling poison index value.
27433 if (Index < 0)
27434 Index = 0;
27435 // If the mask refers to elements in the second operand, then we have to
27436    // offset the index by the number of elements in a vector. If this number
27437 // is not known at compile-time, we need to maintain a mask with 'VL' values
27438 // to add at runtime.
27439 if ((unsigned)Index >= ElementsPerVectorReg) {
27440 if (MinMaxEqual) {
27441 Index += IndexLen - ElementsPerVectorReg;
27442 } else {
27443 Index = Index - ElementsPerVectorReg;
27444 AddRuntimeVLMask.push_back(DAG.getConstant(1, DL, MVT::i64));
27445 }
27446 } else if (!MinMaxEqual)
27447 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
27448    // For 8-bit elements and a 1024-bit SVE register, where MaxOffset equals
27449    // 255, this might point to the last element in the second operand
27450    // of the shufflevector, so we reject this transform.
27451 if ((unsigned)Index >= MaxOffset)
27452 return SDValue();
27453 TBLMask.push_back(DAG.getConstant(Index, DL, MVT::i64));
27454 }
27455
27456  // Pad with out-of-range indices so the corresponding lanes are zeroed; a
27457  // zero index would instead duplicate the first lane into those elements.
27458  // Note that for i8 elements an out-of-range index can still be a valid
27459  // index when the vector register size is 2048 bits.
27460 for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i) {
27461 TBLMask.push_back(DAG.getConstant((int)MaxOffset, DL, MVT::i64));
27462 if (!MinMaxEqual)
27463 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
27464 }
27465
27466 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskType);
27467 SDValue VecMask =
27468 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
27469 SDValue SVEMask = convertToScalableVector(DAG, MaskContainerVT, VecMask);
27470
27471 SDValue Shuffle;
27472 if (IsSingleOp)
27473 Shuffle =
27474 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
27475 DAG.getConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32),
27476 Op1, SVEMask);
27477 else if (Subtarget.hasSVE2()) {
27478 if (!MinMaxEqual) {
27479 unsigned MinNumElts = AArch64::SVEBitsPerBlock / BitsPerElt;
27480 SDValue VScale = (BitsPerElt == 64)
27481 ? DAG.getVScale(DL, MVT::i64, APInt(64, MinNumElts))
27482 : DAG.getVScale(DL, MVT::i32, APInt(32, MinNumElts));
27483 SDValue VecMask =
27484 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
27485 SDValue MulByMask = DAG.getNode(
27486 ISD::MUL, DL, MaskType,
27487 DAG.getNode(ISD::SPLAT_VECTOR, DL, MaskType, VScale),
27488 DAG.getBuildVector(MaskType, DL,
27489 ArrayRef(AddRuntimeVLMask.data(), IndexLen)));
27490 SDValue UpdatedVecMask =
27491 DAG.getNode(ISD::ADD, DL, MaskType, VecMask, MulByMask);
27492 SVEMask = convertToScalableVector(
27493 DAG, getContainerForFixedLengthVector(DAG, MaskType), UpdatedVecMask);
27494 }
27495 Shuffle =
27496 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
27497 DAG.getConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32),
27498 Op1, Op2, SVEMask);
27499 }
27500 Shuffle = convertFromScalableVector(DAG, VT, Shuffle);
27501 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
27502}
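// Worked example (illustrative, assuming MinSVESize == MaxSVESize == 256 and
// v4i32 operands, so ElementsPerVectorReg = 4 and IndexLen = 8): a shuffle
// index of 5 selects element 1 of the second operand, so its TBL mask entry
// becomes 5 + (8 - 4) = 9, addressing that element in the <Op1, Op2> register
// pair consumed by the sve.tbl2 intrinsic; the unused trailing mask lanes are
// filled with MaxOffset so that TBL zeroes them.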
27503
27504SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
27505 SDValue Op, SelectionDAG &DAG) const {
27506 EVT VT = Op.getValueType();
27507 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27508
27509 auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
27510 auto ShuffleMask = SVN->getMask();
27511
27512 SDLoc DL(Op);
27513 SDValue Op1 = Op.getOperand(0);
27514 SDValue Op2 = Op.getOperand(1);
27515
27516 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27517 Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
27518 Op2 = convertToScalableVector(DAG, ContainerVT, Op2);
27519
27520 auto MinLegalExtractEltScalarTy = [](EVT ScalarTy) -> EVT {
27521 if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16)
27522 return MVT::i32;
27523 return ScalarTy;
27524 };
27525
27526 if (SVN->isSplat()) {
27527 unsigned Lane = std::max(0, SVN->getSplatIndex());
27528 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
27529 SDValue SplatEl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
27530 DAG.getConstant(Lane, DL, MVT::i64));
27531 Op = DAG.getNode(ISD::SPLAT_VECTOR, DL, ContainerVT, SplatEl);
27532 return convertFromScalableVector(DAG, VT, Op);
27533 }
27534
27535 bool ReverseEXT = false;
27536 unsigned Imm;
27537 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
27538 Imm == VT.getVectorNumElements() - 1) {
27539 if (ReverseEXT)
27540 std::swap(Op1, Op2);
27541 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
27542 SDValue Scalar = DAG.getNode(
27543 ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
27544 DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
27545 Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar);
27546 return convertFromScalableVector(DAG, VT, Op);
27547 }
27548
27549 unsigned EltSize = VT.getScalarSizeInBits();
27550 for (unsigned LaneSize : {64U, 32U, 16U}) {
27551 if (isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), LaneSize)) {
27552      EVT NewVT =
27553          getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), LaneSize));
27554      unsigned RevOp;
27555      if (EltSize == 8)
27556        RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU;
27557      else if (EltSize == 16)
27558        RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
27559      else
27560        RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;
27561
27562 Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
27563 Op = LowerToPredicatedOp(Op, DAG, RevOp);
27564 Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
27565 return convertFromScalableVector(DAG, VT, Op);
27566 }
27567 }
27568
27569 if (Subtarget->hasSVE2p1() && EltSize == 64 &&
27570 isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), 128)) {
27571 if (!VT.isFloatingPoint())
27572 return LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
27573
27575 Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
27576 Op = LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
27577 Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
27578 return convertFromScalableVector(DAG, VT, Op);
27579 }
27580
27581 unsigned WhichResult;
27582 if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult) &&
27583 WhichResult == 0)
27584    return convertFromScalableVector(
27585        DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op2));
27586
27587 if (isTRNMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
27588 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
27589    return convertFromScalableVector(
27590        DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
27591 }
27592
27593 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
27594    return convertFromScalableVector(
27595        DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op1));
27596
27597 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
27598 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
27599    return convertFromScalableVector(
27600        DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
27601 }
27602
27603  // Functions like isZIPMask return true when an ISD::VECTOR_SHUFFLE's mask
27604 // represents the same logical operation as performed by a ZIP instruction. In
27605 // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly
27606 // equivalent to an AArch64 instruction. There's the extra component of
27607 // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions
27608 // only operated on 64/128bit vector types that have a direct mapping to a
27609 // target register and so an exact mapping is implied.
27610 // However, when using SVE for fixed length vectors, most legal vector types
27611 // are actually sub-vectors of a larger SVE register. When mapping
27612 // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider
27613 // how the mask's indices translate. Specifically, when the mapping requires
27614 // an exact meaning for a specific vector index (e.g. Index X is the last
27615 // vector element in the register) then such mappings are often only safe when
27616  // the exact SVE register size is known. The main exception to this is when
27617 // indices are logically relative to the first element of either
27618 // ISD::VECTOR_SHUFFLE operand because these relative indices don't change
27619 // when converting from fixed-length to scalable vector types (i.e. the start
27620 // of a fixed length vector is always the start of a scalable vector).
27621 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
27622 unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
27623 if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
27624 if (ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size()) &&
27625 Op2.isUndef()) {
27626 Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
27627 return convertFromScalableVector(DAG, VT, Op);
27628 }
27629
27630 if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult) &&
27631 WhichResult != 0)
27632      return convertFromScalableVector(
27633          DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op2));
27634
27635 if (isUZPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
27636 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
27637      return convertFromScalableVector(
27638          DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
27639 }
27640
27641 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
27642      return convertFromScalableVector(
27643          DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op1));
27644
27645 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
27646 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
27647      return convertFromScalableVector(
27648          DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
27649 }
27650 }
27651
27652  // Avoid producing a TBL instruction if we don't know the minimum SVE register
27653  // size, unless NEON is not available and we can assume the minimum SVE
27654  // register size is 128 bits.
27655 if (MinSVESize || !Subtarget->isNeonAvailable())
27656 return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT,
27657 DAG);
27658
27659 return SDValue();
27660}
27661
27662SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
27663 SelectionDAG &DAG) const {
27664 SDLoc DL(Op);
27665 EVT InVT = Op.getValueType();
27666
27667 assert(VT.isScalableVector() && isTypeLegal(VT) &&
27668 InVT.isScalableVector() && isTypeLegal(InVT) &&
27669 "Only expect to cast between legal scalable vector types!");
27670 assert(VT.getVectorElementType() != MVT::i1 &&
27671 InVT.getVectorElementType() != MVT::i1 &&
27672 "For predicate bitcasts, use getSVEPredicateBitCast");
27673
27674 if (InVT == VT)
27675 return Op;
27676
27677 EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType());
27678 EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
27679
27680 // Safe bitcasting between unpacked vector types of different element counts
27681 // is currently unsupported because the following is missing the necessary
27682 // work to ensure the result's elements live where they're supposed to within
27683 // an SVE register.
27684 // 01234567
27685 // e.g. nxv2i32 = XX??XX??
27686 // nxv4f16 = X?X?X?X?
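// For illustration, casting nxv2f32 to nxv2i64 (equal element counts) first
// REINTERPRET_CASTs the unpacked nxv2f32 input to the packed nxv4f32, then
// bitcasts that to nxv2i64, which is already packed so no unpack is needed.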
27687 assert((VT.getVectorElementCount() == InVT.getVectorElementCount() ||
27688 VT == PackedVT || InVT == PackedInVT) &&
27689 "Unexpected bitcast!");
27690
27691 // Pack input if required.
27692 if (InVT != PackedInVT)
27693 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);
27694
27695 Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
27696
27697 // Unpack result if required.
27698 if (VT != PackedVT)
27699 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
27700
27701 return Op;
27702}
27703
27704 bool AArch64TargetLowering::isAllActivePredicate(SelectionDAG &DAG,
27705 SDValue N) const {
27706 return ::isAllActivePredicate(DAG, N);
27707}
27708
27709 EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const {
27710 return ::getPromotedVTForPredicate(VT);
27711}
27712
27713bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
27714 SDValue Op, const APInt &OriginalDemandedBits,
27715 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
27716 unsigned Depth) const {
27717
27718 unsigned Opc = Op.getOpcode();
27719 switch (Opc) {
27720 case AArch64ISD::VSHL: {
27721 // Match (VSHL (VLSHR Val X) X)
27722 SDValue ShiftL = Op;
27723 SDValue ShiftR = Op->getOperand(0);
27724 if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
27725 return false;
27726
27727 if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
27728 return false;
27729
27730 unsigned ShiftLBits = ShiftL->getConstantOperandVal(1);
27731 unsigned ShiftRBits = ShiftR->getConstantOperandVal(1);
27732
27733 // Other cases can be handled as well, but this is not
27734 // implemented.
27735 if (ShiftRBits != ShiftLBits)
27736 return false;
27737
27738 unsigned ScalarSize = Op.getScalarValueSizeInBits();
27739 assert(ScalarSize > ShiftLBits && "Invalid shift imm");
27740
27741 APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits);
27742 APInt UnusedBits = ~OriginalDemandedBits;
27743
27744 if ((ZeroBits & UnusedBits) != ZeroBits)
27745 return false;
27746
27747 // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
27748 // used - simplify to just Val.
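// e.g. for 32-bit lanes with X == 8 the shift pair clears the low 8 bits; if
// the caller only demands bits above bit 7, Val itself already satisfies the
// demanded bits and the (VSHL (VLSHR Val 8) 8) pair can be dropped.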
27749 return TLO.CombineTo(Op, ShiftR->getOperand(0));
27750 }
27751 case AArch64ISD::BICi: {
27752 // Fold BICi if all destination bits are already known to be zero.
27753 SDValue Op0 = Op.getOperand(0);
27754 KnownBits KnownOp0 =
27755 TLO.DAG.computeKnownBits(Op0, OriginalDemandedElts, Depth + 1);
27756 // Op0 &= ~(ConstantOperandVal(1) << ConstantOperandVal(2))
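// e.g. BICi with immediate 0xff and shift 8 clears bits 15:8; if those bits
// of Op0 are already known to be zero, the BICi is redundant and Op0 can be
// used directly.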
27757 uint64_t BitsToClear = Op->getConstantOperandVal(1)
27758 << Op->getConstantOperandVal(2);
27759 APInt AlreadyZeroedBitsToClear = BitsToClear & KnownOp0.Zero;
27760 if (APInt(Known.getBitWidth(), BitsToClear)
27761 .isSubsetOf(AlreadyZeroedBitsToClear))
27762 return TLO.CombineTo(Op, Op0);
27763
27764 Known = KnownOp0 &
27765 KnownBits::makeConstant(APInt(Known.getBitWidth(), ~BitsToClear));
27766
27767 return false;
27768 }
27769 case ISD::INTRINSIC_WO_CHAIN: {
27770 if (auto ElementSize = IsSVECntIntrinsic(Op)) {
27771 unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
27772 if (!MaxSVEVectorSizeInBits)
27773 MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
27774 unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize;
27775 // The SVE count intrinsics don't support the multiplier immediate so we
27776 // don't have to account for that here. The value returned may be slightly
27777 // over the true required bits, as this is based on the "ALL" pattern. The
27778 // other patterns are also exposed by these intrinsics, but they all
27779 // return a value that's strictly less than "ALL".
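// e.g. with the architectural maximum of 2048-bit registers, CNTW ("ALL")
// can return at most 2048/32 = 64, so only bit_width(64) = 7 low bits can be
// set and the remaining high bits of the i64 result are known zero.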
27780 unsigned RequiredBits = llvm::bit_width(MaxElements);
27781 unsigned BitWidth = Known.Zero.getBitWidth();
27782 if (RequiredBits < BitWidth)
27783 Known.Zero.setHighBits(BitWidth - RequiredBits);
27784 return false;
27785 }
27786 }
27787 }
27788
27789 return TargetLowering::SimplifyDemandedBitsForTargetNode(
27790 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
27791}
27792
27793bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
27794 return Op.getOpcode() == AArch64ISD::DUP ||
27795 Op.getOpcode() == AArch64ISD::MOVI ||
27796 (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
27797 Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
27798 TargetLowering::isTargetCanonicalConstantNode(Op);
27799}
27800
27801 bool AArch64TargetLowering::isComplexDeinterleavingSupported() const {
27802 return Subtarget->hasSVE() || Subtarget->hasSVE2() ||
27803 Subtarget->hasComplxNum();
27804}
27805
27806 bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
27807 ComplexDeinterleavingOperation Operation, Type *Ty) const {
27808 auto *VTy = dyn_cast<VectorType>(Ty);
27809 if (!VTy)
27810 return false;
27811
27812 // If the vector is scalable, SVE is enabled, implying support for complex
27813 // numbers. Otherwise, we need to ensure complex number support is available.
27814 if (!VTy->isScalableTy() && !Subtarget->hasComplxNum())
27815 return false;
27816
27817 auto *ScalarTy = VTy->getScalarType();
27818 unsigned NumElements = VTy->getElementCount().getKnownMinValue();
27819
27820 // We can only process vectors that have a bit size of 128 or higher (with an
27821 // additional 64 bits for Neon). Additionally, these vectors must have a
27822 // power-of-2 size, as we later split them into the smallest supported size
27823 // and merge them back together after applying the complex operation.
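// e.g. v4f32 (128 bits) and the Neon-sized v2f32 (64 bits) pass the width
// check below, while v3f32 (96 bits, not a power of 2) and scalable types
// narrower than 128 bits such as nxv2f32 are rejected.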
27824 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
27825 if ((VTyWidth < 128 && (VTy->isScalableTy() || VTyWidth != 64)) ||
27826 !llvm::isPowerOf2_32(VTyWidth))
27827 return false;
27828
27829 if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) {
27830 unsigned ScalarWidth = ScalarTy->getScalarSizeInBits();
27831 return 8 <= ScalarWidth && ScalarWidth <= 64;
27832 }
27833
27834 return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
27835 ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
27836}
27837
27838 Value *AArch64TargetLowering::createComplexDeinterleavingIR(
27839 IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
27840 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
27841 Value *Accumulator) const {
27842 VectorType *Ty = cast<VectorType>(InputA->getType());
27843 bool IsScalable = Ty->isScalableTy();
27844 bool IsInt = Ty->getElementType()->isIntegerTy();
27845
27846 unsigned TyWidth =
27847 Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue();
27848
27849 assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) &&
27850 "Vector type must be either 64 or a power of 2 that is at least 128");
27851
27852 if (TyWidth > 128) {
27853 int Stride = Ty->getElementCount().getKnownMinValue() / 2;
27854 auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
27855 auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, B.getInt64(0));
27856 auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, B.getInt64(0));
27857 auto *UpperSplitA =
27858 B.CreateExtractVector(HalfTy, InputA, B.getInt64(Stride));
27859 auto *UpperSplitB =
27860 B.CreateExtractVector(HalfTy, InputB, B.getInt64(Stride));
27861 Value *LowerSplitAcc = nullptr;
27862 Value *UpperSplitAcc = nullptr;
27863 if (Accumulator) {
27864 LowerSplitAcc = B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(0));
27865 UpperSplitAcc =
27866 B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(Stride));
27867 }
27868 auto *LowerSplitInt = createComplexDeinterleavingIR(
27869 B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
27870 auto *UpperSplitInt = createComplexDeinterleavingIR(
27871 B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
27872
27873 auto *Result = B.CreateInsertVector(Ty, PoisonValue::get(Ty), LowerSplitInt,
27874 B.getInt64(0));
27875 return B.CreateInsertVector(Ty, Result, UpperSplitInt, B.getInt64(Stride));
27876 }
27877
27878 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
27879 if (Accumulator == nullptr)
27880 Accumulator = Constant::getNullValue(Ty);
27881
27882 if (IsScalable) {
27883 if (IsInt)
27884 return B.CreateIntrinsic(
27885 Intrinsic::aarch64_sve_cmla_x, Ty,
27886 {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
27887
27888 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
27889 return B.CreateIntrinsic(
27890 Intrinsic::aarch64_sve_fcmla, Ty,
27891 {Mask, Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
27892 }
27893
27894 Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
27895 Intrinsic::aarch64_neon_vcmla_rot90,
27896 Intrinsic::aarch64_neon_vcmla_rot180,
27897 Intrinsic::aarch64_neon_vcmla_rot270};
27898
27899
27900 return B.CreateIntrinsic(IdMap[(int)Rotation], Ty,
27901 {Accumulator, InputA, InputB});
27902 }
27903
27904 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
27905 if (IsScalable) {
27906 if (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
27907 Rotation == ComplexDeinterleavingRotation::Rotation_270) {
27908 if (IsInt)
27909 return B.CreateIntrinsic(
27910 Intrinsic::aarch64_sve_cadd_x, Ty,
27911 {InputA, InputB, B.getInt32((int)Rotation * 90)});
27912
27913 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
27914 return B.CreateIntrinsic(
27915 Intrinsic::aarch64_sve_fcadd, Ty,
27916 {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)});
27917 }
27918 return nullptr;
27919 }
27920
27921 Intrinsic::ID IntId = Intrinsic::not_intrinsic;
27922 if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
27923 IntId = Intrinsic::aarch64_neon_vcadd_rot90;
27924 else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
27925 IntId = Intrinsic::aarch64_neon_vcadd_rot270;
27926
27927 if (IntId == Intrinsic::not_intrinsic)
27928 return nullptr;
27929
27930 return B.CreateIntrinsic(IntId, Ty, {InputA, InputB});
27931 }
27932
27933 return nullptr;
27934}
27935
27936bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
27937 unsigned Opc = N->getOpcode();
27938 if (ISD::isExtOpcode(Opc)) {
27939 if (any_of(N->uses(),
27940 [&](SDNode *Use) { return Use->getOpcode() == ISD::MUL; }))
27941 return false;
27942 }
27943 return true;
27944}
27945
27946unsigned AArch64TargetLowering::getMinimumJumpTableEntries() const {
27947 return Subtarget->getMinimumJumpTableEntries();
27948}
27949
27950 MVT AArch64TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
27951 CallingConv::ID CC,
27952 EVT VT) const {
27953 bool NonUnitFixedLengthVector =
27954 VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
27955 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
27956 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
27957
27958 EVT VT1;
27959 MVT RegisterVT;
27960 unsigned NumIntermediates;
27961 getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1, NumIntermediates,
27962 RegisterVT);
27963 return RegisterVT;
27964}
27965
27966 unsigned AArch64TargetLowering::getNumRegistersForCallingConv(
27967 LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
27968 bool NonUnitFixedLengthVector =
27969 VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
27970 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
27971 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
27972
27973 EVT VT1;
27974 MVT VT2;
27975 unsigned NumIntermediates;
27976 return getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1,
27977 NumIntermediates, VT2);
27978}
27979
27980 unsigned AArch64TargetLowering::getVectorTypeBreakdownForCallingConv(
27981 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
27982 unsigned &NumIntermediates, MVT &RegisterVT) const {
27983 int NumRegs = TargetLowering::getVectorTypeBreakdownForCallingConv(
27984 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
27985 if (!RegisterVT.isFixedLengthVector() ||
27986 RegisterVT.getFixedSizeInBits() <= 128)
27987 return NumRegs;
27988
27989 assert(Subtarget->useSVEForFixedLengthVectors() && "Unexpected mode!");
27990 assert(IntermediateVT == RegisterVT && "Unexpected VT mismatch!");
27991 assert(RegisterVT.getFixedSizeInBits() % 128 == 0 && "Unexpected size!");
27992
27993 // A size mismatch here implies either type promotion or widening and would
27994 // have resulted in scalarisation if larger vectors had not been available.
27995 if (RegisterVT.getSizeInBits() * NumRegs != VT.getSizeInBits()) {
27996 EVT EltTy = VT.getVectorElementType();
27997 EVT NewVT = EVT::getVectorVT(Context, EltTy, ElementCount::getFixed(1));
27998 if (!isTypeLegal(NewVT))
27999 NewVT = EltTy;
28000
28001 IntermediateVT = NewVT;
28002 NumIntermediates = VT.getVectorNumElements();
28003 RegisterVT = getRegisterType(Context, NewVT);
28004 return NumIntermediates;
28005 }
28006
28007 // SVE VLS support does not introduce a new ABI so we should use NEON sized
28008 // types for vector arguments and returns.
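// e.g. when the SVE register size is known to be 256 bits
// (-msve-vector-bits=256), a v8i32 argument is legalised to a single 256-bit
// register type and is re-split here into 2 x v4i32, i.e. two Neon-sized
// (128-bit) registers.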
28009
28010 unsigned NumSubRegs = RegisterVT.getFixedSizeInBits() / 128;
28011 NumIntermediates *= NumSubRegs;
28012 NumRegs *= NumSubRegs;
28013
28014 switch (RegisterVT.getVectorElementType().SimpleTy) {
28015 default:
28016 llvm_unreachable("unexpected element type for vector");
28017 case MVT::i8:
28018 IntermediateVT = RegisterVT = MVT::v16i8;
28019 break;
28020 case MVT::i16:
28021 IntermediateVT = RegisterVT = MVT::v8i16;
28022 break;
28023 case MVT::i32:
28024 IntermediateVT = RegisterVT = MVT::v4i32;
28025 break;
28026 case MVT::i64:
28027 IntermediateVT = RegisterVT = MVT::v2i64;
28028 break;
28029 case MVT::f16:
28030 IntermediateVT = RegisterVT = MVT::v8f16;
28031 break;
28032 case MVT::f32:
28033 IntermediateVT = RegisterVT = MVT::v4f32;
28034 break;
28035 case MVT::f64:
28036 IntermediateVT = RegisterVT = MVT::v2f64;
28037 break;
28038 case MVT::bf16:
28039 IntermediateVT = RegisterVT = MVT::v8bf16;
28040 break;
28041 }
28042
28043 return NumRegs;
28044}
28045
28046 bool AArch64TargetLowering::hasInlineStackProbe(
28047 const MachineFunction &MF) const {
28048 return !Subtarget->isTargetWindows() &&
28049 MF.getInfo<AArch64FunctionInfo>()->hasStackProbing();
28050}
28051
28052#ifndef NDEBUG
28053 void AArch64TargetLowering::verifyTargetSDNode(const SDNode *N) const {
28054 switch (N->getOpcode()) {
28055 default:
28056 break;
28057 case AArch64ISD::SUNPKLO:
28058 case AArch64ISD::SUNPKHI:
28059 case AArch64ISD::UUNPKLO:
28060 case AArch64ISD::UUNPKHI: {
28061 assert(N->getNumValues() == 1 && "Expected one result!");
28062 assert(N->getNumOperands() == 1 && "Expected one operand!");
28063 EVT VT = N->getValueType(0);
28064 EVT OpVT = N->getOperand(0).getValueType();
28065 assert(OpVT.isVector() && VT.isVector() && OpVT.isInteger() &&
28066 VT.isInteger() && "Expected integer vectors!");
28067 assert(OpVT.getSizeInBits() == VT.getSizeInBits() &&
28068 "Expected vectors of equal size!");
28069 // TODO: Enable assert once bogus creations have been fixed.
28070 // assert(OpVT.getVectorElementCount() == VT.getVectorElementCount()*2 &&
28071 // "Expected result vector with half the lanes of its input!");
28072 break;
28073 }
28074 case AArch64ISD::TRN1:
28075 case AArch64ISD::TRN2:
28076 case AArch64ISD::UZP1:
28077 case AArch64ISD::UZP2:
28078 case AArch64ISD::ZIP1:
28079 case AArch64ISD::ZIP2: {
28080 assert(N->getNumValues() == 1 && "Expected one result!");
28081 assert(N->getNumOperands() == 2 && "Expected two operands!");
28082 EVT VT = N->getValueType(0);
28083 EVT Op0VT = N->getOperand(0).getValueType();
28084 EVT Op1VT = N->getOperand(1).getValueType();
28085 assert(VT.isVector() && Op0VT.isVector() && Op1VT.isVector() &&
28086 "Expected vectors!");
28087 // TODO: Enable assert once bogus creations have been fixed.
28088 // assert(VT == Op0VT && VT == Op1VT && "Expected matching vectors!");
28089 break;
28090 }
28091 }
28092}
28093#endif
unsigned const MachineRegisterInfo * MRI
static MCRegister MatchRegisterName(StringRef Name)
static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc, uint64_t &Imm)
static bool isIntImmediate(const SDNode *N, uint64_t &Imm)
isIntImmediate - This method tests to see if the node is a constant operand.
static void CustomNonLegalBITCASTResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, EVT ExtendVT, EVT CastVT)
static bool isConcatMask(ArrayRef< int > Mask, EVT VT, bool SplitLHS)
static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG)
static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, AArch64CC::CondCode CC, bool NoNans, EVT VT, const SDLoc &dl, SelectionDAG &DAG)
static bool isAddSubSExt(SDValue N, SelectionDAG &DAG)
static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue CCOp, AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC, const SDLoc &DL, SelectionDAG &DAG)
can be transformed to: not (and (not (and (setCC (cmp C)) (setCD (cmp D)))) (and (not (setCA (cmp A))...
static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2, bool &Invert)
changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC usable with the vector...
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt)
isVShiftRImm - Check if this is a valid build_vector for the immediate operand of a vector shift righ...
static bool isSingletonEXTMask(ArrayRef< int > M, EVT VT, unsigned &Imm)
static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG)
static SDValue performCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, unsigned CCIndex, unsigned CmpIndex)
static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue NormalizeBuildVector(SDValue Op, SelectionDAG &DAG)
static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St)
Replace a splat of zeros to a vector store by scalar stores of WZR/XZR.
static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG)
static SDValue performLastTrueTestVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue GenerateTBL(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performDUPCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static std::optional< PredicateConstraint > parsePredicateConstraint(StringRef Constraint)
static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static void analyzeCallOperands(const AArch64TargetLowering &TLI, const AArch64Subtarget *Subtarget, const TargetLowering::CallLoweringInfo &CLI, CCState &CCInfo)
static std::optional< unsigned > IsSVECntIntrinsic(SDValue S)
static SDValue performVectorAddSubExtCombine(SDNode *N, SelectionDAG &DAG)
static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo)
Check whether or not Op is a SET_CC operation, either a generic or an AArch64 lowered one.
static bool isLegalArithImmed(uint64_t C)
static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT)
static ScalableVectorType * getSVEContainerIRType(FixedVectorType *VTy)
static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG)
unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend)
static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG)
static SDValue convertFixedMaskToScalableVector(SDValue Mask, SelectionDAG &DAG)
static bool shouldSinkVScale(Value *Op, SmallVectorImpl< Use * > &Ops)
We want to sink following cases: (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A,...
static bool isZeroingInactiveLanes(SDValue Op)
static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG)
static SDValue tryCombineMULLWithUZP1(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG, bool isSigned)
static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG)
static bool isZerosVector(const SDNode *N)
isZerosVector - Check whether SDNode N is a zero-filled vector.
static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth=0)
static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG)
static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue performNVCASTCombine(SDNode *N, SelectionDAG &DAG)
Get rid of unnecessary NVCASTs (that don't change the type).
static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
Fold a floating-point divide by power of two into fixed-point to floating-point conversion.
static const TargetRegisterClass * getReducedGprRegisterClass(ReducedGprConstraint Constraint, EVT VT)
static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG, bool Invert)
static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset, SDLoc DL, unsigned BitWidth)
static bool isPredicateCCSettingOp(SDValue N)
static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG)
static bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType)
static SDValue performSVEAndCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG)
static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2, ArrayRef< int > ShuffleMask, EVT VT, EVT ContainerVT, SelectionDAG &DAG)
static SDValue performBRCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static MVT getSVEContainerType(EVT ContentTy)
static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG)
static bool isMergePassthruOpcode(unsigned Opc)
static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG, SDLoc DL, bool &IsMLA)
static SDValue performFADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performNEONPostLDSTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
Target-specific DAG combine function for NEON load/store intrinsics to merge base address updates.
static void ReplaceCMP_SWAP_128Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp, SelectionDAG &DAG)
static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget, const AArch64TargetLowering &TLI)
static bool isZeroExtended(SDValue N, SelectionDAG &DAG)
static bool areExtractExts(Value *Ext1, Value *Ext2)
Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth of the vector elements.
static SDValue performSelectCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with the compare-mask instruct...
static bool isCheapToExtend(const SDValue &N)
static cl::opt< bool > EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden, cl::desc("Enable AArch64 logical imm instruction " "optimization"), cl::init(true))
static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl< Use * > &Ops)
static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG)
static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes, unsigned ScalarSizeInBytes)
Check if the value of OffsetInBytes can be used as an immediate for the gather load/prefetch and scat...
static bool isUZP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of "vector_shuffle v,...
static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS=nullptr)
#define LCALLNAME4(A, B)
static unsigned getDUPLANEOp(EVT EltType)
static void changeFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2)
changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget, const TargetMachine &TM)
static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST, EVT VT, EVT MemVT, SelectionDAG &DAG)
static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT, SelectionDAG &DAG, unsigned &ShiftValue, SDValue &RShOperand)
static bool isExtendOrShiftOperand(SDValue N)
static bool isLanes1toNKnownZero(SDValue Op)
static bool setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL, AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI)
Set the IntrinsicInfo for the aarch64_sve_st<N> intrinsics.
static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG)
static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N, SelectionDAG &DAG)
static EVT getPackedSVEVectorVT(EVT VT)
static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performFlagSettingCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned GenericOpcode)
static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performCSELCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static void ReplaceReductionResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, unsigned InterOp, unsigned AcrossOp)
static bool isEquivalentMaskless(unsigned CC, unsigned width, ISD::LoadExtType ExtType, int AddConstant, int CompConstant)
static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG)
static EVT getExtensionTo64Bits(const EVT &OrigVT)
static bool isCMP(SDValue Op)
static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG)
static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG)
static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static Function * getStructuredLoadFunction(Module *M, unsigned Factor, bool Scalable, Type *LDVTy, Type *PtrTy)
static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG)
static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc, SelectionDAG &DAG, bool UnpredOp=false, bool SwapOperands=false)
static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad)
static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp, AArch64CC::CondCode Predicate)
Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain of CCMP/CFCMP ops.
static SDValue performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated)
static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian)
static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT, unsigned Opcode, SelectionDAG &DAG)
static bool createTblShuffleForZExt(ZExtInst *ZExt, FixedVectorType *DstTy, bool IsLittleEndian)
static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, SelectionDAG &DAG)
static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint)
static bool isINSMask(ArrayRef< int > M, int NumInputElements, bool &DstIsLeft, int &Anomaly)
static const MCPhysReg GPRArgRegs[]
static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, APInt &UndefBits)
static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG)
static SDValue performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isPassedInFPR(EVT VT)
static unsigned getIntrinsicID(const SDNode *N)
static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert)
static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG)
static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N, SDValue &BasePtr, SDValue &Index, SelectionDAG &DAG)
static SDValue performANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool canEmitConjunction(const SDValue Val, bool &CanNegate, bool &MustBeFirst, bool WillNegate, unsigned Depth=0)
Returns true if Val is a tree of AND/OR/SETCC operations that can be expressed as a conjunction.
static bool isWideDUPMask(ArrayRef< int > M, EVT VT, unsigned BlockSize, unsigned &DupLaneOp)
Check if a vector shuffle corresponds to a DUP instructions with a larger element width than the vect...
static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static cl::opt< bool > EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden, cl::desc("Combine ext and trunc to TBL"), cl::init(true))
static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St, SDValue SplatVal, unsigned NumVecElts)
static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *ST)
static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, SDValue V2, unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &dl)
GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit the specified operations t...
static SDValue performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue removeRedundantInsertVectorElt(SDNode *N)
static std::optional< AArch64CC::CondCode > getCSETCondCode(SDValue Op)
static SDValue optimizeIncrementingWhile(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsEqual)
static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG)
Legalize the gather prefetch (scalar + vector addressing mode) when the offset vector is an unpacked ...
static bool isNegatedInteger(SDValue Op)
static SDValue performFirstTrueTestVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static bool isLoadOrMultipleLoads(SDValue B, SmallVector< LoadSDNode * > &Loads)
static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc)
static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16)
static Function * getStructuredStoreFunction(Module *M, unsigned Factor, bool Scalable, Type *STVTy, Type *PtrTy)
static SDValue performVectorShiftCombine(SDNode *N, const AArch64TargetLowering &TLI, TargetLowering::DAGCombinerInfo &DCI)
Optimize a vector shift instruction and its operand if shifted out bits are not used.
static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG)
static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG, unsigned ScalarSizeInBytes)
Combines a node carrying the intrinsic aarch64_sve_prf<T>_gather_scalar_offset into a node that uses ...
static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St)
Replace a splat of a scalar to a vector store by scalar stores of the scalar value.
unsigned getSignExtendedGatherOpcode(unsigned Opcode)
static bool isOrXorChain(SDValue N, unsigned &Num, SmallVector< std::pair< SDValue, SDValue >, 16 > &WorkList)
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt)
getVShiftImm - Check if this is a valid build_vector for the immediate operand of a vector shift oper...
static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC)
changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64 CC
static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets=true)
static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd)
static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG)
static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm, const APInt &Demanded, TargetLowering::TargetLoweringOpt &TLO, unsigned NewOpc)
static unsigned getCmpOperandFoldingProfit(SDValue Op)
Returns how profitable it is to fold a comparison's operand's shift and/or extension operations.
static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue performConcatVectorsCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performSVEMulAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N, SelectionDAG &DAG)
#define MAKE_CASE(V)
static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG)
static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG)
Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern into sext/zext(buildvecto...
static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG, const EVT &OrigTy, const EVT &ExtTy, unsigned ExtOpcode)
static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG)
static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG)
static const MCPhysReg FPRArgRegs[]
static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL, SelectionDAG &DAG)
Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR, WZR, invert(<cond>)'.
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static void replaceBoolVectorBitcast(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG)
static SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT, int Pattern)
static bool isEXTMask(ArrayRef< int > M, EVT VT, bool &ReverseEXT, unsigned &Imm)
static std::optional< ReducedGprConstraint > parseReducedGprConstraint(StringRef Constraint)
static SDValue tryCombineFixedPointConvert(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static SDValue performSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG)
Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup)) making use of the vector SExt/ZE...
static SDValue performAddSubLongCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG)
static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
Fold a floating-point multiply by power of two into floating-point to fixed-point conversion.
static EVT calculatePreExtendType(SDValue Extend)
Calculates what the pre-extend type is, based on the extension operation node provided by Extend.
static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG)
static EVT getPromotedVTForPredicate(EVT VT)
static void changeFPCCToANDAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2)
Convert a DAG fp condition code to an AArch64 CC.
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
Turn vector tests of the signbit in the form of: xor (sra X, elt_size(X)-1), -1 into: cmge X,...
static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG)
static bool isAllConstantBuildVector(const SDValue &PotentialBVec, uint64_t &ConstVal)
static SDValue performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG)
static Value * UseTlsOffset(IRBuilderBase &IRB, unsigned Offset)
static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG)
WidenVector - Given a value in the V64 register class, produce the equivalent value in the V128 regis...
static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG)
static bool isSignExtended(SDValue N, SelectionDAG &DAG)
static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG, const AArch64Subtarget *ST)
static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op, AArch64CC::CondCode Cond)
static bool isSetCCOrZExtSetCC(const SDValue &Op, SetCCInfoAndKind &Info)
cl::opt< bool > EnableAArch64ELFLocalDynamicTLSGeneration("aarch64-elf-ldtls-generation", cl::Hidden, cl::desc("Allow AArch64 Local Dynamic TLS code generation"), cl::init(false))
static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG)
static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG)
static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG)
static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG)
static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG)
static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode, AtomicOrdering Ordering)
static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performPostLD1Combine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, bool IsLaneOp)
Target-specific DAG combine function for post-increment LD1 (lane) and post-increment LD1R.
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2)
Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
std::pair< SDValue, uint64_t > lookThroughSignExtension(SDValue Val)
bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL)
static SDValue performMSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG)
static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale, SDLoc DL, SelectionDAG &DAG)
static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG)
static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG)
static bool isSplatShuffle(Value *V)
bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT)
static SDValue performAddCombineForShiftedOperands(SDNode *N, SelectionDAG &DAG)
static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V)
static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG, unsigned Opcode, bool IsSigned)
static bool isPackedVectorType(EVT VT, SelectionDAG &DAG)
Returns true if VT's elements occupy the lowest bit positions of its associated register class withou...
static bool isTRN_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of "vector_shuffle v,...
static bool isAddSubZExt(SDValue N, SelectionDAG &DAG)
static SDValue performSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt)
isVShiftLImm - Check if this is a valid build_vector for the immediate operand of a vector shift left...
static SDValue tryCombineWhileLo(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performMaskedGatherScatterCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert, SelectionDAG &DAG)
static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl, SelectionDAG &DAG, SDValue Chain, bool IsSignaling)
static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performBuildVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V)
static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets=true)
static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64TargetLowering &TLI)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG)
static unsigned getSMCondition(const SMEAttrs &CallerAttrs, const SMEAttrs &CalleeAttrs)
static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static bool isCMN(SDValue Op, ISD::CondCode CC)
static bool isHalvingTruncateAndConcatOfLegalIntScalableType(SDNode *N)
static bool isOperandOfVmullHighP64(Value *Op)
Check if Op could be used with vmull_high_p64 intrinsic.
static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode, SDValue Operand, SelectionDAG &DAG, int &ExtraSteps)
static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG)
static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG)
Perform the scalar expression combine in the form of: CSEL(c, 1, cc) + b => CSINC(b+c,...
static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static std::optional< uint64_t > getConstantLaneNumOfExtractHalfOperand(SDValue &Op)
static void ReplaceATOMIC_LOAD_128Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1, SelectionDAG &DAG, unsigned &NumSubLoads)
static SDValue performLOADCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG, StoreSDNode *Store)
static bool isEssentiallyExtractHighSubvector(SDValue N)
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
static unsigned getExtFactor(SDValue &V)
getExtFactor - Determine the adjustment factor for the position when generating an "extract from vect...
static cl::opt< unsigned > MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden, cl::desc("Maximum of xors"))
#define LCALLNAME5(A, B)
static SDValue performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS=nullptr)
static SDValue performMULLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static const MVT MVT_CC
Value type used for condition codes.
static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performReinterpretCastCombine(SDNode *N)
SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG)
static SDValue performTBZCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC)
Emit expression as a conjunction (a series of CCMP/CFCMP ops).
static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N)
static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AArch64cc, SelectionDAG &DAG, const SDLoc &dl)
static bool performTBISimplification(SDValue Addr, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
Simplify Addr given that the top byte of it is ignored by HW during address translation.
static bool areExtractShuffleVectors(Value *Op1, Value *Op2, bool AllowSplat=false)
Check if both Op1 and Op2 are shufflevector extracts of either the lower or upper half of the vector ...
static SDValue tryCombineExtendRShTrunc(SDNode *N, SelectionDAG &DAG)
static bool isAllInactivePredicate(SDValue N)
static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT, SDLoc DL, SelectionDAG &DAG)
static SDValue performIntrinsicCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static cl::opt< bool > EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden, cl::desc("Combine extends of AArch64 masked " "gather intrinsics"), cl::init(true))
static bool isZIP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of "vector_shuffle v,...
static SDValue performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isWideTypeMask(ArrayRef< int > M, EVT VT, SmallVectorImpl< int > &NewMask)
static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V)
static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z, SelectionDAG &DAG)
static SDValue performANDSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static const TargetRegisterClass * getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT)
static SDValue performAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode, SDNode *AndNode, SelectionDAG &DAG, unsigned CCIndex, unsigned CmpIndex, unsigned CC)
static std::pair< SDValue, SDValue > getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG)
#define FALKOR_STRIDED_ACCESS_MD
@ Generic
SmallVector< AArch64_IMM::ImmInsnModel, 4 > Insn
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static bool isConstant(const MachineInstr &MI)
static const LLT S1
amdgpu AMDGPU Register Bank Select
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
@ Scaled
@ OP_VEXT3
@ OP_VTRNR
@ OP_VDUP1
@ OP_VZIPR
@ OP_VUZPR
@ OP_VREV
@ OP_VZIPL
@ OP_VTRNL
@ OP_COPY
@ OP_VEXT1
@ OP_VDUP0
@ OP_VEXT2
@ OP_VUZPL
@ OP_VDUP3
@ OP_VDUP2
Function Alias Analysis Results
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static bool isConstantSplatVectorMaskForType(SDNode *N, EVT ScalarTy)
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Addr
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
Symbol * Sym
Definition: ELF_riscv.cpp:479
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static Function * getFunction(Constant *C)
Definition: Evaluator.cpp:236
static bool isSigned(unsigned int Opcode)
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
#define im(i)
Hexagon Common GEP
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
#define RegName(no)
static LVOptions Options
Definition: LVOptions.cpp:25
lazy value info
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
mir Rename Register Operands
unsigned const TargetRegisterInfo * TRI
This file provides utility analysis objects describing memory locations.
Module.h This file contains the declarations for the Module class.
LLVMContext & Context
This file defines ARC utility functions which are used by various parts of the compiler.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
static CodeModel::Model getCodeModel(const PPCSubtarget &S, const TargetMachine &TM, const MachineOperand &MO)
PowerPC Reduce CR logical Operation
const char LLVMTargetMachineRef TM
static bool getVal(MDTuple *MD, const char *Key, uint64_t &Val)
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallSet class.
This file defines the SmallVector class.
static bool Enabled
Definition: Statistic.cpp:46
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
static const int BlockSize
Definition: TarWriter.cpp:33
This pass exposes codegen information to IR-level passes.
This defines the Use class.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
static constexpr int Concat[]
Value * RHS
Value * LHS
AArch64FunctionInfo - This class is derived from MachineFunctionInfo and contains private AArch64-spe...
void setVarArgsStackOffset(unsigned Offset)
void setTailCallReservedStack(unsigned bytes)
SmallVectorImpl< ForwardedRegister > & getForwardedMustTailRegParms()
void setBytesInStackArgArea(unsigned bytes)
void setHasSwiftAsyncContext(bool HasContext)
void setJumpTableEntryInfo(int Idx, unsigned Size, MCSymbol *PCRelSym)
void setArgumentStackToRestore(unsigned bytes)
void setHasStreamingModeChanges(bool HasChanges)
bool isLegalAddressingMode(unsigned NumBytes, int64_t Offset, unsigned Scale) const
void UpdateCustomCalleeSavedRegs(MachineFunction &MF) const
bool isNeonAvailable() const
Returns true if the target has NEON and the function at runtime is known to have NEON enabled (e....
const AArch64RegisterInfo * getRegisterInfo() const override
unsigned getMinimumJumpTableEntries() const
const AArch64InstrInfo * getInstrInfo() const override
const char * getSecurityCheckCookieName() const
unsigned getMaximumJumpTableSize() const
ARMProcFamilyEnum getProcFamily() const
Returns ARM processor family.
unsigned classifyGlobalFunctionReference(const GlobalValue *GV, const TargetMachine &TM) const
Align getPrefLoopAlignment() const
Align getPrefFunctionAlignment() const
unsigned getMaxBytesForLoopAlignment() const
bool supportsAddressTopByteIgnored() const
CPU has TBI (top byte of addresses is ignored during HW address translation) and OS enables it.
const Triple & getTargetTriple() const
bool isCallingConvWin64(CallingConv::ID CC) const
const char * getChkStkName() const
bool useSVEForFixedLengthVectors() const
unsigned ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const
ClassifyGlobalReference - Find the target operand flags that describe how a global value should be re...
bool isSVEAvailable() const
Returns true if the target has SVE and can use the full range of SVE instructions,...
bool isXRegisterReserved(size_t i) const
unsigned getMaxSVEVectorSizeInBits() const
unsigned getMinSVEVectorSizeInBits() const
bool hasCustomCallingConv() const
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type FromTy to type ToTy.
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
bool isShuffleMaskLegal(ArrayRef< int > M, EVT VT) const override
Return true if the given shuffle mask can be codegen'd directly, or if it should be stack expanded.
unsigned getVaListSizeInBits(const DataLayout &DL) const override
Returns the size of the platform's va_list object.
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) const override
Return the prefered common base offset.
bool shouldInsertTrailingFenceForAtomicStore(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert a trailing fence without reducing the ordering f...
bool shouldExpandCttzElements(EVT VT) const override
Return true if the @llvm.experimental.cttz.elts intrinsic should be expanded using generic code in Se...
MachineBasicBlock * EmitTileLoad(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB) const
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL, bool UseScalable) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
bool shouldRemoveRedundantExtend(SDValue Op) const override
Return true (the default) if it is profitable to remove a sext_inreg(x) where the sext is redundant,...
CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC) const
Selects the correct CCAssignFn for a given CallingConvention value.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ISD::SETCC ValueType.
bool optimizeExtendOrTruncateConversion(Instruction *I, Loop *L, const TargetTransformInfo &TTI) const override
Try to optimize extending or truncating conversion instructions (like zext, trunc,...
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const
Selects the correct CCAssignFn for a given CallingConvention value.
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool hasInlineStackProbe(const MachineFunction &MF) const override
True if stack clash protection is enabled for this function.
bool isLegalICmpImmediate(int64_t) const override
Return true if the specified immediate is a legal icmp immediate, that is, the target has icmp instructi...
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
Value * emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override
Perform a store-conditional operation to Addr.
bool preferIncOfAddToSubOfNot(EVT VT) const override
These two forms are equivalent: sub y, (xor x, -1); add (add x, 1), y. The variant with two add's is IR...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
MachineBasicBlock * EmitZAInstr(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB, bool HasTile) const
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
bool isOpSuitableForLSE128(const Instruction *I) const
bool lowerInterleavedLoad(LoadInst *LI, ArrayRef< ShuffleVectorInst * > Shuffles, ArrayRef< unsigned > Indices, unsigned Factor) const override
Lower an interleaved load into a ldN intrinsic.
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool shouldSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
bool fallBackToDAGISel(const Instruction &Inst) const override
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
getTgtMemIntrinsic - Represent NEON load and store intrinsics as MemIntrinsicNodes.
bool isLegalAddScalableImmediate(int64_t) const override
Return true if adding the specified scalable immediate is legal, that is the target has add instructi...
Function * getSSPStackGuardCheck(const Module &M) const override
If the target has a standard stack protection check function that performs validation and error handl...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
Value * createComplexDeinterleavingIR(IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator=nullptr) const override
Create the IR node for the given complex deinterleaving operation.
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
Returns true if the target allows unaligned memory accesses of the specified type.
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const
Returns true if VecTy is a legal interleaved access type.
void insertSSPDeclarations(Module &M) const override
Inserts necessary declarations for SSP (stack protection) purpose.
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const override
For some targets, an LLVM struct type must be broken down into multiple simple types,...
Value * emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override
Perform a load-linked operation on Addr, returning a "Value *" with the corresponding pointee type.
MachineBasicBlock * EmitLoweredCatchRet(MachineInstr &MI, MachineBasicBlock *BB) const
bool isComplexDeinterleavingSupported() const override
Does this target support complex deinterleaving.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const override
SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const
MachineBasicBlock * EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool useLoadStackGuardNode() const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
Value * getSafeStackPointerLocation(IRBuilderBase &IRB) const override
If the target has a standard location for the unsafe stack pointer, returns the address of that locat...
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
bool isProfitableToHoist(Instruction *I) const override
Check if it is profitable to hoist instruction in then/else to if.
bool isOpSuitableForRCPC3(const Instruction *I) const
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const override
Return true if it is profitable to reduce a load to a smaller type.
MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const override
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override
Lower an interleaved store into a stN intrinsic.
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
MachineBasicBlock * EmitZTInstr(MachineInstr &MI, MachineBasicBlock *BB, unsigned Opcode, bool Op0IsDef) const
MachineBasicBlock * EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const
bool shouldInsertFencesForAtomic(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
Control the following reassociation of operands: (op (op x, c1), y) -> (op (op x, y),...
void verifyTargetSDNode(const SDNode *N) const override
Check the given SDNode. Aborts if it is invalid.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
MachineBasicBlock * EmitF128CSEL(MachineInstr &MI, MachineBasicBlock *BB) const
LLT getOptimalMemOpLLT(const MemOp &Op, const AttributeList &FuncAttributes) const override
LLT returning variant.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool needsFixedCatchObjects() const override
Used for exception handling on Win64.
bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI, LoadInst *LI) const override
Lower a deinterleave intrinsic to a target specific load intrinsic.
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Value * getIRStackGuard(IRBuilderBase &IRB) const override
If the target has a standard location for the stack protector cookie, returns the address of that loc...
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
bool generateFMAsInMachineCombiner(EVT VT, CodeGenOptLevel OptLevel) const override
bool isComplexDeinterleavingOperationSupported(ComplexDeinterleavingOperation Operation, Type *Ty) const override
Does this target support complex deinterleaving with the given operation and type.
bool hasPairedLoad(EVT LoadedType, Align &RequiredAligment) const override
Return true if the target supplies and combines to a paired load two loaded values of type LoadedType...
bool isOpSuitableForLDPSTP(const Instruction *I) const
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI)
bool isLegalAddImmediate(int64_t) const override
Return true if the specified immediate is a legal add immediate, that is, the target has add instruction...
bool shouldConsiderGEPOffsetSplit() const override
bool isVectorClearMaskLegal(ArrayRef< int > M, EVT VT) const override
Similar to isShuffleMaskLegal.
const MCPhysReg * getScratchRegisters(CallingConv::ID CC) const override
Returns a 0 terminated array of registers that can be safely used as scratch registers.
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) const
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
bool isDesirableToCommuteXorWithShift(const SDNode *N) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II, StoreInst *SI) const override
Lower an interleave intrinsic to a target specific store intrinsic.
bool enableAggressiveFMAFusion(EVT VT) const override
Enable aggressive FMA fusion on targets that want it.
Value * getSDagStackGuard(const Module &M) const override
Return the variable that's previously inserted by insertSSPDeclarations, if any, otherwise return nul...
MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
MachineBasicBlock * EmitDynamicProbedAlloc(MachineInstr &MI, MachineBasicBlock *MBB) const
SDValue changeStreamingMode(SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue, unsigned Condition, SDValue PStateSM=SDValue()) const
If a change in streaming mode is required on entry to/return from a function call, it emits and return...
bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override
Return true if the @llvm.get.active.lane.mask intrinsic should be expanded using generic code in Sele...
bool isMulAddWithConstProfitable(SDValue AddNode, SDValue ConstNode) const override
Return true if it may be profitable to transform (mul (add x, c1), c2) -> (add (mul x,...
bool useSVEForFixedLengthVectorVT(EVT VT, bool OverrideNEON=false) const
bool mergeStoresAfterLegalization(EVT VT) const override
SVE code generation for fixed length vectors does not custom lower BUILD_VECTOR.
Class for arbitrary precision integers.
Definition: APInt.h:76
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:212
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:427
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:981
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition: APInt.h:207
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1491
static void sdivrem(const APInt &LHS, const APInt &RHS, APInt &Quotient, APInt &Remainder)
Definition: APInt.cpp:1860
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1370
APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition: APInt.cpp:1002
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:349
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1439
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition: APInt.h:187
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:307
APInt sadd_ov(const APInt &RHS, bool &Overflow) const
Definition: APInt.cpp:1898
bool sle(const APInt &RHS) const
Signed less or equal comparison.
Definition: APInt.h:1144
APInt uadd_ov(const APInt &RHS, bool &Overflow) const
Definition: APInt.cpp:1905
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1589
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition: APInt.h:197
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1010
unsigned logBase2() const
Definition: APInt.h:1703
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:805
bool isMask(unsigned numBits) const
Definition: APInt.h:466
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition: APInt.h:312
APInt sext(unsigned width) const
Sign extend to a new width.
Definition: APInt.cpp:954
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition: APInt.h:1235
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:418
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:284
bool isSignBitSet() const
Determine if sign bit of this APInt is set.
Definition: APInt.h:319
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:274
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1215
bool isOne() const
Determine if this is a value of 1.
Definition: APInt.h:367
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1513
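A minimal sketch (not code from this file) of how the APInt helpers listed above compose; the classifyImm name and the chosen widths are illustrative assumptions.

  #include "llvm/ADT/APInt.h"
  #include <cstdint>
  using namespace llvm;

  // Illustrative only: classify a 64-bit immediate using the APInt
  // queries documented above (power-of-two, mask, trailing zeros).
  static unsigned classifyImm(uint64_t Imm) {
    APInt V(/*numBits=*/64, Imm);
    if (V.isPowerOf2())
      return V.logBase2();          // index of the single set bit
    if (V.isMask(/*numBits=*/32))
      return 32;                    // exactly the low 32 bits are set
    return V.countr_zero();         // otherwise, count trailing zero bits
  }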
an instruction to allocate memory on the stack
Definition: Instructions.h:59
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:539
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:748
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:778
@ Or
*p = old | v
Definition: Instructions.h:772
@ And
*p = old & v
Definition: Instructions.h:768
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:776
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:782
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:780
@ Nand
*p = ~(old & v)
Definition: Instructions.h:770
bool isFloatingPointOperation() const
Definition: Instructions.h:922
BinOp getOperation() const
Definition: Instructions.h:845
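For orientation, a hedged sketch of how the BinOp values listed above are commonly inspected when deciding how to expand an atomic read-modify-write; the helper name and the chosen set of "simple" operations are assumptions for illustration, not the policy implemented in this file.

  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Hypothetical helper: treat integer logical and min/max RMWs as
  // directly lowerable, and everything else as needing an expansion.
  static bool isSimpleRMW(const AtomicRMWInst &RMW) {
    if (RMW.isFloatingPointOperation())
      return false;                       // fadd/fsub etc. handled separately
    switch (RMW.getOperation()) {
    case AtomicRMWInst::And:
    case AtomicRMWInst::Or:
    case AtomicRMWInst::Max:
    case AtomicRMWInst::Min:
    case AtomicRMWInst::UMax:
    case AtomicRMWInst::UMin:
      return true;                        // the enum values listed above
    default:
      return false;                       // e.g. Nand usually needs a CAS loop
    }
  }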
This is an SDNode representing atomic operations.
bool hasFnAttr(Attribute::AttrKind Kind) const
Return true if the attribute exists for the function.
static Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
Definition: Attributes.cpp:93
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:206
A "pseudo-class" with methods for operating on BUILD_VECTORs.
ConstantFPSDNode * getConstantFPSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant FP or null if this is not a constant FP splat.
bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
ConstantSDNode * getConstantSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant or null if this is not a constant splat.
int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const
If this is a constant FP splat and the splatted constant FP is an exact power of 2,...
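A short sketch of how isConstantSplat is typically driven; the surrounding helper is hypothetical and the 64-bit limit is an assumption for illustration.

  #include "llvm/CodeGen/SelectionDAGNodes.h"
  using namespace llvm;

  // Hypothetical helper: return the splatted value if BVN is a constant
  // splat whose smallest splat element is no wider than 64 bits.
  static bool getSplatImm(const BuildVectorSDNode *BVN, uint64_t &Imm) {
    APInt SplatValue, SplatUndef;
    unsigned SplatBitSize;
    bool HasAnyUndefs;
    if (BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
                             HasAnyUndefs) &&
        SplatBitSize <= 64) {
      Imm = SplatValue.getZExtValue();
      return true;
    }
    return false;
  }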
CCState - This class holds information needed while lowering arguments and return values.
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
CCValAssign - Represent assignment of one arg/retval to a location.
bool isRegLoc() const
Register getLocReg() const
LocInfo getLocInfo() const
bool needsCustom() const
bool isMemLoc() const
int64_t getLocMemOffset() const
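A hedged sketch of the usual loop over assigned locations after calling-convention analysis; RVLocs and the describeLocations helper are placeholders, not code from this file.

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/CodeGen/CallingConvLower.h"
  using namespace llvm;

  // Illustrative only: walk locations produced by CCState analysis and
  // branch on register vs. stack assignment using the accessors above.
  static void describeLocations(const SmallVectorImpl<CCValAssign> &RVLocs) {
    for (const CCValAssign &VA : RVLocs) {
      if (VA.isRegLoc()) {
        (void)VA.getLocReg();        // value lives in a physical register
      } else if (VA.isMemLoc()) {
        (void)VA.getLocMemOffset();  // value lives at this stack offset
      }
    }
  }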
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1687
unsigned arg_size() const
Definition: InstrTypes.h:1685
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1871
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
bool isZero() const
Return true if the value is positive or negative zero.
This is the shared class of boolean and integer constants.
Definition: Constants.h:80
bool isZero() const
This is just a convenience method to make client code smaller for a common case.
Definition: Constants.h:205
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:145
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1398
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
bool isLittleEndian() const
Layout endianness...
Definition: DataLayout.h:238
bool isBigEndian() const
Definition: DataLayout.h:239
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:874
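A small self-contained sketch of the DataLayout queries above; the "e" layout string is a minimal placeholder, not the AArch64 data layout.

  #include "llvm/IR/DataLayout.h"
  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/LLVMContext.h"
  using namespace llvm;

  // Illustrative only: query size and preferred alignment of a fixed
  // vector type with the accessors listed above.
  static void queryLayout() {
    LLVMContext Ctx;
    DataLayout DL("e");                          // little-endian, default specs
    auto *V4i32 = FixedVectorType::get(Type::getInt32Ty(Ctx), 4);
    TypeSize Size = DL.getTypeAllocSize(V4i32);  // 16 bytes
    Align A = DL.getPrefTypeAlign(V4i32);
    (void)Size; (void)A;
    (void)DL.isLittleEndian();                   // true for "e"
  }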
A debug info location.
Definition: DebugLoc.h:33
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:311
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:308
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:319
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition: FastISel.h:66
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:168
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:135
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:685
bool empty() const
Definition: Function.h:809
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:202
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:682
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition: Function.h:264
Constant * getPersonalityFn() const
Get the personality function associated with this function.
Definition: Function.cpp:1921
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:340
arg_iterator arg_end()
Definition: Function.h:827
arg_iterator arg_begin()
Definition: Function.h:818
size_t size() const
Definition: Function.h:808
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:358
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:677
const GlobalValue * getGlobal() const
bool isThreadLocal() const
If the value is "Thread Local", its value isn't shared by the threads.
Definition: GlobalValue.h:263
bool hasExternalWeakLinkage() const
Definition: GlobalValue.h:529
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
Type * getValueType() const
Definition: GlobalValue.h:296
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:94
Value * CreateZExtOrBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2137
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Definition: IRBuilder.h:1037
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2472
Value * CreateConstGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name="")
Definition: IRBuilder.h:1881
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2523
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition: IRBuilder.h:1045
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:539
Value * CreatePointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2170
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcast to NumElts elements.
Definition: IRBuilder.cpp:1193
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2516
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:466
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:932
Value * CreateFPToUI(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2067
Value * CreateIntToPtr(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2122
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1437
ConstantInt * getInt8(uint8_t C)
Get a constant 8-bit value.
Definition: IRBuilder.h:476
Value * CreateUIToFP(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2081
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:174
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:491
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2127
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1416
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2021
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2494
LLVMContext & getContext() const
Definition: IRBuilder.h:176
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2117
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition: IRBuilder.h:2007
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1497
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition: IRBuilder.h:569
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2412
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", bool IsInBounds=false)
Definition: IRBuilder.h:1866
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:516
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2666
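A minimal, self-contained sketch of a few of the Create* helpers documented above; the module, function, and value names are invented for illustration.

  #include "llvm/IR/BasicBlock.h"
  #include "llvm/IR/Function.h"
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/Module.h"
  using namespace llvm;

  // Illustrative only: build i64 @f(i32 %x) returning (zext(%x) << 8) | 1.
  static void buildDemo() {
    LLVMContext Ctx;
    Module M("demo", Ctx);
    auto *FTy = FunctionType::get(Type::getInt64Ty(Ctx),
                                  {Type::getInt32Ty(Ctx)}, /*isVarArg=*/false);
    Function *F = Function::Create(FTy, Function::ExternalLinkage, "f", M);
    IRBuilder<> B(BasicBlock::Create(Ctx, "entry", F));
    Value *X = F->getArg(0);
    Value *Wide = B.CreateZExt(X, B.getInt64Ty(), "wide");
    Value *Shifted = B.CreateShl(Wide, B.getInt64(8), "shifted");
    Value *Result = B.CreateOr(Shifted, B.getInt64(1), "result");
    B.CreateRet(Result);
  }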
This instruction inserts a single (scalar) element into a VectorType value.
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
Definition: Instruction.cpp:83
const BasicBlock * getParent() const
Definition: Instruction.h:152
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:87
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:252
Class to represent integer types.
Definition: DerivedTypes.h:40
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:54
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:267
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
Definition: LowLevelType.h:100
constexpr TypeSize getSizeInBytes() const
Returns the total size of the type in bytes.
Definition: LowLevelType.h:203
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
Definition: Instructions.h:184
Value * getPointerOperand()
Definition: Instructions.h:280
Type * getPointerOperandType() const
Definition: Instructions.h:283
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
@ INVALID_SIMPLE_VALUE_TYPE
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:585
bool isScalableVT() const
Return true if the type is a scalable type.
static auto all_valuetypes()
SimpleValueType Iteration.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto scalable_vector_valuetypes()
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isFixedLengthVector() const
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
static auto fp_fixedlen_vector_valuetypes()
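A brief sketch of the MVT queries listed above applied to a fixed-length and a scalable vector type; the specific types are arbitrary examples.

  #include "llvm/CodeGen/ValueTypes.h"
  using namespace llvm;

  // Illustrative only: exercise a few MVT accessors documented above.
  static void mvtDemo() {
    MVT V4i32 = MVT::getVectorVT(MVT::i32, 4);
    (void)V4i32.isVector();                // true
    (void)V4i32.getVectorNumElements();    // 4
    (void)V4i32.getSizeInBits();           // 128 fixed bits
    (void)V4i32.is128BitVector();          // true

    MVT NxV2i64 = MVT::getScalableVectorVT(MVT::i64, 2);
    (void)NxV2i64.isScalableVector();      // true
    (void)NxV2i64.getVectorElementType();  // MVT::i64
  }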
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and updates PHI operands in the successor bloc...
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
Instructions::iterator instr_iterator
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
SSPLayoutKind getObjectSSPLayout(int ObjectIdx) const
void computeMaxCallFrameSize(MachineFunction &MF, std::vector< MachineBasicBlock::iterator > *FrameSDOps=nullptr)
Computes the maximum size of a callframe.
int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
@ SSPLK_None
Did not trigger a stack protector.
void setFrameAddressIsTaken(bool T)
int getStackProtectorIndex() const
Return the index for the stack protector object.
int CreateSpillStackObject(uint64_t Size, Align Alignment)
Create a new statically sized stack object that represents a spill slot, returning a nonnegative iden...
void setStackID(int ObjectIdx, uint8_t ID)
void setHasTailCall(bool V=true)
bool hasMustTailInVarArgFunc() const
Returns true if the function is variadic and contains a musttail call.
void setReturnAddressIsTaken(bool s)
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
int CreateVariableSizedObject(Align Alignment, const AllocaInst *Alloca)
Notify the MachineFrameInfo object that a variable sized object has been created.
int getObjectIndexEnd() const
Return one past the maximum frame object index.
bool hasStackProtectorIndex() const
uint8_t getStackID(int ObjectIdx) const
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
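A hedged sketch of the BuildMI idiom that the addReg/addImm entries above feed into; the instruction description and registers passed in are placeholders, not a real AArch64 lowering sequence.

  #include "llvm/CodeGen/MachineBasicBlock.h"
  #include "llvm/CodeGen/MachineInstrBuilder.h"
  #include "llvm/MC/MCInstrDesc.h"
  using namespace llvm;

  // Illustrative only: emit "DestReg = <Desc> SrcReg, #0" before MBBI using
  // the builder methods listed above.
  static void emitCopyLike(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator MBBI,
                           const DebugLoc &DL, const MCInstrDesc &Desc,
                           Register DestReg, Register SrcReg) {
    BuildMI(MBB, MBBI, DL, Desc, DestReg)
        .addReg(SrcReg)
        .addImm(0);
  }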
Representation of each machine instruction.
Definition: MachineInstr.h:69
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value.
MachineOperand class - Representation of each machine instruction operand.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
An SDNode that represents everything that will be needed to construct a MachineInstr.
size_type size() const
Definition: MapVector.h:60
This class is used to represent an MGATHER node.
const SDValue & getPassThru() const
ISD::LoadExtType getExtensionType() const
This is a base class used to represent MGATHER and MSCATTER nodes.
const SDValue & getIndex() const
const SDValue & getScale() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
ISD::MemIndexType getIndexType() const
How is Index applied to BasePtr when computing addresses.
This class is used to represent an MLOAD node.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
const SDValue & getMask() const
const SDValue & getPassThru() const
const SDValue & getOffset() const
bool isUnindexed() const
Return true if this is NOT a pre/post inc/dec load/store.
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
This class is used to represent an MSCATTER node.
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This class is used to represent an MSTORE node.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isVolatile() const
Align getOriginalAlign() const
Returns alignment and volatility of the memory access.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
AtomicOrdering getMergedOrdering() const
Return a single atomic ordering that is at least as strong as both the success and failure orderings ...
const SDValue & getChain() const
bool isNonTemporal() const
bool isAtomic() const
Return true if the memory operation ordering is Unordered or higher.
EVT getMemoryVT() const
Return the type of the in-memory value.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
bool getRtLibUseGOT() const
Returns true if PLT should be avoided for RTLib calls.
Definition: Module.cpp:692
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:293
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
void dump() const
Definition: Pass.cpp:136
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:662
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1827
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
void dump() const
Dump this node, for debugging.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
iterator_range< use_iterator > uses()
size_t use_size() const
Return the number of uses of this node.
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
void setCFIType(uint32_t Type)
bool isUndef() const
Return true if the type of the node is undefined.
void setFlags(SDNodeFlags NewFlags)
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
void dump() const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
void setNode(SDNode *N)
set the SDNode
unsigned getOpcode() const
unsigned getNumOperands() const
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasStreamingInterface() const
bool hasStreamingCompatibleInterface() const
bool hasNonStreamingInterface() const
bool hasStreamingBody() const
bool hasZAState() const
Class to represent scalable SIMD vectors.
Definition: DerivedTypes.h:586
static ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition: Type.cpp:713
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:225
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:722
unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS)
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const TargetSubtargetInfo & getSubtarget() const
Definition: SelectionDAG.h:474
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getSplatValue(SDValue V, bool LegalTypes=false)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
SDValue getVScale(const SDLoc &DL, EVT VT, APInt MulImm, bool ConstantFold=true)
Return a node that represents the runtime scaling 'MulImm * RuntimeVL'.
SDNode * isConstantIntBuildVectorOrConstantInt(SDValue N) const
Test whether the given value is a constant int or similar node.
SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getStepVector(const SDLoc &DL, EVT ResVT, const APInt &StepVal)
Returns a vector of type ResVT whose elements contain the linear sequence <0, Step,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:478
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
Definition: SelectionDAG.h:732
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:828
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:472
const SelectionDAGTargetInfo & getSelectionDAGInfo() const
Definition: SelectionDAG.h:480
bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns the sum of the base pointer and offset.
SDValue getGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, bool isTargetGA=false, unsigned TargetFlags=0)
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
Definition: SelectionDAG.h:659
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, which starts a new call frame, in which InSize bytes are set up inside ...
bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
SDValue getRegister(unsigned Reg, EVT VT)
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
SDValue getBoolExtOrTrunc(SDValue Op, const SDLoc &SL, EVT VT, EVT OpVT)
Convert Op, which must be of integer type, to the integer type VT, by using an extension appropriate ...
SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:473
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
Definition: SelectionDAG.h:773
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:676
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:768
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:469
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
Definition: SelectionDAG.h:799
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:845
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
void addCallSiteInfo(const SDNode *Node, CallSiteInfo &&CallInfo)
Set CallSiteInfo to be associated with Node.
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
LLVMContext * getContext() const
Definition: SelectionDAG.h:485
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL, bool LegalTypes=true)
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:739
SDValue getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand, SDValue Subreg)
A convenience function for creating TargetInstrInfo::INSERT_SUBREG nodes.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:554
SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
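A minimal sketch of getVectorShuffle, assuming SelectionDAG &DAG, an SDLoc DL, and two v4i32 SDValues V0 and V1 are in scope; the mask is illustrative (an interleave of the low halves).
// <0, 4, 1, 5> interleaves the low elements of V0 and V1 (a zip1-style mask).
SmallVector<int, 4> Mask = {0, 4, 1, 5};
SDValue Zip = DAG.getVectorShuffle(MVT::v4i32, DL, V0, V1, Mask);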
SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
This instruction constructs a fixed permutation of two input vectors.
VectorType * getType() const
Overload to return most specific vector type.
static bool isSingleSourceMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector.
static void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
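A minimal, self-contained sketch of the static ShuffleVectorInst mask classifiers listed above; the mask is an illustrative 4-element reversal.
int Mask[] = {3, 2, 1, 0};
bool OneSrc  = ShuffleVectorInst::isSingleSourceMask(Mask, /*NumSrcElts=*/4); // true
bool Reverse = ShuffleVectorInst::isReverseMask(Mask, /*NumSrcElts=*/4);      // true
int Index;
bool Extract = ShuffleVectorInst::isExtractSubvectorMask(Mask, 4, Index);     // false: not a contiguous slice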
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
static bool isSplatMask(const int *Mask, EVT VT)
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:427
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:818
void push_back(const T &Elt)
Definition: SmallVector.h:426
pointer data()
Return a pointer to the vector's buffer, even if empty().
Definition: SmallVector.h:299
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
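A minimal, self-contained sketch of the small containers above, deduplicating a worklist in the way many DAG combines do; the values are illustrative.
SmallVector<int, 8> Worklist = {1, 2, 2, 3};
SmallSet<int, 8> Visited;
SmallVector<int, 8> Unique;
for (int V : Worklist)
  if (Visited.insert(V).second) // .second is true only on first insertion
    Unique.push_back(V);
// Unique now holds {1, 2, 3}; Visited.count(2) == 1.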
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:33
An instruction for storing to memory.
Definition: Instructions.h:317
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition: StringRef.h:462
StringRef slice(size_t Start, size_t End) const
Return a reference to the substring from [Start, End).
Definition: StringRef.h:676
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:137
StringRef save(const char *S)
Definition: StringSaver.h:30
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
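A minimal, self-contained sketch of the string utilities above, in the style of inline-asm register-name parsing; the register names are illustrative.
StringRef Name = "x29";
unsigned RegNo = 0;
// getAsInteger returns true on failure, false on success.
bool Bad = Name.slice(1, Name.size()).getAsInteger(10, RegNo); // RegNo == 29
int Kind = StringSwitch<int>(Name)
               .Case("sp", 0)
               .Case("x29", 1)
               .Default(-1);                                   // Kind == 1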
Class to represent struct types.
Definition: DerivedTypes.h:216
static StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition: Type.cpp:373
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
EVT getMemValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
virtual Value * getSafeStackPointerLocation(IRBuilderBase &IRB) const
Returns the target-specific address of the unsafe stack pointer.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
virtual bool shouldLocalize(const MachineInstr &MI, const TargetTransformInfo *TTI) const
Check whether or not MI needs to be moved close to its uses.
void setMaximumJumpTableSize(unsigned)
Indicate the maximum number of entries in jump tables.
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setMaxBytesForAlignment(unsigned MaxBytes)
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual Value * getSDagStackGuard(const Module &M) const
Return the variable that's previously inserted by insertSSPDeclarations, if any, otherwise return nul...
void setIndexedLoadAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
unsigned getMaximumJumpTableSize() const
Return upper limit for number of entries in a jump table.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
virtual Function * getSSPStackGuardCheck(const Module &M) const
If the target has a standard stack protection check function that performs validation and error handl...
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
virtual Value * getIRStackGuard(IRBuilderBase &IRB) const
If the target has a standard location for the stack protector guard, returns the address of that loca...
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setIndexedStoreAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
void setLibcallName(RTLIB::Libcall Call, const char *Name)
Rename the default libcall routine name for the specified libcall.
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
virtual bool isLegalAddImmediate(int64_t) const
Return true if the specified immediate is a legal add immediate, that is the target has add instruction...
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const
Return true if it is profitable to reduce a load to a smaller type.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
bool isLoadExtLegalOrCustom(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal or custom on this target.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
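A minimal sketch of how a target's TargetLowering constructor typically drives the configuration hooks above. It assumes the code runs inside such a constructor with a Subtarget pointer in scope; the specific register class, types, and actions are illustrative choices, not necessarily the exact configuration this file uses.
addRegisterClass(MVT::i64, &AArch64::GPR64RegClass);
setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
// Derive register properties once all register classes are registered.
computeRegisterProperties(Subtarget->getRegisterInfo());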
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
virtual EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
MVT getFrameIndexTy(const DataLayout &DL) const
Return the type for frame index, which is determined by the alloca address space specified through th...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
MVT getRegisterType(MVT VT) const
Return the type of registers that this ValueType will eventually require.
virtual void insertSSPDeclarations(Module &M) const
Inserts necessary declarations for SSP (stack protection) purpose.
virtual bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions. Ref: "Hacker's Delight" by Henry Warren 1...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
virtual bool useLoadStackGuardNode() const
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
virtual bool isTargetCanonicalConstantNode(SDValue Op) const
Returns true if the given Opc is considered a canonical constant for the target, which should not be ...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool isPositionIndependent() const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op; at this point, we know that only the DemandedBits bits of the result of Op are ever used downstream.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:76
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned TLSSize
Bit size of immediate TLS offsets (0 == use the default).
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TargetCostKind CostKind) const
Return the expected cost of materializing for the given integer immediate of the specified type.
@ TCC_Free
Expected to fold away in lowering.
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
bool isOSMSVCRT() const
Is this a "Windows" OS targeting a "MSVCRT.dll" environment.
Definition: Triple.h:667
bool isWindowsMSVCEnvironment() const
Checks if the environment could be MSVC.
Definition: Triple.h:634
This class represents a truncation of integer types.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:342
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
static Type * getHalfTy(LLVMContext &C)
static Type * getDoubleTy(LLVMContext &C)
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
bool isArrayTy() const
True if this is an instance of ArrayType.
Definition: Type.h:252
static Type * getBFloatTy(LLVMContext &C)
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
static IntegerType * getInt1Ty(LLVMContext &C)
@ FloatTyID
32-bit floating point type
Definition: Type.h:58
@ DoubleTyID
64-bit floating point type
Definition: Type.h:59
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:302
static IntegerType * getInt16Ty(LLVMContext &C)
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
static IntegerType * getInt8Ty(LLVMContext &C)
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
bool isScalableTy() const
Return true if this is a type whose size is a known multiple of vscale.
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
static Type * getFloatTy(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:137
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getContainedType(unsigned i) const
This method is used to implement the type iterator (defined at the end of the file).
Definition: Type.h:377
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1808
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
const Use & getOperandUse(unsigned i) const
Definition: User.h:182
Value * getOperand(unsigned i) const
Definition: User.h:169
unsigned getNumOperands() const
Definition: User.h:191
This class is used to represent EVT's, which are used to parameterize some operations.
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
static VectorType * getHalfElementsVectorType(VectorType *VTy)
This static method returns a VectorType with half as many elements as the input type and the same ele...
Definition: DerivedTypes.h:507
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:641
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
Definition: DerivedTypes.h:454
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Definition: Type.cpp:676
static VectorType * getTruncatedElementVectorType(VectorType *VTy)
Definition: DerivedTypes.h:472
Type * getElementType() const
Definition: DerivedTypes.h:436
This class represents zero extension of integer types.
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:199
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition: TypeSize.h:251
self_iterator getIterator()
Definition: ilist_node.h:109
#define UINT64_MAX
Definition: DataTypes.h:77
#define INT64_MAX
Definition: DataTypes.h:71
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static CondCode getInvertedCondCode(CondCode Code)
static unsigned getNZCVToSatisfyCondCode(CondCode Code)
Given a condition code, return NZCV flags that would satisfy that condition.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_NC
MO_NC - Indicates whether the linker is expected to check the symbol reference for overflow.
@ MO_G1
MO_G1 - A symbol operand with this flag (granule 1) represents the bits 16-31 of a 64-bit address,...
@ MO_PAGEOFF
MO_PAGEOFF - A symbol operand with this flag represents the offset of that symbol within a 4K page.
@ MO_GOT
MO_GOT - This flag indicates that a symbol operand represents the address of the GOT entry for the sy...
@ MO_G0
MO_G0 - A symbol operand with this flag (granule 0) represents the bits 0-15 of a 64-bit address,...
@ MO_PAGE
MO_PAGE - A symbol operand with this flag represents the pc-relative offset of the 4K page containing...
@ MO_HI12
MO_HI12 - This flag indicates that a symbol operand represents the bits 13-24 of a 64-bit address,...
@ MO_TLS
MO_TLS - Indicates that the operand being accessed is some kind of thread-local symbol.
@ MO_G2
MO_G2 - A symbol operand with this flag (granule 2) represents the bits 32-47 of a 64-bit address,...
@ MO_G3
MO_G3 - A symbol operand with this flag (granule 3) represents the high 16-bits of a 64-bit address,...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
@ NVCAST
Natural vector cast.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
static uint8_t encodeAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType9(uint64_t Imm)
static bool isAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType5(uint64_t Imm)
static int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType10(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType9(uint64_t Imm)
static uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize)
encodeLogicalImmediate - Return the encoded immediate value for a logical immediate instruction of th...
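A minimal sketch of the AArch64_AM logical-immediate helpers above, as used when deciding whether a bitwise constant can be encoded directly in an AND/ORR/EOR; the immediate value is an illustrative encodable pattern.
uint64_t Imm = 0x00FF00FF00FF00FFULL; // repeating 16-bit element pattern, encodable
if (AArch64_AM::isLogicalImmediate(Imm, /*regSize=*/64)) {
  uint64_t Enc = AArch64_AM::encodeLogicalImmediate(Imm, 64);
  // Enc packs the N:immr:imms fields consumed by the logical instructions.
  (void)Enc;
}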
static bool isAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType5(uint64_t Imm)
static int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
static bool isAdvSIMDModImmType10(uint64_t Imm)
static int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType8(uint64_t Imm)
static bool isAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType11(uint64_t Imm)
static bool isAdvSIMDModImmType11(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType6(uint64_t Imm)
static bool isAdvSIMDModImmType8(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType6(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType1(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType1(uint64_t Imm)
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
ArrayRef< MCPhysReg > getFPRArgRegs()
int getSMEPseudoMap(uint16_t Opcode)
static constexpr unsigned SVEMaxBitsPerVector
const unsigned RoundingBitsPos
const uint64_t ReservedFPControlBits
static constexpr unsigned SVEBitsPerBlock
ArrayRef< MCPhysReg > getGPRArgRegs()
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition: CallingConv.h:24
@ ARM64EC_Thunk_Native
Calling convention used in the ARM64EC ABI to implement calls between ARM64 code and thunks.
Definition: CallingConv.h:265
@ AArch64_VectorCall
Used between AArch64 Advanced SIMD functions.
Definition: CallingConv.h:221
@ Swift
Calling convention for Swift.
Definition: CallingConv.h:69
@ AArch64_SVE_VectorCall
Used between AArch64 SVE functions.
Definition: CallingConv.h:224
@ CFGuard_Check
Special calling convention on Windows for calling the Control Guard Check ICall function.
Definition: CallingConv.h:82
@ PreserveMost
Used for runtime calls that preserves most registers.
Definition: CallingConv.h:63
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2
Preserve X2-X15, X19-X29, SP, Z0-Z31, P0-P15.
Definition: CallingConv.h:241
@ CXX_FAST_TLS
Used for access functions.
Definition: CallingConv.h:72
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0
Preserve X0-X13, X19-X29, SP, Z0-Z31, P0-P15.
Definition: CallingConv.h:238
@ GHC
Used by the Glasgow Haskell Compiler (GHC).
Definition: CallingConv.h:50
@ PreserveAll
Used for runtime calls that preserve (almost) all registers.
Definition: CallingConv.h:66
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ Win64
The C convention as implemented on Windows/x86-64 and AArch64.
Definition: CallingConv.h:159
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition: CallingConv.h:87
@ GRAAL
Used by GraalVM. Two additional registers are reserved.
Definition: CallingConv.h:255
@ ARM64EC_Thunk_X64
Calling convention used in the ARM64EC ABI to implement calls between x64 code and thunks.
Definition: CallingConv.h:260
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:751
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:237
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
Definition: ISDOpcodes.h:1133
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1129
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition: ISDOpcodes.h:477
@ VECREDUCE_SEQ_FADD
Generic reduction nodes.
Definition: ISDOpcodes.h:1346
@ VECREDUCE_SMIN
Definition: ISDOpcodes.h:1377
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:251
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:560
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:715
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition: ISDOpcodes.h:1162
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1248
@ STRICT_FCEIL
Definition: ISDOpcodes.h:427
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:240
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1038
@ SET_FPMODE
Sets the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1028
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:784
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:484
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:199
@ RETURNADDR
Definition: ISDOpcodes.h:95
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ STRICT_FMINIMUM
Definition: ISDOpcodes.h:437
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:791
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:544
@ VECREDUCE_FMAX
FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
Definition: ISDOpcodes.h:1362
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:391
@ VECREDUCE_FMAXIMUM
FMINIMUM/FMAXIMUM nodes propagate NaNs and signed zeroes using the llvm.minimum and llvm....
Definition: ISDOpcodes.h:1366
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:689
@ RESET_FPMODE
Sets default dynamic floating-point control modes.
Definition: ISDOpcodes.h:1032
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:256
@ VECREDUCE_SMAX
Definition: ISDOpcodes.h:1376
@ STRICT_FSETCCS
Definition: ISDOpcodes.h:478
@ STRICT_FLOG2
Definition: ISDOpcodes.h:422
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1274
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:904
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:230
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1275
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:940
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition: ISDOpcodes.h:412
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1407
@ GlobalTLSAddress
Definition: ISDOpcodes.h:79
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:886
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:775
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition: ISDOpcodes.h:663
@ STRICT_UINT_TO_FP
Definition: ISDOpcodes.h:451
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:621
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition: ISDOpcodes.h:101
@ VECREDUCE_FADD
These reductions have relaxed evaluation order semantics, and have a single vector operand.
Definition: ISDOpcodes.h:1359
@ WRITE_REGISTER
Definition: ISDOpcodes.h:119
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1228
@ VECREDUCE_FMIN
Definition: ISDOpcodes.h:1363
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:995
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition: ISDOpcodes.h:759
@ STRICT_LROUND
Definition: ISDOpcodes.h:432
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:931
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1084
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:328
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1059
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1063
@ VECTOR_INTERLEAVE
VECTOR_INTERLEAVE(VEC1, VEC2) - Returns two vectors with all input and output vectors having the same...
Definition: ISDOpcodes.h:587
@ STEP_VECTOR
STEP_VECTOR(IMM) - Returns a scalable vector whose lanes are comprised of a linear sequence of unsign...
Definition: ISDOpcodes.h:647
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:350
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:728
@ STRICT_FPOWI
Definition: ISDOpcodes.h:414
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1244
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:212
@ VECREDUCE_UMAX
Definition: ISDOpcodes.h:1378
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition: ISDOpcodes.h:628
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
Definition: ISDOpcodes.h:1158
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:324
@ STRICT_FTRUNC
Definition: ISDOpcodes.h:431
@ VECREDUCE_ADD
Integer reductions may have a result type larger than the vector element type.
Definition: ISDOpcodes.h:1371
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition: ISDOpcodes.h:881
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:652
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1023
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:706
@ ATOMIC_LOAD_CLR
Definition: ISDOpcodes.h:1273
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:601
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1272
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:574
@ STRICT_FMAXIMUM
Definition: ISDOpcodes.h:436
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition: ISDOpcodes.h:47
@ STRICT_FMAXNUM
Definition: ISDOpcodes.h:425
@ READ_REGISTER
READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on the DAG, which implements the n...
Definition: ISDOpcodes.h:118
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:536
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:203
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:781
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1218
@ FP_TO_UINT_SAT
Definition: ISDOpcodes.h:857
@ STRICT_FMINNUM
Definition: ISDOpcodes.h:426
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:743
@ VSCALE
VSCALE(IMM) - Returns the runtime scaling factor used to calculate the number of elements within a sc...
Definition: ISDOpcodes.h:1336
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1255
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:972
@ UBSANTRAP
UBSANTRAP - Trap with an immediate describing the kind of sanitizer failure.
Definition: ISDOpcodes.h:1222
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:332
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1048
@ STRICT_LRINT
Definition: ISDOpcodes.h:434
@ ConstantPool
Definition: ISDOpcodes.h:82
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:799
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:675
@ VECTOR_REVERSE
VECTOR_REVERSE(VECTOR) - Returns a vector, of the same type as VECTOR, whose elements are shuffled us...
Definition: ISDOpcodes.h:592
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:889
@ STRICT_FROUND
Definition: ISDOpcodes.h:429
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:737
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:304
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition: ISDOpcodes.h:450
@ VECREDUCE_UMIN
Definition: ISDOpcodes.h:1379
@ STRICT_FFLOOR
Definition: ISDOpcodes.h:428
@ STRICT_FROUNDEVEN
Definition: ISDOpcodes.h:430
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition: ISDOpcodes.h:94
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1270
@ STRICT_FP_TO_UINT
Definition: ISDOpcodes.h:444
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:466
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:443
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:991
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1271
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:837
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1189
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition: ISDOpcodes.h:158
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:471
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:681
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1215
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:184
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition: ISDOpcodes.h:658
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition: ISDOpcodes.h:401
@ STRICT_FLOG10
Definition: ISDOpcodes.h:421
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:525
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ STRICT_LLRINT
Definition: ISDOpcodes.h:435
@ VECTOR_SPLICE
VECTOR_SPLICE(VEC1, VEC2, IMM) - Returns a subvector of the same type as VEC1/VEC2 from CONCAT_VECTOR...
Definition: ISDOpcodes.h:613
@ STRICT_FEXP2
Definition: ISDOpcodes.h:419
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1269
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:870
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition: ISDOpcodes.h:106
@ STRICT_LLROUND
Definition: ISDOpcodes.h:433
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition: ISDOpcodes.h:832
@ STRICT_FNEARBYINT
Definition: ISDOpcodes.h:424
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition: ISDOpcodes.h:856
@ VECREDUCE_FMINIMUM
Definition: ISDOpcodes.h:1367
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:787
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition: ISDOpcodes.h:1153
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1077
@ BlockAddress
Definition: ISDOpcodes.h:84
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:764
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:494
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:341
@ AssertZext
Definition: ISDOpcodes.h:62
@ STRICT_FRINT
Definition: ISDOpcodes.h:423
@ VECTOR_DEINTERLEAVE
VECTOR_DEINTERLEAVE(VEC1, VEC2) - Returns two vectors with all input and output vectors having the sa...
Definition: ISDOpcodes.h:581
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:314
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:192
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:516
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
bool isExtOpcode(unsigned Opcode)
Definition: ISDOpcodes.h:1601
bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
bool isVectorShrinkable(const SDNode *N, unsigned NewEltSize, bool Signed)
Returns true if the specified node is a vector where all elements can be truncated to the specified e...
CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
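A minimal sketch of the condition-code helpers above, assuming an integer setcc is being rewritten; MVT::i32 stands in for the comparison's operand type.
ISD::CondCode CC       = ISD::SETLT;
ISD::CondCode Swapped  = ISD::getSetCCSwappedOperands(CC);   // SETGT: X < Y  <=>  Y > X
ISD::CondCode Inverted = ISD::getSetCCInverse(CC, MVT::i32); // SETGE: !(X < Y)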
MemIndexType
MemIndexType enum - This enum defines how to interpret MGATHER/SCATTER's index parameter when calcula...
Definition: ISDOpcodes.h:1492
bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
Definition: ISDOpcodes.h:1479
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1530
bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1510
static const int LAST_INDEXED_MODE
Definition: ISDOpcodes.h:1481
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1471
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:816
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:875
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
VScaleVal_match m_VScale()
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< cst_pred_ty< is_all_ones >, ValTy, Instruction::Xor, true > m_Not(const ValTy &V)
Matches a 'Not' as 'xor V, -1' or 'xor -1, V'.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
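A minimal sketch of the IR pattern matchers above, assuming a Value *V to inspect; the widening-multiply pattern is illustrative.
using namespace PatternMatch;
Value *X = nullptr, *Y = nullptr;
// Match (zext X) * (zext Y) and capture the narrow operands.
if (match(V, m_Mul(m_ZExt(m_Value(X)), m_ZExt(m_Value(Y))))) {
  // X and Y are the pre-extension multiplicands.
}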
Libcall
RTLIB::Libcall enum - This enum defines all of the runtime library calls the backend can emit.
@ Define
Register definition.
@ GeneralDynamic
Definition: CodeGen.h:46
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
std::optional< Function * > getAttachedARCFunction(const CallBase *CB)
This function returns operand bundle clang_arc_attachedcall's argument, which is the address of the A...
Definition: ObjCARCUtil.h:43
bool hasAttachedCallOpBundle(const CallBase *CB)
Definition: ObjCARCUtil.h:29
DiagnosticInfoOptimizationBase::Argument NV
@ FalseVal
Definition: TGLexer.h:59
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
bool isPackedVectorType(EVT SomeVT)
Definition: VECustomDAG.cpp:22
bool RetCC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
@ Offset
Definition: DWP.cpp:456
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition: STLExtras.h:853
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1742
bool CC_AArch64_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &DL, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
void GetReturnInfo(CallingConv::ID CC, Type *ReturnType, AttributeList attr, SmallVectorImpl< ISD::OutputArg > &Outs, const TargetLowering &TLI, const DataLayout &DL)
Given an LLVM IR type and return type attributes, compute the return value EVTs and flags,...
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
std::optional< APInt > getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT, return the corresponding value.
Definition: Utils.cpp:295
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:239
bool CC_AArch64_DarwinPCS_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
bool CC_AArch64_Win64PCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition: MathExtras.h:343
bool RetCC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
bool isIntOrFPConstant(SDValue V)
Return true if V is either an integer or FP constant.
bool CC_AArch64_Win64_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool isTRNMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResult)
Return true for trn1 or trn2 masks of the form: <0, 8, 2, 10, 4, 12, 6, 14> or <1,...
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
std::optional< unsigned > getSVEPredPatternFromNumElements(unsigned MinNumElts)
Return specific VL predicate pattern based on the number of elements.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:280
bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition: Utils.cpp:1509
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:330
bool CC_AArch64_Arm64EC_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:269
unsigned M1(unsigned Val)
Definition: VE.h:376
bool isReleaseOrStronger(AtomicOrdering AO)
static Error getOffset(const SymbolRef &Sym, SectionRef Sec, uint64_t &Result)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:324
bool CC_AArch64_Win64_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:275
bool CC_AArch64_Arm64EC_Thunk_Native(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool CC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
bool CC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
constexpr bool isMask_64(uint64_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition: MathExtras.h:257
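A minimal usage sketch of the bit-manipulation helpers listed above (isPowerOf2_64, Log2_64, countr_zero, isMask_64, isShiftedMask_64), assuming the usual llvm/Support/MathExtras.h and llvm/ADT/bit.h headers; the function name bitMathExamples is hypothetical and only groups the calls.

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"

// Hypothetical helper grouping a few calls; the expected values in the
// comments follow directly from the documented semantics.
void bitMathExamples() {
  bool P = llvm::isPowerOf2_64(64);         // true: 64 == 1 << 6
  unsigned L = llvm::Log2_64(64);           // 6: floor(log2(64))
  int TZ = llvm::countr_zero(0x50u);        // 4: 0b0101'0000 has four trailing zeros

  bool M = llvm::isMask_64(0x00FFu);        // true: contiguous ones starting at bit 0
  bool S = llvm::isShiftedMask_64(0x0FF0u); // true: contiguous ones, shifted left
  (void)P; (void)L; (void)TZ; (void)M; (void)S;
}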
bool isREVMask(ArrayRef< int > M, unsigned EltSize, unsigned NumElts, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
bool RetCC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
constexpr int PoisonMaskElem
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Other
Any other memory.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
CombineLevel
Definition: DAGCombine.h:15
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> or <4, 12,...
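A sketch of how the shuffle-mask predicates above (isZIPMask, isUZPMask) classify constant masks. These are AArch64 backend helpers, so the sketch assumes code living inside the backend (the AArch64PerfectShuffle.h header named in the comment is an assumption); classifyMask is a hypothetical function.

#include "llvm/ADT/ArrayRef.h"
// Assumed backend-internal header declaring isZIPMask/isUZPMask.
// #include "AArch64PerfectShuffle.h"

void classifyMask() {
  // <0, 8, 1, 9, 2, 10, 3, 11> interleaves the low halves of two
  // 8-element vectors, i.e. a zip1 pattern.
  int Zip1[] = {0, 8, 1, 9, 2, 10, 3, 11};
  unsigned WhichResult = 0;
  bool IsZip = llvm::isZIPMask(Zip1, /*NumElts=*/8, WhichResult);
  // IsZip == true; WhichResult == 0 selects zip1, 1 would select zip2.

  // <0, 2, 4, 6, 8, 10, 12, 14> keeps the even lanes: a uzp1 pattern.
  int Uzp1[] = {0, 2, 4, 6, 8, 10, 12, 14};
  bool IsUzp = llvm::isUZPMask(Uzp1, /*NumElts=*/8, WhichResult);
  // IsUzp == true; WhichResult == 0 selects uzp1.
  (void)IsZip; (void)IsUzp;
}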
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:244
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
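A brief sketch of the width-check and rounding helpers above (isUIntN, isIntN, alignTo), assuming llvm/Support/MathExtras.h and llvm/Support/Alignment.h; widthAndAlignment is a hypothetical function used only to frame the calls.

#include "llvm/Support/Alignment.h"
#include "llvm/Support/MathExtras.h"

void widthAndAlignment() {
  bool A = llvm::isUIntN(8, 255);   // true: 255 fits in an unsigned 8-bit field
  bool B = llvm::isUIntN(8, 256);   // false: needs 9 bits
  bool C = llvm::isIntN(8, -128);   // true: -128 fits in a signed 8-bit field

  // Round 10 bytes up to the next 8-byte boundary.
  uint64_t Sz = llvm::alignTo(10, llvm::Align(8));  // 16
  (void)A; (void)B; (void)C; (void)Sz;
}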
bool CC_AArch64_DarwinPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition: VE.h:375
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition: Analysis.cpp:79
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
bool isAcquireOrStronger(AtomicOrdering AO)
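A small sketch of the atomic-ordering predicates (isAcquireOrStronger, isReleaseOrStronger) from llvm/Support/AtomicOrdering.h; orderingChecks is a hypothetical function.

#include "llvm/Support/AtomicOrdering.h"

void orderingChecks() {
  using llvm::AtomicOrdering;
  bool A = llvm::isAcquireOrStronger(AtomicOrdering::Acquire);                // true
  bool B = llvm::isAcquireOrStronger(AtomicOrdering::Monotonic);              // false
  bool C = llvm::isReleaseOrStronger(AtomicOrdering::SequentiallyConsistent); // true
  (void)A; (void)B; (void)C;
}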
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
gep_type_iterator gep_type_begin(const User *GEP)
bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition: STLExtras.h:2051
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
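A minimal sketch of the range-based STLExtras helpers listed above (any_of, find_if, erase_if, is_contained); rangeHelpers is a hypothetical function.

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

void rangeHelpers() {
  llvm::SmallVector<int, 8> V = {1, 2, 3, 4, 5};

  bool HasEven = llvm::any_of(V, [](int X) { return X % 2 == 0; });  // true
  auto It = llvm::find_if(V, [](int X) { return X > 3; });           // points at 4
  bool HasThree = llvm::is_contained(V, 3);                          // true

  llvm::erase_if(V, [](int X) { return X % 2 != 0; });               // V == {2, 4}
  (void)HasEven; (void)It; (void)HasThree;
}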
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
bool CC_AArch64_DarwinPCS_ILP32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition: STLExtras.h:2039
static const MachineMemOperand::Flags MOStridedAccess
bool CC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
@ Default
The result values are uniform if and only if all operands are uniform.
llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
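A brief sketch of createSequentialMask, assuming it is declared in llvm/Analysis/VectorUtils.h; sequentialMask is a hypothetical function and the expected contents are stated in the comment.

#include "llvm/Analysis/VectorUtils.h"

void sequentialMask() {
  // Lanes Start..Start+NumInts-1, followed by NumUndefs undef (-1) lanes.
  llvm::SmallVector<int, 16> Mask =
      llvm::createSequentialMask(/*Start=*/2, /*NumInts=*/3, /*NumUndefs=*/1);
  // Mask should be {2, 3, 4, -1}.
  (void)Mask;
}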
bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
static const unsigned PerfectShuffleTable[6561+1]
@ Enable
Enable colors.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
Helper structure to keep track of a SET_CC lowered into AArch64 code.
AArch64CC::CondCode CC
Helper structure to keep track of ISD::SET_CC operands.
This is used by foldLoadsRecursive() to capture a Root Load node which is of type or(load,...
Helper structure to be able to read SetCC information.
static unsigned int semanticsPrecision(const fltSemantics &)
Definition: APFloat.cpp:292
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Represent subnormal handling kind for floating point instruction inputs and outputs.
Extended Value Type.
Definition: ValueTypes.h:34
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition: ValueTypes.h:93
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:380
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:73
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:120
uint64_t getScalarStoreSize() const
Definition: ValueTypes.h:387
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition: ValueTypes.h:274
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:290
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:146
ElementCount getVectorElementCount() const
Definition: ValueTypes.h:340
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:448
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition: ValueTypes.h:112
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition: ValueTypes.h:349
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition: ValueTypes.h:415
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:455
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:397
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition: ValueTypes.h:203
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:366
EVT widenIntegerVectorElementType(LLVMContext &Context) const
Return a VT for an integer vector type with the size of the elements doubled.
Definition: ValueTypes.h:429
bool isScalableVT() const
Return true if the type is a scalable type.
Definition: ValueTypes.h:183
bool isFixedLengthVector() const
Definition: ValueTypes.h:177
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition: ValueTypes.h:58
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:167
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:313
bool bitsGE(EVT VT) const
Return true if this has no less bits than VT.
Definition: ValueTypes.h:282
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition: ValueTypes.h:208
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:246
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:202
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition: ValueTypes.h:173
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:156
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition: ValueTypes.h:101
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:438
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:151
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition: ValueTypes.h:198
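A short sketch of a few EVT queries documented above (isFloatingPoint, getVectorNumElements, getFixedSizeInBits, changeVectorElementTypeToInteger, getVectorVT), assuming llvm/CodeGen/ValueTypes.h; evtQueries is a hypothetical function.

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"

void evtQueries(llvm::LLVMContext &Ctx) {
  llvm::EVT VT = llvm::MVT::v4f32;   // fixed-width vector of four floats

  bool FP = VT.isFloatingPoint();               // true
  unsigned NumElts = VT.getVectorNumElements(); // 4
  uint64_t Bits = VT.getFixedSizeInBits();      // 128, so is128BitVector() holds

  // v4f32 -> v4i32: same element count, integer elements of equal width.
  llvm::EVT IntVT = VT.changeVectorElementTypeToInteger();

  // Build an equivalent vector type explicitly.
  llvm::EVT Same = llvm::EVT::getVectorVT(Ctx, llvm::MVT::f32, 4);
  (void)FP; (void)NumElts; (void)Bits; (void)IntVT; (void)Same;
}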
Describes a register that needs to be forwarded from the prologue to a musttail call.
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition: KnownBits.h:297
static KnownBits ashr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for ashr(LHS, RHS).
Definition: KnownBits.cpp:434
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition: KnownBits.h:157
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:40
static KnownBits lshr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for lshr(LHS, RHS).
Definition: KnownBits.cpp:376
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition: KnownBits.h:292
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition: KnownBits.h:307
static KnownBits shl(const KnownBits &LHS, const KnownBits &RHS, bool NUW=false, bool NSW=false, bool ShAmtNonZero=false)
Compute known bits for shl(LHS, RHS).
Definition: KnownBits.cpp:291
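A minimal sketch of the KnownBits operations listed above (makeConstant, countMaxActiveBits, shl, trunc), assuming llvm/Support/KnownBits.h; knownBitsSketch is a hypothetical function and the comments describe the expected results under these assumptions.

#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"

void knownBitsSketch() {
  // All eight bits of the constant 12 (0b00001100) are known.
  llvm::KnownBits K = llvm::KnownBits::makeConstant(llvm::APInt(8, 12));
  unsigned Active = K.countMaxActiveBits();   // 4: values up to 12 need at most 4 bits

  // Shifting the known constant 12 left by a known amount of 1 should yield
  // a fully known result of 24.
  llvm::KnownBits One = llvm::KnownBits::makeConstant(llvm::APInt(8, 1));
  llvm::KnownBits Shl = llvm::KnownBits::shl(K, One);

  // Truncating to 4 bits keeps the information about the low bits.
  llvm::KnownBits Low = K.trunc(4);
  (void)Active; (void)Shl; (void)Low;
}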
Structure used to represent pair of argument number after call lowering and register used to transfer...
SmallVector< ArgRegPair, 1 > ArgRegPairs
Vector of call argument and its forwarding register.
This class contains a discriminated union of information about pointers in memory operands,...
unsigned getAddrSpace() const
Return the LLVM IR address space number that this pointer points into.
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
static MachinePointerInfo getUnknownStack(MachineFunction &MF)
Stack memory without other information.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
Constraint for a predicate of the form "cmp Pred Op, OtherOp", where Op is the value the constraint a...
Definition: PredicateInfo.h:74
These are IR-level optimization flags that may be propagated to SDNodes.
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:254
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::OutputArg, 32 > Outs
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
bool CombineTo(SDValue O, SDValue N)
Helper structure to keep track of SetCC information.
GenericSetCCInfo Generic
AArch64SetCCInfo AArch64