llvm.org GIT mirror llvm / 5747888
X86-FMA3: Implemented commute transformation for EVEX/AVX512 FMA3 opcodes. This helped to improve memory-folding and register coalescing optimizations. Also, this patch fixed tracker issue #17229. Reviewer: Craig Topper. Differential Revision: https://reviews.llvm.org/D23108 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@278431 91177308-0d34-0410-b5e6-96231b3b80d8 Vyacheslav Klochkov 4 years ago
10 changed file(s) with 795 addition(s) and 613 deletion(s). Raw diff Collapse all Expand all
2323 X86FrameLowering.cpp
2424 X86ISelDAGToDAG.cpp
2525 X86ISelLowering.cpp
26 X86InstrFMA3Info.cpp
2627 X86InstrInfo.cpp
2728 X86MCInstLower.cpp
2829 X86MachineFunctionInfo.cpp
193193 list ZeroMaskingPattern,
194194 string MaskingConstraint = "",
195195 InstrItinClass itin = NoItinerary,
196 bit IsCommutable = 0> {
196 bit IsCommutable = 0,
197 bit IsKCommutable = 0> {
197198 let isCommutable = IsCommutable in
198199 def NAME: AVX512
199200 OpcodeStr#"\t{"#AttSrcAsm#", $dst|"#
201202 Pattern, itin>;
202203
203204 // Prefer over VMOV*rrk Pat<>
204 let AddedComplexity = 20 in
205 let AddedComplexity = 20, isCommutable = IsKCommutable in
205206 def NAME#k: AVX512
206207 OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"#
207208 "$dst {${mask}}, "#IntelSrcAsm#"}",
209210 EVEX_K {
210211 // In case of the 3src subclass this is overridden with a let.
211212 string Constraints = MaskingConstraint;
212 }
213 let AddedComplexity = 30 in // Prefer over VMOV*rrkz Pat<>
213 }
214
215 // Zero mask does not add any restrictions to commute operands transformation.
216 // So, it is Ok to use IsCommutable instead of IsKCommutable.
217 let AddedComplexity = 30, isCommutable = IsCommutable in // Prefer over VMOV*rrkz Pat<>
214218 def NAME#kz: AVX512
215219 OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}} {z}|"#
216220 "$dst {${mask}} {z}, "#IntelSrcAsm#"}",
230234 SDNode Select = vselect,
231235 string MaskingConstraint = "",
232236 InstrItinClass itin = NoItinerary,
233 bit IsCommutable = 0> :
237 bit IsCommutable = 0,
238 bit IsKCommutable = 0> :
234239 AVX512_maskable_custom
235240 AttSrcAsm, IntelSrcAsm,
236241 [(set _.RC:$dst, RHS)],
237242 [(set _.RC:$dst, MaskingRHS)],
238243 [(set _.RC:$dst,
239244 (Select _.KRCWM:$mask, RHS, _.ImmAllZerosV))],
240 MaskingConstraint, NoItinerary, IsCommutable>;
245 MaskingConstraint, NoItinerary, IsCommutable,
246 IsKCommutable>;
241247
242248 // This multiclass generates the unconditional/non-masking, the masking and
243249 // the zero-masking variant of the vector instruction. In the masking case, the
247253 string AttSrcAsm, string IntelSrcAsm,
248254 dag RHS,
249255 InstrItinClass itin = NoItinerary,
250 bit IsCommutable = 0, SDNode Select = vselect> :
256 bit IsCommutable = 0, bit IsKCommutable = 0,
257 SDNode Select = vselect> :
251258 AVX512_maskable_common
252259 !con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
253260 !con((ins _.KRCWM:$mask), Ins),
254261 OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
255262 (Select _.KRCWM:$mask, RHS, _.RC:$src0), Select,
256 "$src0 = $dst", itin, IsCommutable>;
263 "$src0 = $dst", itin, IsCommutable, IsKCommutable>;
257264
258265 // This multiclass generates the unconditional/non-masking, the masking and
259266 // the zero-masking variant of the scalar instruction.
277284 multiclass AVX512_maskable_3src O, Format F, X86VectorVTInfo _,
278285 dag Outs, dag NonTiedIns, string OpcodeStr,
279286 string AttSrcAsm, string IntelSrcAsm,
280 dag RHS> :
287 dag RHS, bit IsCommutable = 0,
288 bit IsKCommutable = 0> :
281289 AVX512_maskable_common
282290 !con((ins _.RC:$src1), NonTiedIns),
283291 !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
284292 !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
285293 OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
286 (vselect _.KRCWM:$mask, RHS, _.RC:$src1)>;
287
288 // Similar to AVX512_maskable_3rc but in this case the input VT for the tied
294 (vselect _.KRCWM:$mask, RHS, _.RC:$src1),
295 vselect, "", NoItinerary, IsCommutable, IsKCommutable>;
296
297 // Similar to AVX512_maskable_3src but in this case the input VT for the tied
289298 // operand differs from the output VT. This requires a bitconvert on
290299 // the preserved vector going into the vselect.
291300 multiclass AVX512_maskable_3src_cast O, Format F, X86VectorVTInfo OutVT,
304313 multiclass AVX512_maskable_3src_scalar O, Format F, X86VectorVTInfo _,
305314 dag Outs, dag NonTiedIns, string OpcodeStr,
306315 string AttSrcAsm, string IntelSrcAsm,
307 dag RHS> :
316 dag RHS, bit IsCommutable = 0,
317 bit IsKCommutable = 0> :
308318 AVX512_maskable_common
309319 !con((ins _.RC:$src1), NonTiedIns),
310320 !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
311321 !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
312322 OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
313323 (X86selects _.KRCWM:$mask, RHS, _.RC:$src1),
314 X86selects>;
324 X86selects, "", NoItinerary, IsCommutable,
325 IsKCommutable>;
315326
316327 multiclass AVX512_maskable_in_asm O, Format F, X86VectorVTInfo _,
317328 dag Outs, dag Ins,
48414852 defm r: AVX512_maskable_3src
48424853 (ins _.RC:$src2, _.RC:$src3),
48434854 OpcodeStr, "$src3, $src2", "$src2, $src3",
4844 (_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3))>,
4855 (_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)), 1, 1>,
48454856 AVX512FMA3Base;
48464857
48474858 defm m: AVX512_maskable_3src
48484859 (ins _.RC:$src2, _.MemOp:$src3),
48494860 OpcodeStr, "$src3, $src2", "$src2, $src3",
4850 (_.VT (OpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3)))>,
4861 (_.VT (OpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3))), 1, 0>,
48514862 AVX512FMA3Base;
48524863
48534864 defm mb: AVX512_maskable_3src
48554866 OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
48564867 !strconcat("$src2, ${src3}", _.BroadcastStr ),
48574868 (OpNode _.RC:$src2,
4858 _.RC:$src1,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))>,
4869 _.RC:$src1,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))), 1, 0>,
48594870 AVX512FMA3Base, EVEX_B;
48604871 }
48614872
48744885 defm rb: AVX512_maskable_3src
48754886 (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
48764887 OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
4877 (_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 imm:$rc)))>,
4888 (_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 imm:$rc))), 1, 1>,
48784889 AVX512FMA3Base, EVEX_B, EVEX_RC;
48794890 }
48804891
49164927 defm r: AVX512_maskable_3src
49174928 (ins _.RC:$src2, _.RC:$src3),
49184929 OpcodeStr, "$src3, $src2", "$src2, $src3",
4919 (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1))>,
4930 (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1>,
49204931 AVX512FMA3Base;
49214932
49224933 defm m: AVX512_maskable_3src
49234934 (ins _.RC:$src2, _.MemOp:$src3),
49244935 OpcodeStr, "$src3, $src2", "$src2, $src3",
4925 (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1))>,
4936 (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)), 1, 0>,
49264937 AVX512FMA3Base;
49274938
49284939 defm mb: AVX512_maskable_3src
49314942 "$src2, ${src3}"##_.BroadcastStr,
49324943 (_.VT (OpNode _.RC:$src2,
49334944 (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
4934 _.RC:$src1))>, AVX512FMA3Base, EVEX_B;
4945 _.RC:$src1)), 1, 0>, AVX512FMA3Base, EVEX_B;
49354946 }
49364947
49374948 // Additional patterns for folding broadcast nodes in other orders.
49594970 defm rb: AVX512_maskable_3src
49604971 (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
49614972 OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
4962 (_.VT ( OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 imm:$rc)))>,
4973 (_.VT ( OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 imm:$rc))), 1, 1>,
49634974 AVX512FMA3Base, EVEX_B, EVEX_RC;
49644975 }
49654976
60356046 (X86cvtps2ph (_src.VT _src.RC:$src1),
60366047 (i32 imm:$src2),
60376048 (i32 FROUND_CURRENT)),
6038 NoItinerary, 0, X86select>, AVX512AIi8Base;
6049 NoItinerary, 0, 0, X86select>, AVX512AIi8Base;
60396050 def mr : AVX512AIi8<0x1D, MRMDestMem, (outs),
60406051 (ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2),
60416052 "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
60556066 (X86cvtps2ph (_src.VT _src.RC:$src1),
60566067 (i32 imm:$src2),
60576068 (i32 FROUND_NO_EXC)),
6058 NoItinerary, 0, X86select>, EVEX_B, AVX512AIi8Base;
6069 NoItinerary, 0, 0, X86select>, EVEX_B, AVX512AIi8Base;
60596070 }
60606071 let Predicates = [HasAVX512] in {
60616072 defm VCVTPS2PHZ : avx512_cvtps2ph,
0 //===-- X86InstrFMA3Info.cpp - X86 FMA3 Instruction Information -----------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the implementation of the classes providing information
10 // about existing X86 FMA3 opcodes, classifying and grouping them.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "X86InstrFMA3Info.h"
15 #include "X86InstrInfo.h"
16 #include "llvm/Support/ManagedStatic.h"
17 #include "llvm/Support/Threading.h"
18
19 /// This flag is used in the method llvm::call_once() used below to make the
20 /// initialization of the map 'OpcodeToGroup' thread safe.
21 LLVM_DEFINE_ONCE_FLAG(InitGroupsOnceFlag);
22
23 static ManagedStatic<X86InstrFMA3Info> X86InstrFMA3InfoObj;
/// Returns the pointer to the single process-wide X86InstrFMA3Info object.
/// The object is lazily constructed by the ManagedStatic wrapper on first use
/// and destroyed during llvm_shutdown().
X86InstrFMA3Info *X86InstrFMA3Info::getX86InstrFMA3Info() {
  return &*X86InstrFMA3InfoObj;
}
27
28 void X86InstrFMA3Info::initRMGroup(const uint16_t *RegOpcodes,
29 const uint16_t *MemOpcodes, unsigned Attr) {
30 // Create a new instance of this class that would hold a group of FMA opcodes.
31 X86InstrFMA3Group *G = new X86InstrFMA3Group(RegOpcodes, MemOpcodes, Attr);
32
33 // Add the references from indvidual opcodes to the group holding them.
34 assert((!OpcodeToGroup[RegOpcodes[0]] && !OpcodeToGroup[RegOpcodes[1]] &&
35 !OpcodeToGroup[RegOpcodes[2]] && !OpcodeToGroup[MemOpcodes[0]] &&
36 !OpcodeToGroup[MemOpcodes[1]] && !OpcodeToGroup[MemOpcodes[2]]) &&
37 "Duplication or rewrite of elements in OpcodeToGroup.");
38 OpcodeToGroup[RegOpcodes[0]] = G;
39 OpcodeToGroup[RegOpcodes[1]] = G;
40 OpcodeToGroup[RegOpcodes[2]] = G;
41 OpcodeToGroup[MemOpcodes[0]] = G;
42 OpcodeToGroup[MemOpcodes[1]] = G;
43 OpcodeToGroup[MemOpcodes[2]] = G;
44 }
45
46 void X86InstrFMA3Info::initRGroup(const uint16_t *RegOpcodes, unsigned Attr) {
47 // Create a new instance of this class that would hold a group of FMA opcodes.
48 X86InstrFMA3Group *G = new X86InstrFMA3Group(RegOpcodes, nullptr, Attr);
49
50 // Add the references from indvidual opcodes to the group holding them.
51 assert((!OpcodeToGroup[RegOpcodes[0]] && !OpcodeToGroup[RegOpcodes[1]] &&
52 !OpcodeToGroup[RegOpcodes[2]]) &&
53 "Duplication or rewrite of elements in OpcodeToGroup.");
54 OpcodeToGroup[RegOpcodes[0]] = G;
55 OpcodeToGroup[RegOpcodes[1]] = G;
56 OpcodeToGroup[RegOpcodes[2]] = G;
57 }
58
59 void X86InstrFMA3Info::initMGroup(const uint16_t *MemOpcodes, unsigned Attr) {
60 // Create a new instance of this class that would hold a group of FMA opcodes.
61 X86InstrFMA3Group *G = new X86InstrFMA3Group(nullptr, MemOpcodes, Attr);
62
63 // Add the references from indvidual opcodes to the group holding them.
64 assert((!OpcodeToGroup[MemOpcodes[0]] && !OpcodeToGroup[MemOpcodes[1]] &&
65 !OpcodeToGroup[MemOpcodes[2]]) &&
66 "Duplication or rewrite of elements in OpcodeToGroup.");
67 OpcodeToGroup[MemOpcodes[0]] = G;
68 OpcodeToGroup[MemOpcodes[1]] = G;
69 OpcodeToGroup[MemOpcodes[2]] = G;
70 }
71
72 #define FMA3RM(R132, R213, R231, M132, M213, M231) \
73 static const uint16_t Reg##R132[3] = {X86::R132, X86::R213, X86::R231}; \
74 static const uint16_t Mem##R132[3] = {X86::M132, X86::M213, X86::M231}; \
75 initRMGroup(Reg##R132, Mem##R132);
76
77 #define FMA3RMA(R132, R213, R231, M132, M213, M231, Attrs) \
78 static const uint16_t Reg##R132[3] = {X86::R132, X86::R213, X86::R231}; \
79 static const uint16_t Mem##R132[3] = {X86::M132, X86::M213, X86::M231}; \
80 initRMGroup(Reg##R132, Mem##R132, (Attrs));
81
82 #define FMA3R(R132, R213, R231) \
83 static const uint16_t Reg##R132[3] = {X86::R132, X86::R213, X86::R231}; \
84 initRGroup(Reg##R132);
85
86 #define FMA3RA(R132, R213, R231, Attrs) \
87 static const uint16_t Reg##R132[3] = {X86::R132, X86::R213, X86::R231}; \
88 initRGroup(Reg##R132, (Attrs));
89
90 #define FMA3M(M132, M213, M231) \
91 static const uint16_t Mem##M132[3] = {X86::M132, X86::M213, X86::M231}; \
92 initMGroup(Mem##M132);
93
94 #define FMA3MA(M132, M213, M231, Attrs) \
95 static const uint16_t Mem##M132[3] = {X86::M132, X86::M213, X86::M231}; \
96 initMGroup(Mem##M132, (Attrs));
97
98 #define FMA3_AVX2_VECTOR_GROUP(Name) \
99 FMA3RM(Name##132PSr, Name##213PSr, Name##231PSr, \
100 Name##132PSm, Name##213PSm, Name##231PSm); \
101 FMA3RM(Name##132PDr, Name##213PDr, Name##231PDr, \
102 Name##132PDm, Name##213PDm, Name##231PDm); \
103 FMA3RM(Name##132PSYr, Name##213PSYr, Name##231PSYr, \
104 Name##132PSYm, Name##213PSYm, Name##231PSYm); \
105 FMA3RM(Name##132PDYr, Name##213PDYr, Name##231PDYr, \
106 Name##132PDYm, Name##213PDYm, Name##231PDYm);
107
108 #define FMA3_AVX2_SCALAR_GROUP(Name) \
109 FMA3RM(Name##132SSr, Name##213SSr, Name##231SSr, \
110 Name##132SSm, Name##213SSm, Name##231SSm); \
111 FMA3RM(Name##132SDr, Name##213SDr, Name##231SDr, \
112 Name##132SDm, Name##213SDm, Name##231SDm); \
113 FMA3RMA(Name##132SSr_Int, Name##213SSr_Int, Name##231SSr_Int, \
114 Name##132SSm_Int, Name##213SSm_Int, Name##231SSm_Int, \
115 X86InstrFMA3Group::X86FMA3Intrinsic); \
116 FMA3RMA(Name##132SDr_Int, Name##213SDr_Int, Name##231SDr_Int, \
117 Name##132SDm_Int, Name##213SDm_Int, Name##231SDm_Int, \
118 X86InstrFMA3Group::X86FMA3Intrinsic);
119
120 #define FMA3_AVX2_FULL_GROUP(Name) \
121 FMA3_AVX2_VECTOR_GROUP(Name); \
122 FMA3_AVX2_SCALAR_GROUP(Name);
123
124 #define FMA3_AVX512_VECTOR_GROUP(Name) \
125 FMA3RM(Name##132PSZ128r, Name##213PSZ128r, Name##231PSZ128r, \
126 Name##132PSZ128m, Name##213PSZ128m, Name##231PSZ128m); \
127 FMA3RM(Name##132PDZ128r, Name##213PDZ128r, Name##231PDZ128r, \
128 Name##132PDZ128m, Name##213PDZ128m, Name##231PDZ128m); \
129 FMA3RM(Name##132PSZ256r, Name##213PSZ256r, Name##231PSZ256r, \
130 Name##132PSZ256m, Name##213PSZ256m, Name##231PSZ256m); \
131 FMA3RM(Name##132PDZ256r, Name##213PDZ256r, Name##231PDZ256r, \
132 Name##132PDZ256m, Name##213PDZ256m, Name##231PDZ256m); \
133 FMA3RM(Name##132PSZr, Name##213PSZr, Name##231PSZr, \
134 Name##132PSZm, Name##213PSZm, Name##231PSZm); \
135 FMA3RM(Name##132PDZr, Name##213PDZr, Name##231PDZr, \
136 Name##132PDZm, Name##213PDZm, Name##231PDZm); \
137 FMA3RMA(Name##132PSZ128rk, Name##213PSZ128rk, Name##231PSZ128rk, \
138 Name##132PSZ128mk, Name##213PSZ128mk, Name##231PSZ128mk, \
139 X86InstrFMA3Group::X86FMA3KMergeMasked); \
140 FMA3RMA(Name##132PDZ128rk, Name##213PDZ128rk, Name##231PDZ128rk, \
141 Name##132PDZ128mk, Name##213PDZ128mk, Name##231PDZ128mk, \
142 X86InstrFMA3Group::X86FMA3KMergeMasked); \
143 FMA3RMA(Name##132PSZ256rk, Name##213PSZ256rk, Name##231PSZ256rk, \
144 Name##132PSZ256mk, Name##213PSZ256mk, Name##231PSZ256mk, \
145 X86InstrFMA3Group::X86FMA3KMergeMasked); \
146 FMA3RMA(Name##132PDZ256rk, Name##213PDZ256rk, Name##231PDZ256rk, \
147 Name##132PDZ256mk, Name##213PDZ256mk, Name##231PDZ256mk, \
148 X86InstrFMA3Group::X86FMA3KMergeMasked); \
149 FMA3RMA(Name##132PSZrk, Name##213PSZrk, Name##231PSZrk, \
150 Name##132PSZmk, Name##213PSZmk, Name##231PSZmk, \
151 X86InstrFMA3Group::X86FMA3KMergeMasked); \
152 FMA3RMA(Name##132PDZrk, Name##213PDZrk, Name##231PDZrk, \
153 Name##132PDZmk, Name##213PDZmk, Name##231PDZmk, \
154 X86InstrFMA3Group::X86FMA3KMergeMasked); \
155 FMA3RMA(Name##132PSZ128rkz, Name##213PSZ128rkz, Name##231PSZ128rkz, \
156 Name##132PSZ128mkz, Name##213PSZ128mkz, Name##231PSZ128mkz, \
157 X86InstrFMA3Group::X86FMA3KZeroMasked); \
158 FMA3RMA(Name##132PDZ128rkz, Name##213PDZ128rkz, Name##231PDZ128rkz, \
159 Name##132PDZ128mkz, Name##213PDZ128mkz, Name##231PDZ128mkz, \
160 X86InstrFMA3Group::X86FMA3KZeroMasked); \
161 FMA3RMA(Name##132PSZ256rkz, Name##213PSZ256rkz, Name##231PSZ256rkz, \
162 Name##132PSZ256mkz, Name##213PSZ256mkz, Name##231PSZ256mkz, \
163 X86InstrFMA3Group::X86FMA3KZeroMasked); \
164 FMA3RMA(Name##132PDZ256rkz, Name##213PDZ256rkz, Name##231PDZ256rkz, \
165 Name##132PDZ256mkz, Name##213PDZ256mkz, Name##231PDZ256mkz, \
166 X86InstrFMA3Group::X86FMA3KZeroMasked); \
167 FMA3RMA(Name##132PSZrkz, Name##213PSZrkz, Name##231PSZrkz, \
168 Name##132PSZmkz, Name##213PSZmkz, Name##231PSZmkz, \
169 X86InstrFMA3Group::X86FMA3KZeroMasked); \
170 FMA3RMA(Name##132PDZrkz, Name##213PDZrkz, Name##231PDZrkz, \
171 Name##132PDZmkz, Name##213PDZmkz, Name##231PDZmkz, \
172 X86InstrFMA3Group::X86FMA3KZeroMasked); \
173 FMA3R(Name##132PSZrb, Name##213PSZrb, Name##231PSZrb); \
174 FMA3R(Name##132PDZrb, Name##213PDZrb, Name##231PDZrb); \
175 FMA3RA(Name##132PSZrbk, Name##213PSZrbk, Name##231PSZrbk, \
176 X86InstrFMA3Group::X86FMA3KMergeMasked); \
177 FMA3RA(Name##132PDZrbk, Name##213PDZrbk, Name##231PDZrbk, \
178 X86InstrFMA3Group::X86FMA3KMergeMasked); \
179 FMA3RA(Name##132PSZrbkz, Name##213PSZrbkz, Name##231PSZrbkz, \
180 X86InstrFMA3Group::X86FMA3KZeroMasked); \
181 FMA3RA(Name##132PDZrbkz, Name##213PDZrbkz, Name##231PDZrbkz, \
182 X86InstrFMA3Group::X86FMA3KZeroMasked); \
183 FMA3M(Name##132PSZ128mb, Name##213PSZ128mb, Name##231PSZ128mb); \
184 FMA3M(Name##132PDZ128mb, Name##213PDZ128mb, Name##231PDZ128mb); \
185 FMA3M(Name##132PSZ256mb, Name##213PSZ256mb, Name##231PSZ256mb); \
186 FMA3M(Name##132PDZ256mb, Name##213PDZ256mb, Name##231PDZ256mb); \
187 FMA3M(Name##132PSZmb, Name##213PSZmb, Name##231PSZmb); \
188 FMA3M(Name##132PDZmb, Name##213PDZmb, Name##231PDZmb); \
189 FMA3MA(Name##132PSZ128mbk, Name##213PSZ128mbk, Name##231PSZ128mbk, \
190 X86InstrFMA3Group::X86FMA3KMergeMasked); \
191 FMA3MA(Name##132PDZ128mbk, Name##213PDZ128mbk, Name##231PDZ128mbk, \
192 X86InstrFMA3Group::X86FMA3KMergeMasked); \
193 FMA3MA(Name##132PSZ256mbk, Name##213PSZ256mbk, Name##231PSZ256mbk, \
194 X86InstrFMA3Group::X86FMA3KMergeMasked); \
195 FMA3MA(Name##132PDZ256mbk, Name##213PDZ256mbk, Name##231PDZ256mbk, \
196 X86InstrFMA3Group::X86FMA3KMergeMasked); \
197 FMA3MA(Name##132PSZmbk, Name##213PSZmbk, Name##231PSZmbk, \
198 X86InstrFMA3Group::X86FMA3KMergeMasked); \
199 FMA3MA(Name##132PDZmbk, Name##213PDZmbk, Name##231PDZmbk, \
200 X86InstrFMA3Group::X86FMA3KMergeMasked); \
201 FMA3MA(Name##132PSZ128mbkz, Name##213PSZ128mbkz, Name##231PSZ128mbkz, \
202 X86InstrFMA3Group::X86FMA3KZeroMasked); \
203 FMA3MA(Name##132PDZ128mbkz, Name##213PDZ128mbkz, Name##231PDZ128mbkz, \
204 X86InstrFMA3Group::X86FMA3KZeroMasked); \
205 FMA3MA(Name##132PSZ256mbkz, Name##213PSZ256mbkz, Name##231PSZ256mbkz, \
206 X86InstrFMA3Group::X86FMA3KZeroMasked); \
207 FMA3MA(Name##132PDZ256mbkz, Name##213PDZ256mbkz, Name##231PDZ256mbkz, \
208 X86InstrFMA3Group::X86FMA3KZeroMasked); \
209 FMA3MA(Name##132PSZmbkz, Name##213PSZmbkz, Name##231PSZmbkz, \
210 X86InstrFMA3Group::X86FMA3KZeroMasked); \
211 FMA3MA(Name##132PDZmbkz, Name##213PDZmbkz, Name##231PDZmbkz, \
212 X86InstrFMA3Group::X86FMA3KZeroMasked);
213
214 #define FMA3_AVX512_SCALAR_GROUP(Name) \
215 FMA3RM(Name##132SSZr, Name##213SSZr, Name##231SSZr, \
216 Name##132SSZm, Name##213SSZm, Name##231SSZm); \
217 FMA3RM(Name##132SDZr, Name##213SDZr, Name##231SDZr, \
218 Name##132SDZm, Name##213SDZm, Name##231SDZm); \
219 FMA3RMA(Name##132SSZr_Int, Name##213SSZr_Int, Name##231SSZr_Int, \
220 Name##132SSZm_Int, Name##213SSZm_Int, Name##231SSZm_Int, \
221 X86InstrFMA3Group::X86FMA3Intrinsic); \
222 FMA3RMA(Name##132SDZr_Int, Name##213SDZr_Int, Name##231SDZr_Int, \
223 Name##132SDZm_Int, Name##213SDZm_Int, Name##231SDZm_Int, \
224 X86InstrFMA3Group::X86FMA3Intrinsic); \
225 FMA3RMA(Name##132SSZr_Intk, Name##213SSZr_Intk, Name##231SSZr_Intk, \
226 Name##132SSZm_Intk, Name##213SSZm_Intk, Name##231SSZm_Intk, \
227 X86InstrFMA3Group::X86FMA3Intrinsic | \
228 X86InstrFMA3Group::X86FMA3KMergeMasked); \
229 FMA3RMA(Name##132SDZr_Intk, Name##213SDZr_Intk, Name##231SDZr_Intk, \
230 Name##132SDZm_Intk, Name##213SDZm_Intk, Name##231SDZm_Intk, \
231 X86InstrFMA3Group::X86FMA3Intrinsic | \
232 X86InstrFMA3Group::X86FMA3KMergeMasked); \
233 FMA3RMA(Name##132SSZr_Intkz, Name##213SSZr_Intkz, Name##231SSZr_Intkz, \
234 Name##132SSZm_Intkz, Name##213SSZm_Intkz, Name##231SSZm_Intkz, \
235 X86InstrFMA3Group::X86FMA3Intrinsic | \
236 X86InstrFMA3Group::X86FMA3KZeroMasked); \
237 FMA3RMA(Name##132SDZr_Intkz, Name##213SDZr_Intkz, Name##231SDZr_Intkz, \
238 Name##132SDZm_Intkz, Name##213SDZm_Intkz, Name##231SDZm_Intkz, \
239 X86InstrFMA3Group::X86FMA3Intrinsic | \
240 X86InstrFMA3Group::X86FMA3KZeroMasked); \
241 FMA3RA(Name##132SSZrb_Int, Name##213SSZrb_Int, Name##231SSZrb_Int, \
242 X86InstrFMA3Group::X86FMA3Intrinsic); \
243 FMA3RA(Name##132SDZrb_Int, Name##213SDZrb_Int, Name##231SDZrb_Int, \
244 X86InstrFMA3Group::X86FMA3Intrinsic); \
245 FMA3RA(Name##132SSZrb_Intk, Name##213SSZrb_Intk, Name##231SSZrb_Intk, \
246 X86InstrFMA3Group::X86FMA3Intrinsic | \
247 X86InstrFMA3Group::X86FMA3KMergeMasked); \
248 FMA3RA(Name##132SDZrb_Intk, Name##213SDZrb_Intk, Name##231SDZrb_Intk, \
249 X86InstrFMA3Group::X86FMA3Intrinsic | \
250 X86InstrFMA3Group::X86FMA3KMergeMasked); \
251 FMA3RA(Name##132SSZrb_Intkz, Name##213SSZrb_Intkz, Name##231SSZrb_Intkz, \
252 X86InstrFMA3Group::X86FMA3Intrinsic | \
253 X86InstrFMA3Group::X86FMA3KZeroMasked); \
254 FMA3RA(Name##132SDZrb_Intkz, Name##213SDZrb_Intkz, Name##231SDZrb_Intkz, \
255 X86InstrFMA3Group::X86FMA3Intrinsic | \
256 X86InstrFMA3Group::X86FMA3KZeroMasked);
257
258 #define FMA3_AVX512_FULL_GROUP(Name) \
259 FMA3_AVX512_VECTOR_GROUP(Name); \
260 FMA3_AVX512_SCALAR_GROUP(Name);
261
/// Builds every group of FMA3 opcodes and fills the OpcodeToGroup map.
/// Must be reached only through initGroupsOnce(); calling it directly is not
/// thread safe.
void X86InstrFMA3Info::initGroupsOnceImpl() {
  // AVX2 (VEX-encoded) FMA operations: the FULL groups cover both the
  // packed-vector and the scalar forms.
  FMA3_AVX2_FULL_GROUP(VFMADD);
  FMA3_AVX2_FULL_GROUP(VFMSUB);
  FMA3_AVX2_FULL_GROUP(VFNMADD);
  FMA3_AVX2_FULL_GROUP(VFNMSUB);

  // FMADDSUB/FMSUBADD exist only in packed-vector form.
  FMA3_AVX2_VECTOR_GROUP(VFMADDSUB);
  FMA3_AVX2_VECTOR_GROUP(VFMSUBADD);

  // AVX512 (EVEX-encoded) counterparts of the groups above, including the
  // k-masked, zero-masked, broadcast and rounding-control variants.
  FMA3_AVX512_FULL_GROUP(VFMADD);
  FMA3_AVX512_FULL_GROUP(VFMSUB);
  FMA3_AVX512_FULL_GROUP(VFNMADD);
  FMA3_AVX512_FULL_GROUP(VFNMSUB);

  FMA3_AVX512_VECTOR_GROUP(VFMADDSUB);
  FMA3_AVX512_VECTOR_GROUP(VFMSUBADD);
}
279
/// Creates the groups of FMA opcodes and initializes the Opcode-to-Group map.
/// May be called many times from any thread: llvm::call_once() guarantees
/// that initGroupsOnceImpl() executes exactly once.
void X86InstrFMA3Info::initGroupsOnce() {
  llvm::call_once(InitGroupsOnceFlag,
                  []() { getX86InstrFMA3Info()->initGroupsOnceImpl(); });
}
0 //===-- X86InstrFMA3Info.h - X86 FMA3 Instruction Information -------------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the implementation of the classes providing information
10 // about existing X86 FMA3 opcodes, classifying and grouping them.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #ifndef LLVM_LIB_TARGET_X86_UTILS_X86INSTRFMA3INFO_H
15 #define LLVM_LIB_TARGET_X86_UTILS_X86INSTRFMA3INFO_H
16
17 #include "X86.h"
18 #include "llvm/ADT/DenseMap.h"
19 #include <cassert>
20 #include <set>
21
22 using namespace llvm;
23
24 /// This class is used to group {132, 213, 231} forms of FMA opcodes together.
25 /// Each of the groups has either 3 register opcodes, 3 memory opcodes,
26 /// or 6 register and memory opcodes. Also, each group has an attributes field
27 /// describing it.
class X86InstrFMA3Group {
private:
  /// Reference to an array holding 3 forms of register FMA opcodes.
  /// It may be set to nullptr if the group of FMA opcodes does not have
  /// any register form opcodes.
  const uint16_t *RegOpcodes;

  /// Reference to an array holding 3 forms of memory FMA opcodes.
  /// It may be set to nullptr if the group of FMA opcodes does not have
  /// any memory form opcodes.
  const uint16_t *MemOpcodes;

  /// This bitfield specifies the attributes associated with the created
  /// FMA groups of opcodes.
  unsigned Attributes;

  // Indices of the {132, 213, 231} forms inside the RegOpcodes and
  // MemOpcodes arrays; the two arrays are parallel.
  static const unsigned Form132 = 0;
  static const unsigned Form213 = 1;
  static const unsigned Form231 = 2;

public:
  /// This bit must be set in the 'Attributes' field of FMA group if such
  /// group of FMA opcodes consists of FMA intrinsic opcodes.
  static const unsigned X86FMA3Intrinsic = 0x1;

  /// This bit must be set in the 'Attributes' field of FMA group if such
  /// group of FMA opcodes consists of AVX512 opcodes accepting a k-mask and
  /// passing the elements from the 1st operand to the result of the operation
  /// when the corresponding bits in the k-mask are unset.
  static const unsigned X86FMA3KMergeMasked = 0x2;

  /// This bit must be set in the 'Attributes' field of FMA group if such
  /// group of FMA opcodes consists of AVX512 opcodes accepting a k-zeromask.
  static const unsigned X86FMA3KZeroMasked = 0x4;

  /// Constructor. Creates a new group of FMA opcodes with three register form
  /// FMA opcodes \p RegOpcodes and three memory form FMA opcodes \p MemOpcodes.
  /// The parameters \p RegOpcodes and \p MemOpcodes may be set to nullptr,
  /// which means that the created group of FMA opcodes does not have the
  /// corresponding (register or memory) opcodes.
  /// The parameter \p Attr specifies the attributes describing the created
  /// group.
  X86InstrFMA3Group(const uint16_t *RegOpcodes, const uint16_t *MemOpcodes,
                    unsigned Attr)
      : RegOpcodes(RegOpcodes), MemOpcodes(MemOpcodes), Attributes(Attr) {
    assert((RegOpcodes || MemOpcodes) &&
           "Cannot create a group not having any opcodes.");
  }

  /// Returns a memory form opcode that is the equivalent of the given register
  /// form opcode \p RegOpcode. 0 is returned if the group does not have
  /// either register or memory opcodes.
  unsigned getMemOpcode(unsigned RegOpcode) const {
    if (!RegOpcodes || !MemOpcodes)
      return 0;
    // Entry i of MemOpcodes is the memory form of entry i of RegOpcodes.
    for (unsigned Form = 0; Form < 3; Form++)
      if (RegOpcodes[Form] == RegOpcode)
        return MemOpcodes[Form];
    return 0;
  }

  /// Returns the 132 form of FMA register opcode.
  unsigned getReg132Opcode() const {
    assert(RegOpcodes && "The group does not have register opcodes.");
    return RegOpcodes[Form132];
  }

  /// Returns the 213 form of FMA register opcode.
  unsigned getReg213Opcode() const {
    assert(RegOpcodes && "The group does not have register opcodes.");
    return RegOpcodes[Form213];
  }

  /// Returns the 231 form of FMA register opcode.
  unsigned getReg231Opcode() const {
    assert(RegOpcodes && "The group does not have register opcodes.");
    return RegOpcodes[Form231];
  }

  /// Returns the 132 form of FMA memory opcode.
  unsigned getMem132Opcode() const {
    assert(MemOpcodes && "The group does not have memory opcodes.");
    return MemOpcodes[Form132];
  }

  /// Returns the 213 form of FMA memory opcode.
  unsigned getMem213Opcode() const {
    assert(MemOpcodes && "The group does not have memory opcodes.");
    return MemOpcodes[Form213];
  }

  /// Returns the 231 form of FMA memory opcode.
  unsigned getMem231Opcode() const {
    assert(MemOpcodes && "The group does not have memory opcodes.");
    return MemOpcodes[Form231];
  }

  /// Returns true iff the group of FMA opcodes holds intrinsic opcodes.
  bool isIntrinsic() const { return (Attributes & X86FMA3Intrinsic) != 0; }

  /// Returns true iff the group of FMA opcodes holds k-merge-masked opcodes.
  bool isKMergeMasked() const {
    return (Attributes & X86FMA3KMergeMasked) != 0;
  }

  /// Returns true iff the group of FMA opcodes holds k-zero-masked opcodes.
  bool isKZeroMasked() const { return (Attributes & X86FMA3KZeroMasked) != 0; }

  /// Returns true iff the group of FMA opcodes holds any of k-masked opcodes.
  bool isKMasked() const {
    return (Attributes & (X86FMA3KMergeMasked | X86FMA3KZeroMasked)) != 0;
  }

  /// Returns true iff the given \p Opcode is a register opcode from the
  /// groups of FMA opcodes.
  bool isRegOpcodeFromGroup(unsigned Opcode) const {
    if (!RegOpcodes)
      return false;
    for (unsigned Form = 0; Form < 3; Form++)
      if (Opcode == RegOpcodes[Form])
        return true;
    return false;
  }

  /// Returns true iff the given \p Opcode is a memory opcode from the
  /// groups of FMA opcodes.
  bool isMemOpcodeFromGroup(unsigned Opcode) const {
    if (!MemOpcodes)
      return false;
    for (unsigned Form = 0; Form < 3; Form++)
      if (Opcode == MemOpcodes[Form])
        return true;
    return false;
  }
};
163
164 /// This class provides information about all existing FMA3 opcodes
165 ///
166 class X86InstrFMA3Info {
167 private:
168 /// A map that is used to find the group of FMA opcodes using any FMA opcode
169 /// from the group.
170 DenseMap<unsigned, const X86InstrFMA3Group *> OpcodeToGroup;
171
172 /// Creates groups of FMA opcodes and initializes Opcode-to-Group map.
173 /// This method can be called many times, but the actual initialization is
174 /// called only once.
175 static void initGroupsOnce();
176
177 /// Creates groups of FMA opcodes and initializes Opcode-to-Group map.
178 /// This method must be called ONLY from initGroupsOnce(). Otherwise, such
179 /// call is not thread safe.
180 void initGroupsOnceImpl();
181
182 /// Creates one group of FMA opcodes having the register opcodes
183 /// \p RegOpcodes and memory opcodes \p MemOpcodes. The parameter \p Attr
184 /// specifies the attributes describing the created group.
185 void initRMGroup(const uint16_t *RegOpcodes,
186 const uint16_t *MemOpcodes, unsigned Attr = 0);
187
188 /// Creates one group of FMA opcodes having only the register opcodes
189 /// \p RegOpcodes. The parameter \p Attr specifies the attributes describing
190 /// the created group.
191 void initRGroup(const uint16_t *RegOpcodes, unsigned Attr = 0);
192
193 /// Creates one group of FMA opcodes having only the memory opcodes
194 /// \p MemOpcodes. The parameter \p Attr specifies the attributes describing
195 /// the created group.
196 void initMGroup(const uint16_t *MemOpcodes, unsigned Attr = 0);
197
198 public:
199 /// Returns the reference to an object of this class. It is assumed that
200 /// only one object may exist.
201 static X86InstrFMA3Info *getX86InstrFMA3Info();
202
203 /// Constructor. Just creates an object of the class.
204 X86InstrFMA3Info() {}
205
206 /// Destructor. Deallocates the memory used for FMA3 Groups.
207 ~X86InstrFMA3Info() {
208 std::set<const X86InstrFMA3Group *> DeletedGroups;
209 auto E = OpcodeToGroup.end();
210 for (auto I = OpcodeToGroup.begin(); I != E; I++) {
211 const X86InstrFMA3Group *G = I->second;
212 if (DeletedGroups.find(G) == DeletedGroups.end()) {
213 DeletedGroups.insert(G);
214 delete G;
215 }
216 }
217 }
218
219 /// Returns a reference to a group of FMA3 opcodes to where the given
220 /// \p Opcode is included. If the given \p Opcode is not recognized as FMA3
221 /// and not included into any FMA3 group, then nullptr is returned.
222 static const X86InstrFMA3Group *getFMA3Group(unsigned Opcode) {
223 // Ensure that the groups of opcodes are initialized.
224 initGroupsOnce();
225
226 // Find the group including the given opcode.
227 const X86InstrFMA3Info *FMA3Info = getX86InstrFMA3Info();
228 auto I = FMA3Info->OpcodeToGroup.find(Opcode);
229 if (I == FMA3Info->OpcodeToGroup.end())
230 return nullptr;
231
232 return I->second;
233 }
234
235 /// Returns true iff the given \p Opcode is recognized as FMA3 by this class.
236 static bool isFMA3(unsigned Opcode) {
237 return getFMA3Group(Opcode) != nullptr;
238 }
239
240 /// Iterator that is used to walk on FMA register opcodes having memory
241 /// form equivalents.
242 class rm_iterator {
243 private:
244 /// Iterator associated with the OpcodeToGroup map. It must always be
245 /// initialized with an entry from OpcodeToGroup for which I->first
246 /// points to a register FMA opcode and I->second points to a group of
247 /// FMA opcodes having memory form equivalent of I->first.
248 DenseMap::const_iterator I;
249
250 public:
251 /// Constructor. Creates rm_iterator. The parameter \p I must be an
252 /// iterator to OpcodeToGroup map entry having I->first pointing to
253 /// register form FMA opcode and I->second pointing to a group of FMA
254 /// opcodes holding memory form equivalent for I->fist.
255 rm_iterator(DenseMap::const_iterator I)
256 : I(I) {}
257
258 /// Returns the register form FMA opcode.
259 unsigned getRegOpcode() const { return I->first; };
260
261 /// Returns the memory form equivalent opcode for FMA register opcode
262 /// referenced by I->first.
263 unsigned getMemOpcode() const {
264 unsigned Opcode = I->first;
265 const X86InstrFMA3Group *Group = I->second;
266 return Group->getMemOpcode(Opcode);
267 }
268
269 /// Returns a reference to a group of FMA opcodes.
270 const X86InstrFMA3Group *getGroup() const { return I->second; }
271
272 bool operator==(const rm_iterator &OtherIt) const { return I == OtherIt.I; }
273 bool operator!=(const rm_iterator &OtherIt) const { return I != OtherIt.I; }
274
275 /// Increment. Advances the 'I' iterator to the next OpcodeToGroup entry
276 /// having I->first pointing to register form FMA and I->second pointing
277 /// to a group of FMA opcodes holding memory form equivalen for I->first.
278 rm_iterator &operator++() {
279 auto E = getX86InstrFMA3Info()->OpcodeToGroup.end();
280 for (++I; I != E; ++I) {
281 unsigned RegOpcode = I->first;
282 const X86InstrFMA3Group *Group = I->second;
283 if (Group->getMemOpcode(RegOpcode) != 0)
284 break;
285 }
286 return *this;
287 }
288 };
289
290 /// Returns rm_iterator pointing to the first entry of OpcodeToGroup map
291 /// with a register FMA opcode having memory form opcode equivalent.
292 static rm_iterator rm_begin() {
293 initGroupsOnce();
294 const X86InstrFMA3Info *FMA3Info = getX86InstrFMA3Info();
295 auto I = FMA3Info->OpcodeToGroup.begin();
296 auto E = FMA3Info->OpcodeToGroup.end();
297 while (I != E) {
298 unsigned Opcode = I->first;
299 const X86InstrFMA3Group *G = I->second;
300 if (G->getMemOpcode(Opcode) != 0)
301 break;
302 I++;
303 }
304 return rm_iterator(I);
305 }
306
307 /// Returns the last rm_iterator.
308 static rm_iterator rm_end() {
309 initGroupsOnce();
310 return rm_iterator(getX86InstrFMA3Info()->OpcodeToGroup.end());
311 }
312 };
313
314 #endif
18541854 }
18551855
18561856 static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
1857 // FMA foldable instructions
1858 { X86::VFMADD231SSr, X86::VFMADD231SSm, TB_ALIGN_NONE },
1859 { X86::VFMADD231SSr_Int, X86::VFMADD231SSm_Int, TB_ALIGN_NONE },
1860 { X86::VFMADD231SDr, X86::VFMADD231SDm, TB_ALIGN_NONE },
1861 { X86::VFMADD231SDr_Int, X86::VFMADD231SDm_Int, TB_ALIGN_NONE },
1862 { X86::VFMADD132SSr, X86::VFMADD132SSm, TB_ALIGN_NONE },
1863 { X86::VFMADD132SSr_Int, X86::VFMADD132SSm_Int, TB_ALIGN_NONE },
1864 { X86::VFMADD132SDr, X86::VFMADD132SDm, TB_ALIGN_NONE },
1865 { X86::VFMADD132SDr_Int, X86::VFMADD132SDm_Int, TB_ALIGN_NONE },
1866 { X86::VFMADD213SSr, X86::VFMADD213SSm, TB_ALIGN_NONE },
1867 { X86::VFMADD213SSr_Int, X86::VFMADD213SSm_Int, TB_ALIGN_NONE },
1868 { X86::VFMADD213SDr, X86::VFMADD213SDm, TB_ALIGN_NONE },
1869 { X86::VFMADD213SDr_Int, X86::VFMADD213SDm_Int, TB_ALIGN_NONE },
1870 { X86::VFMADD231SSZr, X86::VFMADD231SSZm, TB_ALIGN_NONE },
1871 { X86::VFMADD231SSZr_Int, X86::VFMADD231SSZm_Int, TB_ALIGN_NONE },
1872 { X86::VFMADD231SDZr, X86::VFMADD231SDZm, TB_ALIGN_NONE },
1873 { X86::VFMADD231SDZr_Int, X86::VFMADD231SDZm_Int, TB_ALIGN_NONE },
1874 { X86::VFMADD132SSZr, X86::VFMADD132SSZm, TB_ALIGN_NONE },
1875 { X86::VFMADD132SSZr_Int, X86::VFMADD132SSZm_Int, TB_ALIGN_NONE },
1876 { X86::VFMADD132SDZr, X86::VFMADD132SDZm, TB_ALIGN_NONE },
1877 { X86::VFMADD132SDZr_Int, X86::VFMADD132SDZm_Int, TB_ALIGN_NONE },
1878 { X86::VFMADD213SSZr, X86::VFMADD213SSZm, TB_ALIGN_NONE },
1879 { X86::VFMADD213SSZr_Int, X86::VFMADD213SSZm_Int, TB_ALIGN_NONE },
1880 { X86::VFMADD213SDZr, X86::VFMADD213SDZm, TB_ALIGN_NONE },
1881 { X86::VFMADD213SDZr_Int, X86::VFMADD213SDZm_Int, TB_ALIGN_NONE },
1882
1883 { X86::VFMADD231PSr, X86::VFMADD231PSm, TB_ALIGN_NONE },
1884 { X86::VFMADD231PDr, X86::VFMADD231PDm, TB_ALIGN_NONE },
1885 { X86::VFMADD132PSr, X86::VFMADD132PSm, TB_ALIGN_NONE },
1886 { X86::VFMADD132PDr, X86::VFMADD132PDm, TB_ALIGN_NONE },
1887 { X86::VFMADD213PSr, X86::VFMADD213PSm, TB_ALIGN_NONE },
1888 { X86::VFMADD213PDr, X86::VFMADD213PDm, TB_ALIGN_NONE },
1889 { X86::VFMADD231PSYr, X86::VFMADD231PSYm, TB_ALIGN_NONE },
1890 { X86::VFMADD231PDYr, X86::VFMADD231PDYm, TB_ALIGN_NONE },
1891 { X86::VFMADD132PSYr, X86::VFMADD132PSYm, TB_ALIGN_NONE },
1892 { X86::VFMADD132PDYr, X86::VFMADD132PDYm, TB_ALIGN_NONE },
1893 { X86::VFMADD213PSYr, X86::VFMADD213PSYm, TB_ALIGN_NONE },
1894 { X86::VFMADD213PDYr, X86::VFMADD213PDYm, TB_ALIGN_NONE },
1895 { X86::VFMADD231PSZr, X86::VFMADD231PSZm, TB_ALIGN_NONE },
1896 { X86::VFMADD231PDZr, X86::VFMADD231PDZm, TB_ALIGN_NONE },
1897 { X86::VFMADD132PSZr, X86::VFMADD132PSZm, TB_ALIGN_NONE },
1898 { X86::VFMADD132PDZr, X86::VFMADD132PDZm, TB_ALIGN_NONE },
1899 { X86::VFMADD213PSZr, X86::VFMADD213PSZm, TB_ALIGN_NONE },
1900 { X86::VFMADD213PDZr, X86::VFMADD213PDZm, TB_ALIGN_NONE },
1901 { X86::VFMADD231PSZ128r, X86::VFMADD231PSZ128m, TB_ALIGN_NONE },
1902 { X86::VFMADD231PDZ128r, X86::VFMADD231PDZ128m, TB_ALIGN_NONE },
1903 { X86::VFMADD132PSZ128r, X86::VFMADD132PSZ128m, TB_ALIGN_NONE },
1904 { X86::VFMADD132PDZ128r, X86::VFMADD132PDZ128m, TB_ALIGN_NONE },
1905 { X86::VFMADD213PSZ128r, X86::VFMADD213PSZ128m, TB_ALIGN_NONE },
1906 { X86::VFMADD213PDZ128r, X86::VFMADD213PDZ128m, TB_ALIGN_NONE },
1907 { X86::VFMADD231PSZ256r, X86::VFMADD231PSZ256m, TB_ALIGN_NONE },
1908 { X86::VFMADD231PDZ256r, X86::VFMADD231PDZ256m, TB_ALIGN_NONE },
1909 { X86::VFMADD132PSZ256r, X86::VFMADD132PSZ256m, TB_ALIGN_NONE },
1910 { X86::VFMADD132PDZ256r, X86::VFMADD132PDZ256m, TB_ALIGN_NONE },
1911 { X86::VFMADD213PSZ256r, X86::VFMADD213PSZ256m, TB_ALIGN_NONE },
1912 { X86::VFMADD213PDZ256r, X86::VFMADD213PDZ256m, TB_ALIGN_NONE },
1913
1914 { X86::VFNMADD231SSr, X86::VFNMADD231SSm, TB_ALIGN_NONE },
1915 { X86::VFNMADD231SSr_Int, X86::VFNMADD231SSm_Int, TB_ALIGN_NONE },
1916 { X86::VFNMADD231SDr, X86::VFNMADD231SDm, TB_ALIGN_NONE },
1917 { X86::VFNMADD231SDr_Int, X86::VFNMADD231SDm_Int, TB_ALIGN_NONE },
1918 { X86::VFNMADD132SSr, X86::VFNMADD132SSm, TB_ALIGN_NONE },
1919 { X86::VFNMADD132SSr_Int, X86::VFNMADD132SSm_Int, TB_ALIGN_NONE },
1920 { X86::VFNMADD132SDr, X86::VFNMADD132SDm, TB_ALIGN_NONE },
1921 { X86::VFNMADD132SDr_Int, X86::VFNMADD132SDm_Int, TB_ALIGN_NONE },
1922 { X86::VFNMADD213SSr, X86::VFNMADD213SSm, TB_ALIGN_NONE },
1923 { X86::VFNMADD213SSr_Int, X86::VFNMADD213SSm_Int, TB_ALIGN_NONE },
1924 { X86::VFNMADD213SDr, X86::VFNMADD213SDm, TB_ALIGN_NONE },
1925 { X86::VFNMADD213SDr_Int, X86::VFNMADD213SDm_Int, TB_ALIGN_NONE },
1926 { X86::VFNMADD231SSZr, X86::VFNMADD231SSZm, TB_ALIGN_NONE },
1927 { X86::VFNMADD231SSZr_Int, X86::VFNMADD231SSZm_Int, TB_ALIGN_NONE },
1928 { X86::VFNMADD231SDZr, X86::VFNMADD231SDZm, TB_ALIGN_NONE },
1929 { X86::VFNMADD231SDZr_Int, X86::VFNMADD231SDZm_Int, TB_ALIGN_NONE },
1930 { X86::VFNMADD132SSZr, X86::VFNMADD132SSZm, TB_ALIGN_NONE },
1931 { X86::VFNMADD132SSZr_Int, X86::VFNMADD132SSZm_Int, TB_ALIGN_NONE },
1932 { X86::VFNMADD132SDZr, X86::VFNMADD132SDZm, TB_ALIGN_NONE },
1933 { X86::VFNMADD132SDZr_Int, X86::VFNMADD132SDZm_Int, TB_ALIGN_NONE },
1934 { X86::VFNMADD213SSZr, X86::VFNMADD213SSZm, TB_ALIGN_NONE },
1935 { X86::VFNMADD213SSZr_Int, X86::VFNMADD213SSZm_Int, TB_ALIGN_NONE },
1936 { X86::VFNMADD213SDZr, X86::VFNMADD213SDZm, TB_ALIGN_NONE },
1937 { X86::VFNMADD213SDZr_Int, X86::VFNMADD213SDZm_Int, TB_ALIGN_NONE },
1938
1939 { X86::VFNMADD231PSr, X86::VFNMADD231PSm, TB_ALIGN_NONE },
1940 { X86::VFNMADD231PDr, X86::VFNMADD231PDm, TB_ALIGN_NONE },
1941 { X86::VFNMADD132PSr, X86::VFNMADD132PSm, TB_ALIGN_NONE },
1942 { X86::VFNMADD132PDr, X86::VFNMADD132PDm, TB_ALIGN_NONE },
1943 { X86::VFNMADD213PSr, X86::VFNMADD213PSm, TB_ALIGN_NONE },
1944 { X86::VFNMADD213PDr, X86::VFNMADD213PDm, TB_ALIGN_NONE },
1945 { X86::VFNMADD231PSYr, X86::VFNMADD231PSYm, TB_ALIGN_NONE },
1946 { X86::VFNMADD231PDYr, X86::VFNMADD231PDYm, TB_ALIGN_NONE },
1947 { X86::VFNMADD132PSYr, X86::VFNMADD132PSYm, TB_ALIGN_NONE },
1948 { X86::VFNMADD132PDYr, X86::VFNMADD132PDYm, TB_ALIGN_NONE },
1949 { X86::VFNMADD213PSYr, X86::VFNMADD213PSYm, TB_ALIGN_NONE },
1950 { X86::VFNMADD213PDYr, X86::VFNMADD213PDYm, TB_ALIGN_NONE },
1951 { X86::VFNMADD231PSZr, X86::VFNMADD231PSZm, TB_ALIGN_NONE },
1952 { X86::VFNMADD231PDZr, X86::VFNMADD231PDZm, TB_ALIGN_NONE },
1953 { X86::VFNMADD132PSZr, X86::VFNMADD132PSZm, TB_ALIGN_NONE },
1954 { X86::VFNMADD132PDZr, X86::VFNMADD132PDZm, TB_ALIGN_NONE },
1955 { X86::VFNMADD213PSZr, X86::VFNMADD213PSZm, TB_ALIGN_NONE },
1956 { X86::VFNMADD213PDZr, X86::VFNMADD213PDZm, TB_ALIGN_NONE },
1957 { X86::VFNMADD231PSZ128r, X86::VFNMADD231PSZ128m, TB_ALIGN_NONE },
1958 { X86::VFNMADD231PDZ128r, X86::VFNMADD231PDZ128m, TB_ALIGN_NONE },
1959 { X86::VFNMADD132PSZ128r, X86::VFNMADD132PSZ128m, TB_ALIGN_NONE },
1960 { X86::VFNMADD132PDZ128r, X86::VFNMADD132PDZ128m, TB_ALIGN_NONE },
1961 { X86::VFNMADD213PSZ128r, X86::VFNMADD213PSZ128m, TB_ALIGN_NONE },
1962 { X86::VFNMADD213PDZ128r, X86::VFNMADD213PDZ128m, TB_ALIGN_NONE },
1963 { X86::VFNMADD231PSZ256r, X86::VFNMADD231PSZ256m, TB_ALIGN_NONE },
1964 { X86::VFNMADD231PDZ256r, X86::VFNMADD231PDZ256m, TB_ALIGN_NONE },
1965 { X86::VFNMADD132PSZ256r, X86::VFNMADD132PSZ256m, TB_ALIGN_NONE },
1966 { X86::VFNMADD132PDZ256r, X86::VFNMADD132PDZ256m, TB_ALIGN_NONE },
1967 { X86::VFNMADD213PSZ256r, X86::VFNMADD213PSZ256m, TB_ALIGN_NONE },
1968 { X86::VFNMADD213PDZ256r, X86::VFNMADD213PDZ256m, TB_ALIGN_NONE },
1969
1970 { X86::VFMSUB231SSr, X86::VFMSUB231SSm, TB_ALIGN_NONE },
1971 { X86::VFMSUB231SSr_Int, X86::VFMSUB231SSm_Int, TB_ALIGN_NONE },
1972 { X86::VFMSUB231SDr, X86::VFMSUB231SDm, TB_ALIGN_NONE },
1973 { X86::VFMSUB231SDr_Int, X86::VFMSUB231SDm_Int, TB_ALIGN_NONE },
1974 { X86::VFMSUB132SSr, X86::VFMSUB132SSm, TB_ALIGN_NONE },
1975 { X86::VFMSUB132SSr_Int, X86::VFMSUB132SSm_Int, TB_ALIGN_NONE },
1976 { X86::VFMSUB132SDr, X86::VFMSUB132SDm, TB_ALIGN_NONE },
1977 { X86::VFMSUB132SDr_Int, X86::VFMSUB132SDm_Int, TB_ALIGN_NONE },
1978 { X86::VFMSUB213SSr, X86::VFMSUB213SSm, TB_ALIGN_NONE },
1979 { X86::VFMSUB213SSr_Int, X86::VFMSUB213SSm_Int, TB_ALIGN_NONE },
1980 { X86::VFMSUB213SDr, X86::VFMSUB213SDm, TB_ALIGN_NONE },
1981 { X86::VFMSUB213SDr_Int, X86::VFMSUB213SDm_Int, TB_ALIGN_NONE },
1982 { X86::VFMSUB231SSZr, X86::VFMSUB231SSZm, TB_ALIGN_NONE },
1983 { X86::VFMSUB231SSZr_Int, X86::VFMSUB231SSZm_Int, TB_ALIGN_NONE },
1984 { X86::VFMSUB231SDZr, X86::VFMSUB231SDZm, TB_ALIGN_NONE },
1985 { X86::VFMSUB231SDZr_Int, X86::VFMSUB231SDZm_Int, TB_ALIGN_NONE },
1986 { X86::VFMSUB132SSZr, X86::VFMSUB132SSZm, TB_ALIGN_NONE },
1987 { X86::VFMSUB132SSZr_Int, X86::VFMSUB132SSZm_Int, TB_ALIGN_NONE },
1988 { X86::VFMSUB132SDZr, X86::VFMSUB132SDZm, TB_ALIGN_NONE },
1989 { X86::VFMSUB132SDZr_Int, X86::VFMSUB132SDZm_Int, TB_ALIGN_NONE },
1990 { X86::VFMSUB213SSZr, X86::VFMSUB213SSZm, TB_ALIGN_NONE },
1991 { X86::VFMSUB213SSZr_Int, X86::VFMSUB213SSZm_Int, TB_ALIGN_NONE },
1992 { X86::VFMSUB213SDZr, X86::VFMSUB213SDZm, TB_ALIGN_NONE },
1993 { X86::VFMSUB213SDZr_Int, X86::VFMSUB213SDZm_Int, TB_ALIGN_NONE },
1994
1995 { X86::VFMSUB231PSr, X86::VFMSUB231PSm, TB_ALIGN_NONE },
1996 { X86::VFMSUB231PDr, X86::VFMSUB231PDm, TB_ALIGN_NONE },
1997 { X86::VFMSUB132PSr, X86::VFMSUB132PSm, TB_ALIGN_NONE },
1998 { X86::VFMSUB132PDr, X86::VFMSUB132PDm, TB_ALIGN_NONE },
1999 { X86::VFMSUB213PSr, X86::VFMSUB213PSm, TB_ALIGN_NONE },
2000 { X86::VFMSUB213PDr, X86::VFMSUB213PDm, TB_ALIGN_NONE },
2001 { X86::VFMSUB231PSYr, X86::VFMSUB231PSYm, TB_ALIGN_NONE },
2002 { X86::VFMSUB231PDYr, X86::VFMSUB231PDYm, TB_ALIGN_NONE },
2003 { X86::VFMSUB132PSYr, X86::VFMSUB132PSYm, TB_ALIGN_NONE },
2004 { X86::VFMSUB132PDYr, X86::VFMSUB132PDYm, TB_ALIGN_NONE },
2005 { X86::VFMSUB213PSYr, X86::VFMSUB213PSYm, TB_ALIGN_NONE },
2006 { X86::VFMSUB213PDYr, X86::VFMSUB213PDYm, TB_ALIGN_NONE },
2007 { X86::VFMSUB231PSZr, X86::VFMSUB231PSZm, TB_ALIGN_NONE },
2008 { X86::VFMSUB231PDZr, X86::VFMSUB231PDZm, TB_ALIGN_NONE },
2009 { X86::VFMSUB132PSZr, X86::VFMSUB132PSZm, TB_ALIGN_NONE },
2010 { X86::VFMSUB132PDZr, X86::VFMSUB132PDZm, TB_ALIGN_NONE },
2011 { X86::VFMSUB213PSZr, X86::VFMSUB213PSZm, TB_ALIGN_NONE },
2012 { X86::VFMSUB213PDZr, X86::VFMSUB213PDZm, TB_ALIGN_NONE },
2013 { X86::VFMSUB231PSZ128r, X86::VFMSUB231PSZ128m, TB_ALIGN_NONE },
2014 { X86::VFMSUB231PDZ128r, X86::VFMSUB231PDZ128m, TB_ALIGN_NONE },
2015 { X86::VFMSUB132PSZ128r, X86::VFMSUB132PSZ128m, TB_ALIGN_NONE },
2016 { X86::VFMSUB132PDZ128r, X86::VFMSUB132PDZ128m, TB_ALIGN_NONE },
2017 { X86::VFMSUB213PSZ128r, X86::VFMSUB213PSZ128m, TB_ALIGN_NONE },
2018 { X86::VFMSUB213PDZ128r, X86::VFMSUB213PDZ128m, TB_ALIGN_NONE },
2019 { X86::VFMSUB231PSZ256r, X86::VFMSUB231PSZ256m, TB_ALIGN_NONE },
2020 { X86::VFMSUB231PDZ256r, X86::VFMSUB231PDZ256m, TB_ALIGN_NONE },
2021 { X86::VFMSUB132PSZ256r, X86::VFMSUB132PSZ256m, TB_ALIGN_NONE },
2022 { X86::VFMSUB132PDZ256r, X86::VFMSUB132PDZ256m, TB_ALIGN_NONE },
2023 { X86::VFMSUB213PSZ256r, X86::VFMSUB213PSZ256m, TB_ALIGN_NONE },
2024 { X86::VFMSUB213PDZ256r, X86::VFMSUB213PDZ256m, TB_ALIGN_NONE },
2025
2026 { X86::VFNMSUB231SSr, X86::VFNMSUB231SSm, TB_ALIGN_NONE },
2027 { X86::VFNMSUB231SSr_Int, X86::VFNMSUB231SSm_Int, TB_ALIGN_NONE },
2028 { X86::VFNMSUB231SDr, X86::VFNMSUB231SDm, TB_ALIGN_NONE },
2029 { X86::VFNMSUB231SDr_Int, X86::VFNMSUB231SDm_Int, TB_ALIGN_NONE },
2030 { X86::VFNMSUB132SSr, X86::VFNMSUB132SSm, TB_ALIGN_NONE },
2031 { X86::VFNMSUB132SSr_Int, X86::VFNMSUB132SSm_Int, TB_ALIGN_NONE },
2032 { X86::VFNMSUB132SDr, X86::VFNMSUB132SDm, TB_ALIGN_NONE },
2033 { X86::VFNMSUB132SDr_Int, X86::VFNMSUB132SDm_Int, TB_ALIGN_NONE },
2034 { X86::VFNMSUB213SSr, X86::VFNMSUB213SSm, TB_ALIGN_NONE },
2035 { X86::VFNMSUB213SSr_Int, X86::VFNMSUB213SSm_Int, TB_ALIGN_NONE },
2036 { X86::VFNMSUB213SDr, X86::VFNMSUB213SDm, TB_ALIGN_NONE },
2037 { X86::VFNMSUB213SDr_Int, X86::VFNMSUB213SDm_Int, TB_ALIGN_NONE },
2038
2039 { X86::VFNMSUB231PSr, X86::VFNMSUB231PSm, TB_ALIGN_NONE },
2040 { X86::VFNMSUB231PDr, X86::VFNMSUB231PDm, TB_ALIGN_NONE },
2041 { X86::VFNMSUB132PSr, X86::VFNMSUB132PSm, TB_ALIGN_NONE },
2042 { X86::VFNMSUB132PDr, X86::VFNMSUB132PDm, TB_ALIGN_NONE },
2043 { X86::VFNMSUB213PSr, X86::VFNMSUB213PSm, TB_ALIGN_NONE },
2044 { X86::VFNMSUB213PDr, X86::VFNMSUB213PDm, TB_ALIGN_NONE },
2045 { X86::VFNMSUB231PSYr, X86::VFNMSUB231PSYm, TB_ALIGN_NONE },
2046 { X86::VFNMSUB231PDYr, X86::VFNMSUB231PDYm, TB_ALIGN_NONE },
2047 { X86::VFNMSUB132PSYr, X86::VFNMSUB132PSYm, TB_ALIGN_NONE },
2048 { X86::VFNMSUB132PDYr, X86::VFNMSUB132PDYm, TB_ALIGN_NONE },
2049 { X86::VFNMSUB213PSYr, X86::VFNMSUB213PSYm, TB_ALIGN_NONE },
2050 { X86::VFNMSUB213PDYr, X86::VFNMSUB213PDYm, TB_ALIGN_NONE },
2051 { X86::VFNMSUB231PSZr, X86::VFNMSUB231PSZm, TB_ALIGN_NONE },
2052 { X86::VFNMSUB231PDZr, X86::VFNMSUB231PDZm, TB_ALIGN_NONE },
2053 { X86::VFNMSUB132PSZr, X86::VFNMSUB132PSZm, TB_ALIGN_NONE },
2054 { X86::VFNMSUB132PDZr, X86::VFNMSUB132PDZm, TB_ALIGN_NONE },
2055 { X86::VFNMSUB213PSZr, X86::VFNMSUB213PSZm, TB_ALIGN_NONE },
2056 { X86::VFNMSUB213PDZr, X86::VFNMSUB213PDZm, TB_ALIGN_NONE },
2057 { X86::VFNMSUB231PSZ128r, X86::VFNMSUB231PSZ128m, TB_ALIGN_NONE },
2058 { X86::VFNMSUB231PDZ128r, X86::VFNMSUB231PDZ128m, TB_ALIGN_NONE },
2059 { X86::VFNMSUB132PSZ128r, X86::VFNMSUB132PSZ128m, TB_ALIGN_NONE },
2060 { X86::VFNMSUB132PDZ128r, X86::VFNMSUB132PDZ128m, TB_ALIGN_NONE },
2061 { X86::VFNMSUB213PSZ128r, X86::VFNMSUB213PSZ128m, TB_ALIGN_NONE },
2062 { X86::VFNMSUB213PDZ128r, X86::VFNMSUB213PDZ128m, TB_ALIGN_NONE },
2063 { X86::VFNMSUB231PSZ256r, X86::VFNMSUB231PSZ256m, TB_ALIGN_NONE },
2064 { X86::VFNMSUB231PDZ256r, X86::VFNMSUB231PDZ256m, TB_ALIGN_NONE },
2065 { X86::VFNMSUB132PSZ256r, X86::VFNMSUB132PSZ256m, TB_ALIGN_NONE },
2066 { X86::VFNMSUB132PDZ256r, X86::VFNMSUB132PDZ256m, TB_ALIGN_NONE },
2067 { X86::VFNMSUB213PSZ256r, X86::VFNMSUB213PSZ256m, TB_ALIGN_NONE },
2068 { X86::VFNMSUB213PDZ256r, X86::VFNMSUB213PDZ256m, TB_ALIGN_NONE },
2069
2070 { X86::VFMADDSUB231PSr, X86::VFMADDSUB231PSm, TB_ALIGN_NONE },
2071 { X86::VFMADDSUB231PDr, X86::VFMADDSUB231PDm, TB_ALIGN_NONE },
2072 { X86::VFMADDSUB132PSr, X86::VFMADDSUB132PSm, TB_ALIGN_NONE },
2073 { X86::VFMADDSUB132PDr, X86::VFMADDSUB132PDm, TB_ALIGN_NONE },
2074 { X86::VFMADDSUB213PSr, X86::VFMADDSUB213PSm, TB_ALIGN_NONE },
2075 { X86::VFMADDSUB213PDr, X86::VFMADDSUB213PDm, TB_ALIGN_NONE },
2076 { X86::VFMADDSUB231PSYr, X86::VFMADDSUB231PSYm, TB_ALIGN_NONE },
2077 { X86::VFMADDSUB231PDYr, X86::VFMADDSUB231PDYm, TB_ALIGN_NONE },
2078 { X86::VFMADDSUB132PSYr, X86::VFMADDSUB132PSYm, TB_ALIGN_NONE },
2079 { X86::VFMADDSUB132PDYr, X86::VFMADDSUB132PDYm, TB_ALIGN_NONE },
2080 { X86::VFMADDSUB213PSYr, X86::VFMADDSUB213PSYm, TB_ALIGN_NONE },
2081 { X86::VFMADDSUB213PDYr, X86::VFMADDSUB213PDYm, TB_ALIGN_NONE },
2082 { X86::VFMADDSUB231PSZr, X86::VFMADDSUB231PSZm, TB_ALIGN_NONE },
2083 { X86::VFMADDSUB231PDZr, X86::VFMADDSUB231PDZm, TB_ALIGN_NONE },
2084 { X86::VFMADDSUB132PSZr, X86::VFMADDSUB132PSZm, TB_ALIGN_NONE },
2085 { X86::VFMADDSUB132PDZr, X86::VFMADDSUB132PDZm, TB_ALIGN_NONE },
2086 { X86::VFMADDSUB213PSZr, X86::VFMADDSUB213PSZm, TB_ALIGN_NONE },
2087 { X86::VFMADDSUB213PDZr, X86::VFMADDSUB213PDZm, TB_ALIGN_NONE },
2088 { X86::VFMADDSUB231PSZ128r, X86::VFMADDSUB231PSZ128m, TB_ALIGN_NONE },
2089 { X86::VFMADDSUB231PDZ128r, X86::VFMADDSUB231PDZ128m, TB_ALIGN_NONE },
2090 { X86::VFMADDSUB132PSZ128r, X86::VFMADDSUB132PSZ128m, TB_ALIGN_NONE },
2091 { X86::VFMADDSUB132PDZ128r, X86::VFMADDSUB132PDZ128m, TB_ALIGN_NONE },
2092 { X86::VFMADDSUB213PSZ128r, X86::VFMADDSUB213PSZ128m, TB_ALIGN_NONE },
2093 { X86::VFMADDSUB213PDZ128r, X86::VFMADDSUB213PDZ128m, TB_ALIGN_NONE },
2094 { X86::VFMADDSUB231PSZ256r, X86::VFMADDSUB231PSZ256m, TB_ALIGN_NONE },
2095 { X86::VFMADDSUB231PDZ256r, X86::VFMADDSUB231PDZ256m, TB_ALIGN_NONE },
2096 { X86::VFMADDSUB132PSZ256r, X86::VFMADDSUB132PSZ256m, TB_ALIGN_NONE },
2097 { X86::VFMADDSUB132PDZ256r, X86::VFMADDSUB132PDZ256m, TB_ALIGN_NONE },
2098 { X86::VFMADDSUB213PSZ256r, X86::VFMADDSUB213PSZ256m, TB_ALIGN_NONE },
2099 { X86::VFMADDSUB213PDZ256r, X86::VFMADDSUB213PDZ256m, TB_ALIGN_NONE },
2100
2101 { X86::VFMSUBADD231PSr, X86::VFMSUBADD231PSm, TB_ALIGN_NONE },
2102 { X86::VFMSUBADD231PDr, X86::VFMSUBADD231PDm, TB_ALIGN_NONE },
2103 { X86::VFMSUBADD132PSr, X86::VFMSUBADD132PSm, TB_ALIGN_NONE },
2104 { X86::VFMSUBADD132PDr, X86::VFMSUBADD132PDm, TB_ALIGN_NONE },
2105 { X86::VFMSUBADD213PSr, X86::VFMSUBADD213PSm, TB_ALIGN_NONE },
2106 { X86::VFMSUBADD213PDr, X86::VFMSUBADD213PDm, TB_ALIGN_NONE },
2107 { X86::VFMSUBADD231PSYr, X86::VFMSUBADD231PSYm, TB_ALIGN_NONE },
2108 { X86::VFMSUBADD231PDYr, X86::VFMSUBADD231PDYm, TB_ALIGN_NONE },
2109 { X86::VFMSUBADD132PSYr, X86::VFMSUBADD132PSYm, TB_ALIGN_NONE },
2110 { X86::VFMSUBADD132PDYr, X86::VFMSUBADD132PDYm, TB_ALIGN_NONE },
2111 { X86::VFMSUBADD213PSYr, X86::VFMSUBADD213PSYm, TB_ALIGN_NONE },
2112 { X86::VFMSUBADD213PDYr, X86::VFMSUBADD213PDYm, TB_ALIGN_NONE },
2113 { X86::VFMSUBADD231PSZr, X86::VFMSUBADD231PSZm, TB_ALIGN_NONE },
2114 { X86::VFMSUBADD231PDZr, X86::VFMSUBADD231PDZm, TB_ALIGN_NONE },
2115 { X86::VFMSUBADD132PSZr, X86::VFMSUBADD132PSZm, TB_ALIGN_NONE },
2116 { X86::VFMSUBADD132PDZr, X86::VFMSUBADD132PDZm, TB_ALIGN_NONE },
2117 { X86::VFMSUBADD213PSZr, X86::VFMSUBADD213PSZm, TB_ALIGN_NONE },
2118 { X86::VFMSUBADD213PDZr, X86::VFMSUBADD213PDZm, TB_ALIGN_NONE },
2119 { X86::VFMSUBADD231PSZ128r, X86::VFMSUBADD231PSZ128m, TB_ALIGN_NONE },
2120 { X86::VFMSUBADD231PDZ128r, X86::VFMSUBADD231PDZ128m, TB_ALIGN_NONE },
2121 { X86::VFMSUBADD132PSZ128r, X86::VFMSUBADD132PSZ128m, TB_ALIGN_NONE },
2122 { X86::VFMSUBADD132PDZ128r, X86::VFMSUBADD132PDZ128m, TB_ALIGN_NONE },
2123 { X86::VFMSUBADD213PSZ128r, X86::VFMSUBADD213PSZ128m, TB_ALIGN_NONE },
2124 { X86::VFMSUBADD213PDZ128r, X86::VFMSUBADD213PDZ128m, TB_ALIGN_NONE },
2125 { X86::VFMSUBADD231PSZ256r, X86::VFMSUBADD231PSZ256m, TB_ALIGN_NONE },
2126 { X86::VFMSUBADD231PDZ256r, X86::VFMSUBADD231PDZ256m, TB_ALIGN_NONE },
2127 { X86::VFMSUBADD132PSZ256r, X86::VFMSUBADD132PSZ256m, TB_ALIGN_NONE },
2128 { X86::VFMSUBADD132PDZ256r, X86::VFMSUBADD132PDZ256m, TB_ALIGN_NONE },
2129 { X86::VFMSUBADD213PSZ256r, X86::VFMSUBADD213PSZ256m, TB_ALIGN_NONE },
2130 { X86::VFMSUBADD213PDZ256r, X86::VFMSUBADD213PDZ256m, TB_ALIGN_NONE },
2131
21321857 // FMA4 foldable patterns
21331858 { X86::VFMADDSS4rr, X86::VFMADDSS4rm, TB_ALIGN_NONE },
21341859 { X86::VFMADDSD4rr, X86::VFMADDSD4rm, TB_ALIGN_NONE },
22331958 // Index 3, folded load
22341959 Entry.Flags | TB_INDEX_3 | TB_FOLDED_LOAD);
22351960 }
1961 auto I = X86InstrFMA3Info::rm_begin();
1962 auto E = X86InstrFMA3Info::rm_end();
1963 for (; I != E; ++I)
1964 if (!I.getGroup()->isKMasked())
1965 AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
1966 I.getRegOpcode(), I.getMemOpcode(),
1967 TB_ALIGN_NONE | TB_INDEX_3 | TB_FOLDED_LOAD);
22361968
22371969 static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
22381970 // AVX-512 foldable instructions
22822014 // Index 4, folded load
22832015 Entry.Flags | TB_INDEX_4 | TB_FOLDED_LOAD);
22842016 }
2017 for (I = X86InstrFMA3Info::rm_begin(); I != E; ++I)
2018 if (I.getGroup()->isKMasked())
2019 AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable,
2020 I.getRegOpcode(), I.getMemOpcode(),
2021 TB_ALIGN_NONE | TB_INDEX_4 | TB_FOLDED_LOAD);
22852022 }
22862023
22872024 void
33443081 return NewMI;
33453082 }
33463083
3347 /// Returns true if the given instruction opcode is FMA3.
3348 /// Otherwise, returns false.
3349 /// The second parameter is optional and is used as the second return from
3350 /// the function. It is set to true if the given instruction has FMA3 opcode
3351 /// that is used for lowering of scalar FMA intrinsics, and it is set to false
3352 /// otherwise.
3353 static bool isFMA3(unsigned Opcode, bool &IsIntrinsic) {
3354 IsIntrinsic = false;
3355
3356 #define FMA3_CASE(Name, Modifier) \
3357 case X86::Name##r##Modifier: case X86::Name##m##Modifier:
3358
3359 #define FMA3_SCALAR_PAIR(Name, Size, Modifier) \
3360 FMA3_CASE(Name##SD##Size, Modifier) \
3361 FMA3_CASE(Name##SS##Size, Modifier)
3362
3363 #define FMA3_PACKED_PAIR(Name, Size) \
3364 FMA3_CASE(Name##PD##Size, ) \
3365 FMA3_CASE(Name##PS##Size, )
3366
3367 #define FMA3_PACKED_SET(Form, Size) \
3368 FMA3_PACKED_PAIR(VFMADD##Form, Size) \
3369 FMA3_PACKED_PAIR(VFMSUB##Form, Size) \
3370 FMA3_PACKED_PAIR(VFNMADD##Form, Size) \
3371 FMA3_PACKED_PAIR(VFNMSUB##Form, Size) \
3372 FMA3_PACKED_PAIR(VFMADDSUB##Form, Size) \
3373 FMA3_PACKED_PAIR(VFMSUBADD##Form, Size)
3374
3375 #define FMA3_CASES(Form) \
3376 FMA3_SCALAR_PAIR(VFMADD##Form, ,) \
3377 FMA3_SCALAR_PAIR(VFMSUB##Form, ,) \
3378 FMA3_SCALAR_PAIR(VFNMADD##Form, ,) \
3379 FMA3_SCALAR_PAIR(VFNMSUB##Form, ,) \
3380 FMA3_PACKED_SET(Form, ) \
3381 FMA3_PACKED_SET(Form, Y) \
3382
3383 #define FMA3_CASES_AVX512(Form) \
3384 FMA3_SCALAR_PAIR(VFMADD##Form, Z, ) \
3385 FMA3_SCALAR_PAIR(VFMSUB##Form, Z, ) \
3386 FMA3_SCALAR_PAIR(VFNMADD##Form, Z, ) \
3387 FMA3_SCALAR_PAIR(VFNMSUB##Form, Z, ) \
3388 FMA3_PACKED_SET(Form, Z128) \
3389 FMA3_PACKED_SET(Form, Z256) \
3390 FMA3_PACKED_SET(Form, Z)
3391
3392 #define FMA3_CASES_SCALAR_INT(Form) \
3393 FMA3_SCALAR_PAIR(VFMADD##Form, , _Int) \
3394 FMA3_SCALAR_PAIR(VFMSUB##Form, , _Int) \
3395 FMA3_SCALAR_PAIR(VFNMADD##Form, , _Int) \
3396 FMA3_SCALAR_PAIR(VFNMSUB##Form, , _Int)
3397
3398 #define FMA3_CASES_SCALAR_INT_AVX512(Form) \
3399 FMA3_SCALAR_PAIR(VFMADD##Form, Z, _Int) \
3400 FMA3_SCALAR_PAIR(VFMSUB##Form, Z, _Int) \
3401 FMA3_SCALAR_PAIR(VFNMADD##Form, Z, _Int) \
3402 FMA3_SCALAR_PAIR(VFNMSUB##Form, Z, _Int)
3403
3404 switch (Opcode) {
3405 FMA3_CASES(132)
3406 FMA3_CASES(213)
3407 FMA3_CASES(231)
3408
3409 // AVX-512 instructions
3410 FMA3_CASES_AVX512(132)
3411 FMA3_CASES_AVX512(213)
3412 FMA3_CASES_AVX512(231)
3413 return true;
3414
3415 FMA3_CASES_SCALAR_INT(132)
3416 FMA3_CASES_SCALAR_INT(213)
3417 FMA3_CASES_SCALAR_INT(231)
3418
3419 // AVX-512 instructions
3420 FMA3_CASES_SCALAR_INT_AVX512(132)
3421 FMA3_CASES_SCALAR_INT_AVX512(213)
3422 FMA3_CASES_SCALAR_INT_AVX512(231)
3423 IsIntrinsic = true;
3424 return true;
3425 default:
3426 return false;
3427 }
3428 llvm_unreachable("Opcode not handled by the switch");
3429
3430 #undef FMA3_CASE
3431 #undef FMA3_SCALAR_PAIR
3432 #undef FMA3_PACKED_PAIR
3433 #undef FMA3_PACKED_SET
3434 #undef FMA3_CASES
3435 #undef FMA3_CASES_AVX512
3436 #undef FMA3_CASES_SCALAR_INT
3437 #undef FMA3_CASES_SCALAR_INT_AVX512
3438 }
3439
3440 /// Returns an adjusted FMA opcode that must be used in FMA instruction that
3441 /// performs the same computations as the given MI but which has the operands
3442 /// \p SrcOpIdx1 and \p SrcOpIdx2 commuted.
3443 /// It may return 0 if it is unsafe to commute the operands.
3444 ///
3445 /// The returned FMA opcode may differ from the opcode in the given \p MI.
3446 /// For example, commuting the operands #1 and #3 in the following FMA
3447 /// FMA213 #1, #2, #3
3448 /// results into instruction with adjusted opcode:
3449 /// FMA231 #3, #2, #1
3450 static unsigned getFMA3OpcodeToCommuteOperands(unsigned Opc,
3451 bool IsIntrinOpcode,
3452 unsigned SrcOpIdx1,
3453 unsigned SrcOpIdx2) {
3454 #define FMA3_ENTRY(Name, Suffix) \
3455 { X86::Name##132##Suffix, X86::Name##213##Suffix, X86::Name##231##Suffix },
3456
3457 #define FMA3_SCALAR_PAIR(Name, Suffix) \
3458 FMA3_ENTRY(Name, SS##Suffix) \
3459 FMA3_ENTRY(Name, SD##Suffix)
3460
3461 #define FMA3_PACKED_PAIR(Name, Suffix) \
3462 FMA3_ENTRY(Name, PS##Suffix) \
3463 FMA3_ENTRY(Name, PD##Suffix)
3464
3465 #define FMA3_PACKED_SIZES(Name, Suffix) \
3466 FMA3_PACKED_PAIR(Name, Suffix) \
3467 FMA3_PACKED_PAIR(Name, Y##Suffix)
3468
3469 #define FMA3_TABLE_ALL(Name) \
3470 FMA3_SCALAR_PAIR(Name, r) \
3471 FMA3_PACKED_SIZES(Name, r) \
3472 FMA3_SCALAR_PAIR(Name, m) \
3473 FMA3_PACKED_SIZES(Name, m)
3474
3475 #define FMA3_TABLE_PACKED(Name) \
3476 FMA3_PACKED_SIZES(Name, r) \
3477 FMA3_PACKED_SIZES(Name, m)
3478
3479 #define FMA3_TABLE_SCALAR_INT(Name) \
3480 FMA3_SCALAR_PAIR(Name, r_Int) \
3481 FMA3_SCALAR_PAIR(Name, m_Int)
3482
3483 #define FMA3_PACKED_SIZES_AVX512(Name, Suffix) \
3484 FMA3_PACKED_PAIR(Name, Z128##Suffix) \
3485 FMA3_PACKED_PAIR(Name, Z256##Suffix) \
3486 FMA3_PACKED_PAIR(Name, Z##Suffix)
3487
3488 #define FMA3_TABLE_ALL_AVX512(Name) \
3489 FMA3_SCALAR_PAIR(Name, Zr) \
3490 FMA3_PACKED_SIZES_AVX512(Name, r) \
3491 FMA3_SCALAR_PAIR(Name, Zm) \
3492 FMA3_PACKED_SIZES_AVX512(Name, m)
3493
3494 #define FMA3_TABLE_PACKED_AVX512(Name) \
3495 FMA3_PACKED_SIZES_AVX512(Name, r) \
3496 FMA3_PACKED_SIZES_AVX512(Name, m)
3497
3498 #define FMA3_TABLE_SCALAR_INT_AVX512(Name) \
3499 FMA3_SCALAR_PAIR(Name, Zr_Int) \
3500 FMA3_SCALAR_PAIR(Name, Zm_Int)
3501
3502 // Define the array that holds FMA opcodes in groups
3503 // of 3 opcodes(132, 213, 231) in each group.
3504 static const uint16_t RegularOpcodeGroups[][3] = {
3505 FMA3_TABLE_ALL(VFMADD)
3506 FMA3_TABLE_ALL(VFMSUB)
3507 FMA3_TABLE_ALL(VFNMADD)
3508 FMA3_TABLE_ALL(VFNMSUB)
3509 FMA3_TABLE_PACKED(VFMADDSUB)
3510 FMA3_TABLE_PACKED(VFMSUBADD)
3511
3512 // AVX-512 instructions
3513 FMA3_TABLE_ALL_AVX512(VFMADD)
3514 FMA3_TABLE_ALL_AVX512(VFMSUB)
3515 FMA3_TABLE_ALL_AVX512(VFNMADD)
3516 FMA3_TABLE_ALL_AVX512(VFNMSUB)
3517 FMA3_TABLE_PACKED_AVX512(VFMADDSUB)
3518 FMA3_TABLE_PACKED_AVX512(VFMSUBADD)
3519 };
3520
3521 // Define the array that holds FMA*_Int opcodes in groups
3522 // of 3 opcodes(132, 213, 231) in each group.
3523 static const uint16_t IntrinOpcodeGroups[][3] = {
3524 FMA3_TABLE_SCALAR_INT(VFMADD)
3525 FMA3_TABLE_SCALAR_INT(VFMSUB)
3526 FMA3_TABLE_SCALAR_INT(VFNMADD)
3527 FMA3_TABLE_SCALAR_INT(VFNMSUB)
3528
3529 // AVX-512 instructions
3530 FMA3_TABLE_SCALAR_INT_AVX512(VFMADD)
3531 FMA3_TABLE_SCALAR_INT_AVX512(VFMSUB)
3532 FMA3_TABLE_SCALAR_INT_AVX512(VFNMADD)
3533 FMA3_TABLE_SCALAR_INT_AVX512(VFNMSUB)
3534 };
3535
3536 #undef FMA3_ENTRY
3537 #undef FMA3_SCALAR_PAIR
3538 #undef FMA3_PACKED_PAIR
3539 #undef FMA3_PACKED_SIZES
3540 #undef FMA3_TABLE_ALL
3541 #undef FMA3_TABLE_PACKED
3542 #undef FMA3_TABLE_SCALAR_INT
3543 #undef FMA3_SCALAR_PAIR_AVX512
3544 #undef FMA3_PACKED_SIZES_AVX512
3545 #undef FMA3_TABLE_ALL_AVX512
3546 #undef FMA3_TABLE_PACKED_AVX512
3547 #undef FMA3_TABLE_SCALAR_INT_AVX512
3548
3549 const unsigned Form132Index = 0;
3550 const unsigned Form213Index = 1;
3551 const unsigned Form231Index = 2;
3552 const unsigned FormsNum = 3;
3553
3554 size_t GroupsNum;
3555 const uint16_t (*OpcodeGroups)[3];
3556 if (IsIntrinOpcode) {
3557 GroupsNum = array_lengthof(IntrinOpcodeGroups);
3558 OpcodeGroups = IntrinOpcodeGroups;
3559 } else {
3560 GroupsNum = array_lengthof(RegularOpcodeGroups);
3561 OpcodeGroups = RegularOpcodeGroups;
3562 }
3563
3564 const uint16_t *FoundOpcodesGroup = nullptr;
3565 size_t FormIndex;
3566
3567 // Look for the input opcode in the corresponding opcodes table.
3568 for (size_t GroupIndex = 0; GroupIndex < GroupsNum && !FoundOpcodesGroup;
3569 ++GroupIndex) {
3570 for (FormIndex = 0; FormIndex < FormsNum; ++FormIndex) {
3571 if (OpcodeGroups[GroupIndex][FormIndex] == Opc) {
3572 FoundOpcodesGroup = OpcodeGroups[GroupIndex];
3573 break;
3574 }
3575 }
3576 }
3577
3578 // The input opcode does not match with any of the opcodes from the tables.
3579 // The unsupported FMA opcode must be added to one of the two opcode groups
3580 // defined above.
3581 assert(FoundOpcodesGroup != nullptr && "Unexpected FMA3 opcode");
3084 unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
3085 const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2,
3086 const X86InstrFMA3Group &FMA3Group) const {
3087
3088 unsigned Opc = MI.getOpcode();
35823089
35833090 // Put the lowest index to SrcOpIdx1 to simplify the checks below.
35843091 if (SrcOpIdx1 > SrcOpIdx2)
35903097 // not implemented yet. So, just return 0 in that case.
35913098 // When such analysis are available this place will be the right place for
35923099 // calling it.
3593 if (IsIntrinOpcode && SrcOpIdx1 == 1)
3100 if (FMA3Group.isIntrinsic() && SrcOpIdx1 == 1)
35943101 return 0;
35953102
3103 unsigned FMAOp1 = 1, FMAOp2 = 2, FMAOp3 = 3;
3104 if (FMA3Group.isKMasked()) {
3105 // The k-mask operand cannot be commuted.
3106 if (SrcOpIdx1 == 2)
3107 return 0;
3108
3109 // For k-zero-masked operations it is Ok to commute the first vector
3110 // operand.
3111 // For regular k-masked operations a conservative choice is done as the
3112 // elements of the first vector operand, for which the corresponding bit
3113 // in the k-mask operand is set to 0, are copied to the result of FMA.
3114 // TODO/FIXME: The commute still may be legal if it is known that the
3115 // k-mask operand is set to either all ones or all zeroes.
3116 // It is also Ok to commute the 1st operand if all users of MI use only
3117 // the elements enabled by the k-mask operand. For example,
3118 // v4 = VFMADD213PSZrk v1, k, v2, v3; // v1[i] = k[i] ? v2[i]*v1[i]+v3[i]
3119 // : v1[i];
3120 // VMOVAPSZmrk , k, v4; // this is the ONLY user of v4 ->
3121 // // Ok, to commute v1 in FMADD213PSZrk.
3122 if (FMA3Group.isKMergeMasked() && SrcOpIdx1 == FMAOp1)
3123 return 0;
3124 FMAOp2++;
3125 FMAOp3++;
3126 }
3127
35963128 unsigned Case;
3597 if (SrcOpIdx1 == 1 && SrcOpIdx2 == 2)
3129 if (SrcOpIdx1 == FMAOp1 && SrcOpIdx2 == FMAOp2)
35983130 Case = 0;
3599 else if (SrcOpIdx1 == 1 && SrcOpIdx2 == 3)
3131 else if (SrcOpIdx1 == FMAOp1 && SrcOpIdx2 == FMAOp3)
36003132 Case = 1;
3601 else if (SrcOpIdx1 == 2 && SrcOpIdx2 == 3)
3133 else if (SrcOpIdx1 == FMAOp2 && SrcOpIdx2 == FMAOp3)
36023134 Case = 2;
36033135 else
36043136 return 0;
36063138 // Define the FMA forms mapping array that helps to map input FMA form
36073139 // to output FMA form to preserve the operation semantics after
36083140 // commuting the operands.
3141 const unsigned Form132Index = 0;
3142 const unsigned Form213Index = 1;
3143 const unsigned Form231Index = 2;
36093144 static const unsigned FormMapping[][3] = {
36103145 // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2;
36113146 // FMA132 A, C, b; ==> FMA231 C, A, b;
36243159 { Form213Index, Form132Index, Form231Index }
36253160 };
36263161
3162 unsigned FMAForms[3];
3163 if (FMA3Group.isRegOpcodeFromGroup(Opc)) {
3164 FMAForms[0] = FMA3Group.getReg132Opcode();
3165 FMAForms[1] = FMA3Group.getReg213Opcode();
3166 FMAForms[2] = FMA3Group.getReg231Opcode();
3167 } else {
3168 FMAForms[0] = FMA3Group.getMem132Opcode();
3169 FMAForms[1] = FMA3Group.getMem213Opcode();
3170 FMAForms[2] = FMA3Group.getMem231Opcode();
3171 }
3172 unsigned FormIndex;
3173 for (FormIndex = 0; FormIndex < 3; FormIndex++)
3174 if (Opc == FMAForms[FormIndex])
3175 break;
3176
36273177 // Everything is ready, just adjust the FMA opcode and return it.
36283178 FormIndex = FormMapping[Case][FormIndex];
3629 return FoundOpcodesGroup[FormIndex];
3179 return FMAForms[FormIndex];
36303180 }
36313181
36323182 MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
38513401 OpIdx1, OpIdx2);
38523402 }
38533403 default:
3854 bool IsIntrinOpcode;
3855 if (isFMA3(MI.getOpcode(), IsIntrinOpcode)) {
3856 unsigned Opc = getFMA3OpcodeToCommuteOperands(MI.getOpcode(),
3857 IsIntrinOpcode,
3858 OpIdx1, OpIdx2);
3404 const X86InstrFMA3Group *FMA3Group =
3405 X86InstrFMA3Info::getFMA3Group(MI.getOpcode());
3406 if (FMA3Group) {
3407 unsigned Opc =
3408 getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group);
38593409 if (Opc == 0)
38603410 return nullptr;
38613411 auto &WorkingMI = cloneIfNew(MI);
38683418 }
38693419 }
38703420
3871 bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr &MI,
3872 bool IsIntrinOpcode,
3873 unsigned &SrcOpIdx1,
3874 unsigned &SrcOpIdx2) const {
3875
3876 unsigned RegOpsNum = isMem(MI, 3) ? 2 : 3;
3421 bool X86InstrInfo::findFMA3CommutedOpIndices(
3422 const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2,
3423 const X86InstrFMA3Group &FMA3Group) const {
3424 unsigned FirstCommutableVecOp = 1;
3425 unsigned LastCommutableVecOp = 3;
3426 unsigned KMaskOp = 0;
3427 if (FMA3Group.isKMasked()) {
3428 // The k-mask operand has index = 2 for masked and zero-masked operations.
3429 KMaskOp = 2;
3430
3431 // The operand with index = 1 is used as a source for those elements for
3432 // which the corresponding bit in the k-mask is set to 0.
3433 if (FMA3Group.isKMergeMasked())
3434 FirstCommutableVecOp = 3;
3435
3436 LastCommutableVecOp++;
3437 }
3438
3439 if (isMem(MI, LastCommutableVecOp))
3440 LastCommutableVecOp--;
38773441
38783442 // Only the first RegOpsNum operands are commutable.
38793443 // Also, the value 'CommuteAnyOperandIndex' is valid here as it means
38803444 // that the operand is not specified/fixed.
38813445 if (SrcOpIdx1 != CommuteAnyOperandIndex &&
3882 (SrcOpIdx1 < 1 || SrcOpIdx1 > RegOpsNum))
3446 (SrcOpIdx1 < FirstCommutableVecOp || SrcOpIdx1 > LastCommutableVecOp ||
3447 SrcOpIdx1 == KMaskOp))
38833448 return false;
38843449 if (SrcOpIdx2 != CommuteAnyOperandIndex &&
3885 (SrcOpIdx2 < 1 || SrcOpIdx2 > RegOpsNum))
3450 (SrcOpIdx2 < FirstCommutableVecOp || SrcOpIdx2 > LastCommutableVecOp ||
3451 SrcOpIdx2 == KMaskOp))
38863452 return false;
38873453
38883454 // Look for two different register operands assumed to be commutable
38973463 if (SrcOpIdx1 == SrcOpIdx2)
38983464 // Both of operands are not fixed. By default set one of commutable
38993465 // operands to the last register operand of the instruction.
3900 CommutableOpIdx2 = RegOpsNum;
3466 CommutableOpIdx2 = LastCommutableVecOp;
39013467 else if (SrcOpIdx2 == CommuteAnyOperandIndex)
39023468 // Only one of operands is not fixed.
39033469 CommutableOpIdx2 = SrcOpIdx1;
39053471 // CommutableOpIdx2 is well defined now. Let's choose another commutable
39063472 // operand and assign its index to CommutableOpIdx1.
39073473 unsigned Op2Reg = MI.getOperand(CommutableOpIdx2).getReg();
3908 for (CommutableOpIdx1 = RegOpsNum; CommutableOpIdx1 > 0; CommutableOpIdx1--) {
3474 for (CommutableOpIdx1 = LastCommutableVecOp;
3475 CommutableOpIdx1 >= FirstCommutableVecOp; CommutableOpIdx1--) {
3476 // Just ignore and skip the k-mask operand.
3477 if (CommutableOpIdx1 == KMaskOp)
3478 continue;
3479
39093480 // The commuted operands must have different registers.
39103481 // Otherwise, the commute transformation does not change anything and
39113482 // is useless then.
39143485 }
39153486
39163487 // No appropriate commutable operands were found.
3917 if (CommutableOpIdx1 == 0)
3488 if (CommutableOpIdx1 < FirstCommutableVecOp)
39183489 return false;
39193490
39203491 // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpidx2
39263497
39273498 // Check if we can adjust the opcode to preserve the semantics when
39283499 // commute the register operands.
3929 return getFMA3OpcodeToCommuteOperands(MI.getOpcode(), IsIntrinOpcode,
3930 SrcOpIdx1, SrcOpIdx2) != 0;
3500 return getFMA3OpcodeToCommuteOperands(MI, SrcOpIdx1, SrcOpIdx2, FMA3Group) != 0;
39313501 }
39323502
39333503 bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
39543524 return false;
39553525 }
39563526 default:
3957 bool IsIntrinOpcode;
3958 if (isFMA3(MI.getOpcode(), IsIntrinOpcode))
3959 return findFMA3CommutedOpIndices(MI, IsIntrinOpcode,
3960 SrcOpIdx1, SrcOpIdx2);
3527 const X86InstrFMA3Group *FMA3Group =
3528 X86InstrFMA3Info::getFMA3Group(MI.getOpcode());
3529 if (FMA3Group)
3530 return findFMA3CommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2, *FMA3Group);
39613531 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
39623532 }
39633533 return false;
1414 #define LLVM_LIB_TARGET_X86_X86INSTRINFO_H
1515
1616 #include "MCTargetDesc/X86BaseInfo.h"
17 #include "X86InstrFMA3Info.h"
1718 #include "X86RegisterInfo.h"
1819 #include "llvm/ADT/DenseMap.h"
1920 #include "llvm/Target/TargetInstrInfo.h"
264265 unsigned &SrcOpIdx2) const override;
265266
266267 /// Returns true if the routine could find two commutable operands
267 /// in the given FMA instruction. Otherwise, returns false.
268 /// in the given FMA instruction \p MI. Otherwise, returns false.
268269 ///
269270 /// \p SrcOpIdx1 and \p SrcOpIdx2 are INPUT and OUTPUT arguments.
270271 /// The output indices of the commuted operands are returned in these
273274 /// value 'CommuteAnyOperandIndex' which means that the corresponding
274275 /// operand index is not set and this method is free to pick any of
275276 /// available commutable operands.
277 /// The parameter \p FMA3Group keeps the reference to the group of relative
278 /// FMA3 opcodes including register/memory forms of 132/213/231 opcodes.
276279 ///
277280 /// For example, calling this method this way:
278281 /// unsigned Idx1 = 1, Idx2 = CommuteAnyOperandIndex;
279 /// findFMA3CommutedOpIndices(MI, Idx1, Idx2);
282 /// findFMA3CommutedOpIndices(MI, Idx1, Idx2, FMA3Group);
280283 /// can be interpreted as a query asking if the operand #1 can be swapped
281284 /// with any other available operand (e.g. operand #2, operand #3, etc.).
282285 ///
285288 /// FMA213 #1, #2, #3
286289 /// results into instruction with adjusted opcode:
287290 /// FMA231 #3, #2, #1
288 bool findFMA3CommutedOpIndices(MachineInstr &MI, bool IsIntrinOpcode,
291 bool findFMA3CommutedOpIndices(const MachineInstr &MI,
289292 unsigned &SrcOpIdx1,
290 unsigned &SrcOpIdx2) const;
293 unsigned &SrcOpIdx2,
294 const X86InstrFMA3Group &FMA3Group) const;
295
296 /// Returns an adjusted FMA opcode that must be used in FMA instruction that
297 /// performs the same computations as the given \p MI but which has the
298 /// operands \p SrcOpIdx1 and \p SrcOpIdx2 commuted.
299 /// It may return 0 if it is unsafe to commute the operands.
300 /// Note that a machine instruction (instead of its opcode) is passed as the
301 /// first parameter to make it possible to analyze the instruction's uses and
302 /// commute the first operand of FMA even when it seems unsafe when you look
303 /// at the opcode. For example, it is Ok to commute the first operand of
304 /// VFMADD*SD_Int, if ONLY the lowest 64-bit element of the result is used.
305 ///
306 /// The returned FMA opcode may differ from the opcode in the given \p MI.
307 /// For example, commuting the operands #1 and #3 in the following FMA
308 /// FMA213 #1, #2, #3
309 /// results into instruction with adjusted opcode:
310 /// FMA231 #3, #2, #1
311 unsigned getFMA3OpcodeToCommuteOperands(const MachineInstr &MI,
312 unsigned SrcOpIdx1,
313 unsigned SrcOpIdx2,
314 const X86InstrFMA3Group &FMA3Group) const;
291315
292316 // Branch analysis.
293317 bool isUnpredicatedTerminator(const MachineInstr &MI) const override;
309309 define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rne(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
310310 ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rne:
311311 ; CHECK: ## BB#0:
312 ; CHECK-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm0, %zmm1
313 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
312 ; CHECK-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
314313 ; CHECK-NEXT: retq
315314 %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 0) nounwind
316315 ret <16 x float> %res
319318 define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtn(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
320319 ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtn:
321320 ; CHECK: ## BB#0:
322 ; CHECK-NEXT: vfmadd213ps {rd-sae}, %zmm2, %zmm0, %zmm1
323 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
321 ; CHECK-NEXT: vfmadd213ps {rd-sae}, %zmm2, %zmm1, %zmm0
324322 ; CHECK-NEXT: retq
325323 %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 1) nounwind
326324 ret <16 x float> %res
329327 define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtp(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
330328 ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtp:
331329 ; CHECK: ## BB#0:
332 ; CHECK-NEXT: vfmadd213ps {ru-sae}, %zmm2, %zmm0, %zmm1
333 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
330 ; CHECK-NEXT: vfmadd213ps {ru-sae}, %zmm2, %zmm1, %zmm0
334331 ; CHECK-NEXT: retq
335332 %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 2) nounwind
336333 ret <16 x float> %res
339336 define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
340337 ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtz:
341338 ; CHECK: ## BB#0:
342 ; CHECK-NEXT: vfmadd213ps {rz-sae}, %zmm2, %zmm0, %zmm1
343 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
339 ; CHECK-NEXT: vfmadd213ps {rz-sae}, %zmm2, %zmm1, %zmm0
344340 ; CHECK-NEXT: retq
345341 %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 3) nounwind
346342 ret <16 x float> %res
442438 define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
443439 ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rne:
444440 ; CHECK: ## BB#0:
445 ; CHECK-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm0, %zmm1
446 ; CHECK-NEXT: vmovapd %zmm1, %zmm0
441 ; CHECK-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
447442 ; CHECK-NEXT: retq
448443 %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 0) nounwind
449444 ret <8 x double> %res
452447 define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
453448 ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtn:
454449 ; CHECK: ## BB#0:
455 ; CHECK-NEXT: vfmadd213pd {rd-sae}, %zmm2, %zmm0, %zmm1
456 ; CHECK-NEXT: vmovapd %zmm1, %zmm0
450 ; CHECK-NEXT: vfmadd213pd {rd-sae}, %zmm2, %zmm1, %zmm0
457451 ; CHECK-NEXT: retq
458452 %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 1) nounwind
459453 ret <8 x double> %res
462456 define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
463457 ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtp:
464458 ; CHECK: ## BB#0:
465 ; CHECK-NEXT: vfmadd213pd {ru-sae}, %zmm2, %zmm0, %zmm1
466 ; CHECK-NEXT: vmovapd %zmm1, %zmm0
459 ; CHECK-NEXT: vfmadd213pd {ru-sae}, %zmm2, %zmm1, %zmm0
467460 ; CHECK-NEXT: retq
468461 %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 2) nounwind
469462 ret <8 x double> %res
472465 define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
473466 ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtz:
474467 ; CHECK: ## BB#0:
475 ; CHECK-NEXT: vfmadd213pd {rz-sae}, %zmm2, %zmm0, %zmm1
476 ; CHECK-NEXT: vmovapd %zmm1, %zmm0
468 ; CHECK-NEXT: vfmadd213pd {rz-sae}, %zmm2, %zmm1, %zmm0
477469 ; CHECK-NEXT: retq
478470 %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 3) nounwind
479471 ret <8 x double> %res
640632 define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
641633 ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rne:
642634 ; CHECK: ## BB#0:
643 ; CHECK-NEXT: vfnmsub213pd {rn-sae}, %zmm2, %zmm0, %zmm1
644 ; CHECK-NEXT: vmovapd %zmm1, %zmm0
635 ; CHECK-NEXT: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
645636 ; CHECK-NEXT: retq
646637 %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 0) nounwind
647638 ret <8 x double> %res
650641 define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
651642 ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtn:
652643 ; CHECK: ## BB#0:
653 ; CHECK-NEXT: vfnmsub213pd {rd-sae}, %zmm2, %zmm0, %zmm1
654 ; CHECK-NEXT: vmovapd %zmm1, %zmm0
644 ; CHECK-NEXT: vfnmsub213pd {rd-sae}, %zmm2, %zmm1, %zmm0
655645 ; CHECK-NEXT: retq
656646 %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 1) nounwind
657647 ret <8 x double> %res
660650 define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
661651 ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtp:
662652 ; CHECK: ## BB#0:
663 ; CHECK-NEXT: vfnmsub213pd {ru-sae}, %zmm2, %zmm0, %zmm1
664 ; CHECK-NEXT: vmovapd %zmm1, %zmm0
653 ; CHECK-NEXT: vfnmsub213pd {ru-sae}, %zmm2, %zmm1, %zmm0
665654 ; CHECK-NEXT: retq
666655 %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 2) nounwind
667656 ret <8 x double> %res
670659 define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
671660 ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtz:
672661 ; CHECK: ## BB#0:
673 ; CHECK-NEXT: vfnmsub213pd {rz-sae}, %zmm2, %zmm0, %zmm1
674 ; CHECK-NEXT: vmovapd %zmm1, %zmm0
662 ; CHECK-NEXT: vfnmsub213pd {rz-sae}, %zmm2, %zmm1, %zmm0
675663 ; CHECK-NEXT: retq
676664 %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 3) nounwind
677665 ret <8 x double> %res
100100 define <16 x float> @test231_br(<16 x float> %a1, <16 x float> %a2) nounwind {
101101 ; ALL-LABEL: test231_br:
102102 ; ALL: ## BB#0:
103 ; ALL-NEXT: vfmadd231ps {{.*}}(%rip){1to16}, %zmm0, %zmm1
104 ; ALL-NEXT: vmovaps %zmm1, %zmm0
103 ; ALL-NEXT: vfmadd132ps {{.*}}(%rip){1to16}, %zmm1, %zmm0
105104 ; ALL-NEXT: retq
106105 %b1 = fmul <16 x float> %a1,
107106 %b2 = fadd <16 x float> %b1, %a2
111110 define <16 x float> @test213_br(<16 x float> %a1, <16 x float> %a2) nounwind {
112111 ; ALL-LABEL: test213_br:
113112 ; ALL: ## BB#0:
114 ; ALL-NEXT: vfmadd213ps {{.*}}(%rip){1to16}, %zmm0, %zmm1
115 ; ALL-NEXT: vmovaps %zmm1, %zmm0
113 ; ALL-NEXT: vfmadd213ps {{.*}}(%rip){1to16}, %zmm1, %zmm0
116114 ; ALL-NEXT: retq
117115 %b1 = fmul <16 x float> %a1, %a2
118116 %b2 = fadd <16 x float> %b1,
174172 ; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
175173 ; KNL-NEXT: vpslld $31, %zmm2, %zmm2
176174 ; KNL-NEXT: vptestmd %zmm2, %zmm2, %k1
177 ; KNL-NEXT: vmovups (%rdi), %zmm2
178 ; KNL-NEXT: vfmadd132ps %zmm0, %zmm2, %zmm1 {%k1}
175 ; KNL-NEXT: vfmadd213ps (%rdi), %zmm0, %zmm1 {%k1}
179176 ; KNL-NEXT: vmovaps %zmm1, %zmm0
180177 ; KNL-NEXT: retq
181178 ;
183180 ; SKX: ## BB#0:
184181 ; SKX-NEXT: vpsllw $7, %xmm2, %xmm2
185182 ; SKX-NEXT: vpmovb2m %xmm2, %k1
186 ; SKX-NEXT: vmovups (%rdi), %zmm2
187 ; SKX-NEXT: vfmadd132ps %zmm0, %zmm2, %zmm1 {%k1}
183 ; SKX-NEXT: vfmadd213ps (%rdi), %zmm0, %zmm1 {%k1}
188184 ; SKX-NEXT: vmovaps %zmm1, %zmm0
189185 ; SKX-NEXT: retq
190186 %a2 = load <16 x float>,<16 x float> *%a2_ptrt,align 1
18111811 ; CHECK-LABEL: test_mask_vfmadd128_ps_rmk:
18121812 ; CHECK: ## BB#0:
18131813 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
1814 ; CHECK-NEXT: vmovaps (%rdi), %xmm2 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0x17]
1815 ; CHECK-NEXT: vfmadd132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x98,0xc1]
1814 ; CHECK-NEXT: vfmadd213ps (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07]
18161815 ; CHECK-NEXT: retq ## encoding: [0xc3]
18171816 %a2 = load <4 x float>, <4 x float>* %ptr_a2
18181817 %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
18231822 ; CHECK-LABEL: test_mask_vfmadd128_ps_rmka:
18241823 ; CHECK: ## BB#0:
18251824 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
1826 ; CHECK-NEXT: vmovups (%rdi), %xmm2 ## encoding: [0x62,0xf1,0x7c,0x08,0x10,0x17]
1827 ; CHECK-NEXT: vfmadd132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x98,0xc1]
1825 ; CHECK-NEXT: vfmadd213ps (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07]
18281826 ; CHECK-NEXT: retq ## encoding: [0xc3]
18291827 %a2 = load <4 x float>, <4 x float>* %ptr_a2, align 8
18301828 %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
18841882 define <4 x float> @test_mask_vfmadd128_ps_rmbz(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2) {
18851883 ; CHECK-LABEL: test_mask_vfmadd128_ps_rmbz:
18861884 ; CHECK: ## BB#0:
1887 ; CHECK-NEXT: vfmadd213ps (%rdi){1to4}, %xmm0, %xmm1 ## encoding: [0x62,0xf2,0x7d,0x18,0xa8,0x0f]
1888 ; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
1885 ; CHECK-NEXT: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07]
18891886 ; CHECK-NEXT: retq ## encoding: [0xc3]
18901887 %q = load float, float* %ptr_a2
18911888 %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
18991896 define <4 x float> @test_mask_vfmadd128_ps_rmbza(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2) {
19001897 ; CHECK-LABEL: test_mask_vfmadd128_ps_rmbza:
19011898 ; CHECK: ## BB#0:
1902 ; CHECK-NEXT: vfmadd213ps (%rdi){1to4}, %xmm0, %xmm1 ## encoding: [0x62,0xf2,0x7d,0x18,0xa8,0x0f]
1903 ; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x28,0xc1]
1899 ; CHECK-NEXT: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07]
19041900 ; CHECK-NEXT: retq ## encoding: [0xc3]
19051901 %q = load float, float* %ptr_a2, align 4
19061902 %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
19341930 ; CHECK-LABEL: test_mask_vfmadd128_pd_rmk:
19351931 ; CHECK: ## BB#0:
19361932 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
1937 ; CHECK-NEXT: vmovapd (%rdi), %xmm2 ## encoding: [0x62,0xf1,0xfd,0x08,0x28,0x17]
1938 ; CHECK-NEXT: vfmadd132pd %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x98,0xc1]
1933 ; CHECK-NEXT: vfmadd213pd (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0x07]
19391934 ; CHECK-NEXT: retq ## encoding: [0xc3]
19401935 %a2 = load <2 x double>, <2 x double>* %ptr_a2
19411936 %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
19751970 ; CHECK-LABEL: test_mask_vfmadd256_pd_rmk:
19761971 ; CHECK: ## BB#0:
19771972 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
1978 ; CHECK-NEXT: vmovapd (%rdi), %ymm2 ## encoding: [0x62,0xf1,0xfd,0x28,0x28,0x17]
1979 ; CHECK-NEXT: vfmadd132pd %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x98,0xc1]
1973 ; CHECK-NEXT: vfmadd213pd (%rdi), %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0x07]
19801974 ; CHECK-NEXT: retq ## encoding: [0xc3]
19811975 %a2 = load <4 x double>, <4 x double>* %ptr_a2
19821976 %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
6060 ; CHECK-LABEL: test5:
6161 ; CHECK: # BB#0: # %entry
6262 ; CHECK-NEXT: vxorps {{.*}}(%rip), %zmm2, %zmm2
63 ; CHECK-NEXT: vfmadd213ps {ru-sae}, %zmm2, %zmm0, %zmm1
64 ; CHECK-NEXT: vmovaps %zmm1, %zmm0
63 ; CHECK-NEXT: vfmadd213ps {ru-sae}, %zmm2, %zmm1, %zmm0
6564 ; CHECK-NEXT: retq
6665 entry:
6766 %sub.i = fsub <16 x float> , %c
7271 define <16 x float> @test6(<16 x float> %a, <16 x float> %b, <16 x float> %c) {
7372 ; CHECK-LABEL: test6:
7473 ; CHECK: # BB#0: # %entry
75 ; CHECK-NEXT: vfnmsub213ps {ru-sae}, %zmm2, %zmm0, %zmm1
76 ; CHECK-NEXT: vxorps {{.*}}(%rip), %zmm1, %zmm0
74 ; CHECK-NEXT: vfnmsub213ps {ru-sae}, %zmm2, %zmm1, %zmm0
75 ; CHECK-NEXT: vxorps {{.*}}(%rip), %zmm0, %zmm0
7776 ; CHECK-NEXT: retq
7877 entry:
7978 %0 = tail call <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 -1, i32 2) #2