llvm.org GIT mirror llvm / b9583a3
Add LiveRangeShrink pass to shrink live range within BB.

Summary: The LiveRangeShrink pass moves an instruction to just after the definitions of its operands, within the same BB, when the instruction has a single def and more than one register use, each of which is the only use of its defining instruction. This pass is inexpensive and guarantees optimal live ranges within a BB.

Reviewers: davidxl, wmi, hfinkel, MatzeB, andreadb

Reviewed By: MatzeB, andreadb

Subscribers: hiraditya, jyknight, sanjoy, skatkov, gberry, jholewinski, qcolombet, javed.absar, krytarowski, atrick, spatel, RKSimon, andreadb, MatzeB, mehdi_amini, mgorny, efriedma, davide, dberlin, llvm-commits

Differential Revision: https://reviews.llvm.org/D32563

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@304371 91177308-0d34-0410-b5e6-96231b3b80d8

Dehao Chen, 2 years ago
46 changed file(s) with 1627 addition(s) and 1351 deletion(s).
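To illustrate the transformation on a small example (the function and value names below are made up for this write-up; this is not the test file added by the commit): the four products are defined up front but only combined at the end, so after instruction selection all four values would otherwise stay live across the whole block. Each product has exactly one use and each add shrinks two such live ranges, so LiveRangeShrink hoists every add to just after the definition of its later operand, leaving at most two of the intermediate values live at any point.

define i32 @sum_of_squares(i32 %a, i32 %b, i32 %c, i32 %d) {
entry:
  %m1 = mul i32 %a, %a    ; each %mN has exactly one (non-debug) use below
  %m2 = mul i32 %b, %b
  %m3 = mul i32 %c, %c
  %m4 = mul i32 %d, %d
  %s1 = add i32 %m1, %m2  ; hoisted to just after %m2, ending %m1/%m2 early
  %s2 = add i32 %s1, %m3  ; hoisted to just after %m3
  %s3 = add i32 %s2, %m4
  ret i32 %s3
}

The same effect shows up in the regenerated x86 test output below: the addss/paddd instructions move up next to the instructions that define their operands, and in avg_v64i8 the 152-byte spill area (and its subq/addq bookkeeping) disappears entirely.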
132132 // instruction and update the MachineFunctionInfo with that information.
133133 extern char &ShrinkWrapID;
134134
 135 /// LiveRangeShrink pass. Moves instructions close to their definitions to
 136 /// shrink the definitions' live ranges.
137 extern char &LiveRangeShrinkID;
138
135139 /// Greedy register allocator.
136140 extern char &RAGreedyID;
137141
187187 void initializeLiveDebugValuesPass(PassRegistry&);
188188 void initializeLiveDebugVariablesPass(PassRegistry&);
189189 void initializeLiveIntervalsPass(PassRegistry&);
190 void initializeLiveRangeShrinkPass(PassRegistry&);
190191 void initializeLiveRegMatrixPass(PassRegistry&);
191192 void initializeLiveStacksPass(PassRegistry&);
192193 void initializeLiveVariablesPass(PassRegistry&);
4848 LivePhysRegs.cpp
4949 LiveRangeCalc.cpp
5050 LiveRangeEdit.cpp
51 LiveRangeShrink.cpp
5152 LiveRegMatrix.cpp
5253 LiveRegUnits.cpp
5354 LiveStackAnalysis.cpp
4242 initializeLiveDebugValuesPass(Registry);
4343 initializeLiveDebugVariablesPass(Registry);
4444 initializeLiveIntervalsPass(Registry);
45 initializeLiveRangeShrinkPass(Registry);
4546 initializeLiveStacksPass(Registry);
4647 initializeLiveVariablesPass(Registry);
4748 initializeLocalStackSlotPassPass(Registry);
0 //===-- LiveRangeShrink.cpp - Move instructions to shrink live range ------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
 7 //===----------------------------------------------------------------------===//
 8 ///
 9 /// \file
 10 /// This pass moves instructions close to the definitions of their operands in
 11 /// order to shrink the live range of each definition. The code motion is
 12 /// limited to within the basic block. The moved instruction should have one
 13 /// def and more than one use, each of which is the only use of its def.
 14 ///
 15 //===----------------------------------------------------------------------===//
16 #include "llvm/ADT/Statistic.h"
17 #include "llvm/CodeGen/MachineFunctionPass.h"
18 #include "llvm/CodeGen/MachineRegisterInfo.h"
19 #include "llvm/CodeGen/Passes.h"
20 #include "llvm/Support/Debug.h"
21
22 #define DEBUG_TYPE "lrshrink"
23
24 STATISTIC(NumInstrsHoistedToShrinkLiveRange,
25 "Number of insructions hoisted to shrink live range.");
26
27 using namespace llvm;
28
29 namespace {
30 class LiveRangeShrink : public MachineFunctionPass {
31 public:
32 static char ID;
33
34 LiveRangeShrink() : MachineFunctionPass(ID) {
35 initializeLiveRangeShrinkPass(*PassRegistry::getPassRegistry());
36 }
37
38 void getAnalysisUsage(AnalysisUsage &AU) const override {
39 AU.setPreservesCFG();
40 MachineFunctionPass::getAnalysisUsage(AU);
41 }
42
43 StringRef getPassName() const override { return "Live Range Shrink"; }
44
45 bool runOnMachineFunction(MachineFunction &MF) override;
46 };
47 } // End anonymous namespace.
48
49 char LiveRangeShrink::ID = 0;
50 char &llvm::LiveRangeShrinkID = LiveRangeShrink::ID;
51
52 INITIALIZE_PASS(LiveRangeShrink, "lrshrink", "Live Range Shrink Pass", false,
53 false)
54 namespace {
 55 typedef DenseMap<MachineInstr *, unsigned> InstOrderMap;
56
 57 /// Returns \p New if it's dominated by \p Old, otherwise returns \p Old.
 58 /// \p M maintains a map from instruction to its dominating order, such that
 59 /// M[A] > M[B] guarantees that A is dominated by B.
 60 /// If \p New is not in \p M, returns \p Old. Otherwise, if \p Old is null,
 61 /// returns \p New.
62 MachineInstr *FindDominatedInstruction(MachineInstr &New, MachineInstr *Old,
63 const InstOrderMap &M) {
64 auto NewIter = M.find(&New);
65 if (NewIter == M.end())
66 return Old;
67 if (Old == nullptr)
68 return &New;
69 unsigned OrderOld = M.find(Old)->second;
70 unsigned OrderNew = NewIter->second;
71 if (OrderOld != OrderNew)
72 return OrderOld < OrderNew ? &New : Old;
73 // OrderOld == OrderNew, we need to iterate down from Old to see if it
74 // can reach New, if yes, New is dominated by Old.
75 for (MachineInstr *I = Old->getNextNode(); M.find(I)->second == OrderNew;
76 I = I->getNextNode())
77 if (I == &New)
78 return &New;
79 return Old;
80 }
81
82 /// Builds Instruction to its dominating order number map \p M by traversing
83 /// from instruction \p Start.
84 void BuildInstOrderMap(MachineBasicBlock::iterator Start, InstOrderMap &M) {
85 M.clear();
86 unsigned i = 0;
87 for (MachineInstr &I : make_range(Start, Start->getParent()->end()))
88 M[&I] = i++;
89 }
90 } // end anonymous namespace
91
92 bool LiveRangeShrink::runOnMachineFunction(MachineFunction &MF) {
93 if (skipFunction(*MF.getFunction()))
94 return false;
95
96 MachineRegisterInfo &MRI = MF.getRegInfo();
97
98 DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n');
99
100 InstOrderMap IOM;
 101 // Map from register to the instruction order (value of IOM) at which
 102 // the register is last used. When moving an instruction up, we need to
 103 // make sure that none of its defs (including dead defs) crosses the
 104 // last use of the corresponding register.
 105 DenseMap<unsigned, std::pair<unsigned, MachineInstr *>> UseMap;
106
107 for (MachineBasicBlock &MBB : MF) {
108 if (MBB.empty())
109 continue;
110 bool SawStore = false;
111 BuildInstOrderMap(MBB.begin(), IOM);
112 UseMap.clear();
113
114 for (MachineBasicBlock::iterator Next = MBB.begin(); Next != MBB.end();) {
115 MachineInstr &MI = *Next;
116 ++Next;
117 if (MI.isPHI() || MI.isDebugValue())
118 continue;
119 if (MI.mayStore())
120 SawStore = true;
121
122 unsigned CurrentOrder = IOM[&MI];
123 unsigned Barrier = 0;
124 MachineInstr *BarrierMI = nullptr;
125 for (const MachineOperand &MO : MI.operands()) {
126 if (!MO.isReg() || MO.isDebug())
127 continue;
128 if (MO.isUse())
129 UseMap[MO.getReg()] = std::make_pair(CurrentOrder, &MI);
130 else if (MO.isDead() && UseMap.count(MO.getReg()))
 131 // Barrier is the last instruction where MO gets used. MI should not
132 // be moved above Barrier.
133 if (Barrier < UseMap[MO.getReg()].first) {
134 Barrier = UseMap[MO.getReg()].first;
135 BarrierMI = UseMap[MO.getReg()].second;
136 }
137 }
138
139 if (!MI.isSafeToMove(nullptr, SawStore)) {
140 // If MI has side effects, it should become a barrier for code motion.
 141 // IOM is rebuilt from the next instruction to prevent later
142 // instructions from being moved before this MI.
143 if (MI.hasUnmodeledSideEffects() && Next != MBB.end()) {
144 BuildInstOrderMap(Next, IOM);
145 SawStore = false;
146 }
147 continue;
148 }
149
150 const MachineOperand *DefMO = nullptr;
151 MachineInstr *Insert = nullptr;
152
153 // Number of live-ranges that will be shortened. We do not count
154 // live-ranges that are defined by a COPY as it could be coalesced later.
155 unsigned NumEligibleUse = 0;
156
157 for (const MachineOperand &MO : MI.operands()) {
158 if (!MO.isReg() || MO.isDead() || MO.isDebug())
159 continue;
160 unsigned Reg = MO.getReg();
161 // Do not move the instruction if it def/uses a physical register,
162 // unless it is a constant physical register or a noreg.
163 if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
164 if (!Reg || MRI.isConstantPhysReg(Reg))
165 continue;
166 Insert = nullptr;
167 break;
168 }
169 if (MO.isDef()) {
170 // Do not move if there is more than one def.
171 if (DefMO) {
172 Insert = nullptr;
173 break;
174 }
175 DefMO = &MO;
176 } else if (MRI.hasOneNonDBGUse(Reg) && MRI.hasOneDef(Reg) && DefMO &&
177 MRI.getRegClass(DefMO->getReg()) ==
178 MRI.getRegClass(MO.getReg())) {
179 // The heuristic does not handle different register classes yet
180 // (registers of different sizes, looser/tighter constraints). This
 181 // is because it needs a more accurate model to handle register
182 // pressure correctly.
183 MachineInstr &DefInstr = *MRI.def_instr_begin(Reg);
184 if (!DefInstr.isCopy())
185 NumEligibleUse++;
186 Insert = FindDominatedInstruction(DefInstr, Insert, IOM);
187 } else {
188 Insert = nullptr;
189 break;
190 }
191 }
192
 193 // If Barrier equals IOM[Insert], traverse forward to find whether BarrierMI
 194 // is after Insert; if so, we should not hoist.
195 for (MachineInstr *I = Insert; I && IOM[I] == Barrier;
196 I = I->getNextNode())
197 if (I == BarrierMI) {
198 Insert = nullptr;
199 break;
200 }
201 // Move the instruction when # of shrunk live range > 1.
202 if (DefMO && Insert && NumEligibleUse > 1 && Barrier <= IOM[Insert]) {
203 MachineBasicBlock::iterator I = std::next(Insert->getIterator());
204 // Skip all the PHI and debug instructions.
205 while (I != MBB.end() && (I->isPHI() || I->isDebugValue()))
206 I = std::next(I);
207 if (I == MI.getIterator())
208 continue;
209
210 // Update the dominator order to be the same as the insertion point.
 211 // We do this to maintain a non-decreasing order without needing to update
212 // all instruction orders after the insertion point.
213 unsigned NewOrder = IOM[&*I];
214 IOM[&MI] = NewOrder;
215 NumInstrsHoistedToShrinkLiveRange++;
216
217 // Find MI's debug value following MI.
218 MachineBasicBlock::iterator EndIter = std::next(MI.getIterator());
219 if (MI.getOperand(0).isReg())
220 for (; EndIter != MBB.end() && EndIter->isDebugValue() &&
221 EndIter->getOperand(0).isReg() &&
222 EndIter->getOperand(0).getReg() == MI.getOperand(0).getReg();
223 ++EndIter, ++Next)
224 IOM[&*EndIter] = NewOrder;
225 MBB.splice(I, &MBB, MI.getIterator(), EndIter);
226 }
227 }
228 }
229 return false;
230 }
432432
433433 void X86PassConfig::addPreRegAlloc() {
434434 if (getOptLevel() != CodeGenOpt::None) {
435 addPass(&LiveRangeShrinkID);
435436 addPass(createX86FixupSetCC());
436437 addPass(createX86OptimizeLEAs());
437438 addPass(createX86CallFrameOptimization());
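With the pass scheduled in the X86 pre-RA pipeline above, an ordinary llc -O2 run on x86 now exercises it. One quick way to confirm the pipeline position is -debug-pass=Structure; the snippet below is only a sketch of such a check (the RUN invocation, triple, and function are assumptions for illustration, not part of this commit), matching the "Live Range Shrink" name returned by getPassName() above.

; Hypothetical pipeline check, not a test added by this commit.
; RUN: llc -mtriple=x86_64-unknown-unknown -O2 -debug-pass=Structure \
; RUN:     -o /dev/null %s 2>&1 | FileCheck %s
; CHECK: Live Range Shrink
define void @f() {
  ret void
}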
1212
1313 ; CHECK: mulss
1414 ; CHECK: mulss
15 ; CHECK: mulss
15 ; CHECK: addss
1616 ; CHECK: mulss
1717 ; CHECK: addss
18 ; CHECK: addss
18 ; CHECK: mulss
1919 ; CHECK: addss
2020 ; CHECK: ret
2121 }
134134 define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) {
135135 ; SSE2-LABEL: avg_v32i8:
136136 ; SSE2: # BB#0:
137 ; SSE2-NEXT: movdqa (%rdi), %xmm8
138 ; SSE2-NEXT: movdqa 16(%rdi), %xmm11
137 ; SSE2-NEXT: movdqa (%rdi), %xmm3
138 ; SSE2-NEXT: movdqa 16(%rdi), %xmm8
139139 ; SSE2-NEXT: movdqa (%rsi), %xmm0
140140 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1
141141 ; SSE2-NEXT: pxor %xmm4, %xmm4
142 ; SSE2-NEXT: movdqa %xmm3, %xmm5
143 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
144 ; SSE2-NEXT: movdqa %xmm5, %xmm6
145 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
146 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
147 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
148 ; SSE2-NEXT: movdqa %xmm3, %xmm12
149 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
150 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
151 ; SSE2-NEXT: movdqa %xmm8, %xmm7
152 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
153 ; SSE2-NEXT: movdqa %xmm7, %xmm11
154 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
155 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
156 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
142157 ; SSE2-NEXT: movdqa %xmm8, %xmm10
143 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15]
144 ; SSE2-NEXT: movdqa %xmm10, %xmm2
145 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
146 ; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
147 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3]
148 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
149 ; SSE2-NEXT: movdqa %xmm8, %xmm12
150 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
158 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
151159 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
152 ; SSE2-NEXT: movdqa %xmm11, %xmm15
153 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15]
154 ; SSE2-NEXT: movdqa %xmm15, %xmm14
155 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7]
156 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3]
157 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
158 ; SSE2-NEXT: movdqa %xmm11, %xmm9
160 ; SSE2-NEXT: movdqa %xmm0, %xmm2
161 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
162 ; SSE2-NEXT: movdqa %xmm2, %xmm9
159163 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
160 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3]
161 ; SSE2-NEXT: movdqa %xmm0, %xmm3
164 ; SSE2-NEXT: paddd %xmm6, %xmm9
165 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
166 ; SSE2-NEXT: paddd %xmm5, %xmm2
167 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
168 ; SSE2-NEXT: movdqa %xmm0, %xmm5
169 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
170 ; SSE2-NEXT: paddd %xmm12, %xmm5
171 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
172 ; SSE2-NEXT: paddd %xmm3, %xmm0
173 ; SSE2-NEXT: movdqa %xmm1, %xmm3
162174 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
163 ; SSE2-NEXT: movdqa %xmm3, %xmm7
175 ; SSE2-NEXT: movdqa %xmm3, %xmm6
176 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
177 ; SSE2-NEXT: paddd %xmm11, %xmm6
178 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
179 ; SSE2-NEXT: paddd %xmm7, %xmm3
180 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
181 ; SSE2-NEXT: movdqa %xmm1, %xmm7
164182 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
165 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
166 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
167 ; SSE2-NEXT: movdqa %xmm0, %xmm6
168 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
169 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
170 ; SSE2-NEXT: movdqa %xmm1, %xmm2
171 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
172 ; SSE2-NEXT: movdqa %xmm2, %xmm5
173 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
174 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
175 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
176 ; SSE2-NEXT: movdqa %xmm1, %xmm13
177 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7]
183 ; SSE2-NEXT: paddd %xmm10, %xmm7
178184 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
179 ; SSE2-NEXT: paddd %xmm11, %xmm1
180 ; SSE2-NEXT: paddd %xmm9, %xmm13
181 ; SSE2-NEXT: paddd %xmm15, %xmm2
182 ; SSE2-NEXT: paddd %xmm14, %xmm5
183 ; SSE2-NEXT: paddd %xmm8, %xmm0
184 ; SSE2-NEXT: paddd %xmm12, %xmm6
185 ; SSE2-NEXT: paddd %xmm10, %xmm3
186 ; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload
185 ; SSE2-NEXT: paddd %xmm8, %xmm1
187186 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1]
187 ; SSE2-NEXT: paddd %xmm4, %xmm9
188 ; SSE2-NEXT: paddd %xmm4, %xmm2
189 ; SSE2-NEXT: paddd %xmm4, %xmm5
190 ; SSE2-NEXT: paddd %xmm4, %xmm0
191 ; SSE2-NEXT: paddd %xmm4, %xmm6
192 ; SSE2-NEXT: paddd %xmm4, %xmm3
188193 ; SSE2-NEXT: paddd %xmm4, %xmm7
189 ; SSE2-NEXT: paddd %xmm4, %xmm3
190 ; SSE2-NEXT: paddd %xmm4, %xmm6
191 ; SSE2-NEXT: paddd %xmm4, %xmm0
192 ; SSE2-NEXT: paddd %xmm4, %xmm5
193 ; SSE2-NEXT: paddd %xmm4, %xmm2
194 ; SSE2-NEXT: paddd %xmm4, %xmm13
195194 ; SSE2-NEXT: paddd %xmm4, %xmm1
195 ; SSE2-NEXT: psrld $1, %xmm1
196 ; SSE2-NEXT: psrld $1, %xmm7
196197 ; SSE2-NEXT: psrld $1, %xmm3
197 ; SSE2-NEXT: psrld $1, %xmm7
198 ; SSE2-NEXT: psrld $1, %xmm6
199 ; SSE2-NEXT: psrld $1, %xmm0
200 ; SSE2-NEXT: psrld $1, %xmm5
201 ; SSE2-NEXT: psrld $1, %xmm2
202 ; SSE2-NEXT: psrld $1, %xmm9
198203 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
204 ; SSE2-NEXT: pand %xmm4, %xmm9
205 ; SSE2-NEXT: pand %xmm4, %xmm2
206 ; SSE2-NEXT: packuswb %xmm9, %xmm2
207 ; SSE2-NEXT: pand %xmm4, %xmm5
208 ; SSE2-NEXT: pand %xmm4, %xmm0
209 ; SSE2-NEXT: packuswb %xmm5, %xmm0
210 ; SSE2-NEXT: packuswb %xmm2, %xmm0
211 ; SSE2-NEXT: pand %xmm4, %xmm6
212 ; SSE2-NEXT: pand %xmm4, %xmm3
213 ; SSE2-NEXT: packuswb %xmm6, %xmm3
199214 ; SSE2-NEXT: pand %xmm4, %xmm7
200 ; SSE2-NEXT: pand %xmm4, %xmm3
201 ; SSE2-NEXT: packuswb %xmm7, %xmm3
202 ; SSE2-NEXT: psrld $1, %xmm0
203 ; SSE2-NEXT: psrld $1, %xmm6
204 ; SSE2-NEXT: pand %xmm4, %xmm6
205 ; SSE2-NEXT: pand %xmm4, %xmm0
206 ; SSE2-NEXT: packuswb %xmm6, %xmm0
207 ; SSE2-NEXT: packuswb %xmm3, %xmm0
208 ; SSE2-NEXT: psrld $1, %xmm2
209 ; SSE2-NEXT: psrld $1, %xmm5
210 ; SSE2-NEXT: pand %xmm4, %xmm5
211 ; SSE2-NEXT: pand %xmm4, %xmm2
212 ; SSE2-NEXT: packuswb %xmm5, %xmm2
213 ; SSE2-NEXT: psrld $1, %xmm1
214 ; SSE2-NEXT: psrld $1, %xmm13
215 ; SSE2-NEXT: pand %xmm4, %xmm13
216215 ; SSE2-NEXT: pand %xmm4, %xmm1
217 ; SSE2-NEXT: packuswb %xmm13, %xmm1
218 ; SSE2-NEXT: packuswb %xmm2, %xmm1
216 ; SSE2-NEXT: packuswb %xmm7, %xmm1
217 ; SSE2-NEXT: packuswb %xmm3, %xmm1
219218 ; SSE2-NEXT: movdqu %xmm1, (%rax)
220219 ; SSE2-NEXT: movdqu %xmm0, (%rax)
221220 ; SSE2-NEXT: retq
258257 define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) {
259258 ; SSE2-LABEL: avg_v64i8:
260259 ; SSE2: # BB#0:
261 ; SSE2-NEXT: subq $152, %rsp
262 ; SSE2-NEXT: .Lcfi0:
263 ; SSE2-NEXT: .cfi_def_cfa_offset 160
264 ; SSE2-NEXT: movdqa (%rdi), %xmm1
265 ; SSE2-NEXT: movdqa 16(%rdi), %xmm4
266 ; SSE2-NEXT: movdqa 32(%rdi), %xmm5
267 ; SSE2-NEXT: movdqa 48(%rdi), %xmm6
260 ; SSE2-NEXT: movdqa (%rdi), %xmm6
261 ; SSE2-NEXT: movdqa 16(%rdi), %xmm2
262 ; SSE2-NEXT: movdqa 32(%rdi), %xmm1
263 ; SSE2-NEXT: movdqa 48(%rdi), %xmm0
264 ; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
265 ; SSE2-NEXT: movdqa (%rsi), %xmm5
266 ; SSE2-NEXT: movdqa 16(%rsi), %xmm13
267 ; SSE2-NEXT: movdqa 32(%rsi), %xmm11
268268 ; SSE2-NEXT: pxor %xmm0, %xmm0
269 ; SSE2-NEXT: movdqa %xmm1, %xmm3
270 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
271 ; SSE2-NEXT: movdqa %xmm3, %xmm2
272 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
273 ; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
274 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
275 ; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill
269 ; SSE2-NEXT: movdqa %xmm6, %xmm4
270 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
271 ; SSE2-NEXT: movdqa %xmm4, %xmm7
272 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
273 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
274 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
275 ; SSE2-NEXT: movdqa %xmm6, %xmm12
276 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
277 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
278 ; SSE2-NEXT: movdqa %xmm2, %xmm15
279 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15]
280 ; SSE2-NEXT: movdqa %xmm15, %xmm14
281 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
282 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
283 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
284 ; SSE2-NEXT: movdqa %xmm2, %xmm8
285 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7]
286 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
287 ; SSE2-NEXT: movdqa %xmm5, %xmm10
288 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15]
289 ; SSE2-NEXT: movdqa %xmm10, %xmm3
290 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
291 ; SSE2-NEXT: paddd %xmm7, %xmm3
292 ; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
293 ; SSE2-NEXT: movdqa %xmm1, %xmm7
294 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
295 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
296 ; SSE2-NEXT: paddd %xmm4, %xmm10
297 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
298 ; SSE2-NEXT: movdqa %xmm5, %xmm3
299 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
300 ; SSE2-NEXT: paddd %xmm12, %xmm3
301 ; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
302 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
303 ; SSE2-NEXT: paddd %xmm6, %xmm5
304 ; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
305 ; SSE2-NEXT: movdqa %xmm13, %xmm4
306 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
307 ; SSE2-NEXT: movdqa %xmm4, %xmm12
308 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
309 ; SSE2-NEXT: paddd %xmm14, %xmm12
310 ; SSE2-NEXT: movdqa %xmm7, %xmm5
311 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
312 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
276313 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
314 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
315 ; SSE2-NEXT: paddd %xmm15, %xmm4
316 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3],xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
317 ; SSE2-NEXT: movdqa %xmm13, %xmm15
318 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
319 ; SSE2-NEXT: paddd %xmm8, %xmm15
320 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
321 ; SSE2-NEXT: paddd %xmm2, %xmm13
322 ; SSE2-NEXT: movdqa %xmm11, %xmm6
323 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
324 ; SSE2-NEXT: movdqa %xmm6, %xmm9
325 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
326 ; SSE2-NEXT: paddd %xmm5, %xmm9
277327 ; SSE2-NEXT: movdqa %xmm1, %xmm2
278328 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
279 ; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
280329 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
281 ; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
282 ; SSE2-NEXT: movdqa %xmm4, %xmm3
330 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
331 ; SSE2-NEXT: paddd %xmm7, %xmm6
332 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3],xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7]
333 ; SSE2-NEXT: movdqa %xmm11, %xmm14
334 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
335 ; SSE2-NEXT: paddd %xmm2, %xmm14
336 ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
337 ; SSE2-NEXT: movdqa %xmm5, %xmm2
338 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
339 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
340 ; SSE2-NEXT: paddd %xmm1, %xmm11
341 ; SSE2-NEXT: movdqa %xmm2, %xmm1
342 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
343 ; SSE2-NEXT: movdqa 48(%rsi), %xmm7
344 ; SSE2-NEXT: movdqa %xmm7, %xmm3
283345 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
284 ; SSE2-NEXT: movdqa %xmm3, %xmm2
285 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
286 ; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
346 ; SSE2-NEXT: movdqa %xmm3, %xmm8
347 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7]
348 ; SSE2-NEXT: paddd %xmm1, %xmm8
349 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
287350 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
288 ; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill
289 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
290 ; SSE2-NEXT: movdqa %xmm4, %xmm2
291 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
292 ; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
293 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
294 ; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill
295 ; SSE2-NEXT: movdqa %xmm5, %xmm3
296 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
297 ; SSE2-NEXT: movdqa %xmm3, %xmm2
298 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
299 ; SSE2-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill
300 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
301 ; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
302 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
351 ; SSE2-NEXT: paddd %xmm2, %xmm3
303352 ; SSE2-NEXT: movdqa %xmm5, %xmm2
304 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
305 ; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
306 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
307 ; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
308 ; SSE2-NEXT: movdqa %xmm6, %xmm8
309 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15]
310 ; SSE2-NEXT: movdqa %xmm8, %xmm1
311 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
312 ; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
313 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
314 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
315 ; SSE2-NEXT: movdqa %xmm6, %xmm1
316 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
317 ; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
318 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
319 ; SSE2-NEXT: movdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill
320 ; SSE2-NEXT: movdqa (%rsi), %xmm14
321 ; SSE2-NEXT: movdqa %xmm14, %xmm7
322 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
323 ; SSE2-NEXT: movdqa %xmm7, %xmm15
324 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
325 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
326 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
327 ; SSE2-NEXT: movdqa %xmm14, %xmm9
328 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
329 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
330 ; SSE2-NEXT: movdqa 16(%rsi), %xmm12
331 ; SSE2-NEXT: movdqa %xmm12, %xmm6
332 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
333 ; SSE2-NEXT: movdqa %xmm6, %xmm13
334 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
335 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
336 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
337 ; SSE2-NEXT: movdqa %xmm12, %xmm10
338 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7]
339 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3]
340 ; SSE2-NEXT: movdqa 32(%rsi), %xmm2
341 ; SSE2-NEXT: movdqa %xmm2, %xmm5
342 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
343 ; SSE2-NEXT: movdqa %xmm5, %xmm11
344 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7]
345 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
346353 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
347354 ; SSE2-NEXT: movdqa %xmm2, %xmm1
348355 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
356 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
357 ; SSE2-NEXT: movdqa %xmm7, %xmm5
358 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
359 ; SSE2-NEXT: paddd %xmm1, %xmm5
360 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
361 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
362 ; SSE2-NEXT: paddd %xmm2, %xmm7
363 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1]
364 ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
365 ; SSE2-NEXT: paddd %xmm0, %xmm1
349366 ; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
350 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
351 ; SSE2-NEXT: movdqa 48(%rsi), %xmm1
352 ; SSE2-NEXT: movdqa %xmm1, %xmm4
353 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
354 ; SSE2-NEXT: movdqa %xmm4, %xmm3
355 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
356 ; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
357 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
358 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
359 ; SSE2-NEXT: movdqa %xmm1, %xmm3
360 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
361 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
362 ; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
363 ; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload
364 ; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
365 ; SSE2-NEXT: paddd %xmm8, %xmm4
366 ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload
367 ; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Folded Reload
368 ; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload
369 ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
370 ; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload
371 ; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload
372 ; SSE2-NEXT: paddd (%rsp), %xmm11 # 16-byte Folded Reload
373 ; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm12 # 16-byte Folded Reload
374 ; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm10 # 16-byte Folded Reload
375 ; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm6 # 16-byte Folded Reload
376 ; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm13 # 16-byte Folded Reload
377 ; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm14 # 16-byte Folded Reload
378 ; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm9 # 16-byte Folded Reload
379 ; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload
380 ; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm15 # 16-byte Folded Reload
381 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1]
367 ; SSE2-NEXT: paddd %xmm0, %xmm10
368 ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
369 ; SSE2-NEXT: paddd %xmm0, %xmm1
370 ; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
371 ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
372 ; SSE2-NEXT: paddd %xmm0, %xmm2
373 ; SSE2-NEXT: paddd %xmm0, %xmm12
374 ; SSE2-NEXT: paddd %xmm0, %xmm4
382375 ; SSE2-NEXT: paddd %xmm0, %xmm15
376 ; SSE2-NEXT: paddd %xmm0, %xmm13
377 ; SSE2-NEXT: paddd %xmm0, %xmm9
378 ; SSE2-NEXT: paddd %xmm0, %xmm6
379 ; SSE2-NEXT: paddd %xmm0, %xmm14
380 ; SSE2-NEXT: paddd %xmm0, %xmm11
381 ; SSE2-NEXT: paddd %xmm0, %xmm8
382 ; SSE2-NEXT: paddd %xmm0, %xmm3
383 ; SSE2-NEXT: paddd %xmm0, %xmm5
383384 ; SSE2-NEXT: paddd %xmm0, %xmm7
384 ; SSE2-NEXT: paddd %xmm0, %xmm9
385 ; SSE2-NEXT: paddd %xmm0, %xmm14
386 ; SSE2-NEXT: paddd %xmm0, %xmm13
387 ; SSE2-NEXT: paddd %xmm0, %xmm6
388 ; SSE2-NEXT: paddd %xmm0, %xmm10
389 ; SSE2-NEXT: paddd %xmm0, %xmm12
390 ; SSE2-NEXT: paddd %xmm0, %xmm11
391 ; SSE2-NEXT: paddd %xmm0, %xmm5
392 ; SSE2-NEXT: paddd %xmm0, %xmm3
393 ; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
394 ; SSE2-NEXT: paddd %xmm0, %xmm2
395 ; SSE2-NEXT: paddd %xmm0, %xmm8
396 ; SSE2-NEXT: paddd %xmm0, %xmm4
397 ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
398 ; SSE2-NEXT: paddd %xmm0, %xmm3
399 ; SSE2-NEXT: paddd %xmm0, %xmm1
400 ; SSE2-NEXT: psrld $1, %xmm7
385 ; SSE2-NEXT: psrld $1, %xmm10
386 ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
387 ; SSE2-NEXT: psrld $1, %xmm1
388 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
389 ; SSE2-NEXT: pand %xmm0, %xmm1
390 ; SSE2-NEXT: pand %xmm0, %xmm10
391 ; SSE2-NEXT: packuswb %xmm1, %xmm10
392 ; SSE2-NEXT: psrld $1, %xmm2
393 ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
394 ; SSE2-NEXT: psrld $1, %xmm1
395 ; SSE2-NEXT: pand %xmm0, %xmm1
396 ; SSE2-NEXT: pand %xmm0, %xmm2
397 ; SSE2-NEXT: packuswb %xmm1, %xmm2
398 ; SSE2-NEXT: packuswb %xmm10, %xmm2
399 ; SSE2-NEXT: movdqa %xmm2, %xmm1
400 ; SSE2-NEXT: psrld $1, %xmm4
401 ; SSE2-NEXT: psrld $1, %xmm12
402 ; SSE2-NEXT: pand %xmm0, %xmm12
403 ; SSE2-NEXT: pand %xmm0, %xmm4
404 ; SSE2-NEXT: packuswb %xmm12, %xmm4
405 ; SSE2-NEXT: psrld $1, %xmm13
401406 ; SSE2-NEXT: psrld $1, %xmm15
402 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
403407 ; SSE2-NEXT: pand %xmm0, %xmm15
404 ; SSE2-NEXT: pand %xmm0, %xmm7
405 ; SSE2-NEXT: packuswb %xmm15, %xmm7
406 ; SSE2-NEXT: psrld $1, %xmm14
408 ; SSE2-NEXT: pand %xmm0, %xmm13
409 ; SSE2-NEXT: packuswb %xmm15, %xmm13
410 ; SSE2-NEXT: packuswb %xmm4, %xmm13
411 ; SSE2-NEXT: psrld $1, %xmm6
407412 ; SSE2-NEXT: psrld $1, %xmm9
408413 ; SSE2-NEXT: pand %xmm0, %xmm9
414 ; SSE2-NEXT: pand %xmm0, %xmm6
415 ; SSE2-NEXT: packuswb %xmm9, %xmm6
416 ; SSE2-NEXT: psrld $1, %xmm11
417 ; SSE2-NEXT: psrld $1, %xmm14
409418 ; SSE2-NEXT: pand %xmm0, %xmm14
410 ; SSE2-NEXT: packuswb %xmm9, %xmm14
411 ; SSE2-NEXT: packuswb %xmm7, %xmm14
412 ; SSE2-NEXT: psrld $1, %xmm6
413 ; SSE2-NEXT: psrld $1, %xmm13
414 ; SSE2-NEXT: pand %xmm0, %xmm13
415 ; SSE2-NEXT: pand %xmm0, %xmm6
416 ; SSE2-NEXT: packuswb %xmm13, %xmm6
417 ; SSE2-NEXT: psrld $1, %xmm12
418 ; SSE2-NEXT: psrld $1, %xmm10
419 ; SSE2-NEXT: pand %xmm0, %xmm10
420 ; SSE2-NEXT: pand %xmm0, %xmm12
421 ; SSE2-NEXT: packuswb %xmm10, %xmm12
422 ; SSE2-NEXT: packuswb %xmm6, %xmm12
423 ; SSE2-NEXT: psrld $1, %xmm5
424 ; SSE2-NEXT: psrld $1, %xmm11
425419 ; SSE2-NEXT: pand %xmm0, %xmm11
426 ; SSE2-NEXT: pand %xmm0, %xmm5
427 ; SSE2-NEXT: packuswb %xmm11, %xmm5
428 ; SSE2-NEXT: psrld $1, %xmm2
429 ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
430 ; SSE2-NEXT: psrld $1, %xmm6
431 ; SSE2-NEXT: pand %xmm0, %xmm6
432 ; SSE2-NEXT: pand %xmm0, %xmm2
433 ; SSE2-NEXT: packuswb %xmm6, %xmm2
434 ; SSE2-NEXT: packuswb %xmm5, %xmm2
435 ; SSE2-NEXT: psrld $1, %xmm4
436 ; SSE2-NEXT: movdqa %xmm8, %xmm5
420 ; SSE2-NEXT: packuswb %xmm14, %xmm11
421 ; SSE2-NEXT: packuswb %xmm6, %xmm11
422 ; SSE2-NEXT: psrld $1, %xmm3
423 ; SSE2-NEXT: psrld $1, %xmm8
424 ; SSE2-NEXT: pand %xmm0, %xmm8
425 ; SSE2-NEXT: pand %xmm0, %xmm3
426 ; SSE2-NEXT: packuswb %xmm8, %xmm3
427 ; SSE2-NEXT: psrld $1, %xmm7
437428 ; SSE2-NEXT: psrld $1, %xmm5
438429 ; SSE2-NEXT: pand %xmm0, %xmm5
439 ; SSE2-NEXT: pand %xmm0, %xmm4
440 ; SSE2-NEXT: packuswb %xmm5, %xmm4
441 ; SSE2-NEXT: psrld $1, %xmm1
442 ; SSE2-NEXT: movdqa %xmm3, %xmm5
443 ; SSE2-NEXT: psrld $1, %xmm5
444 ; SSE2-NEXT: pand %xmm0, %xmm5
445 ; SSE2-NEXT: pand %xmm0, %xmm1
446 ; SSE2-NEXT: packuswb %xmm5, %xmm1
447 ; SSE2-NEXT: packuswb %xmm4, %xmm1
430 ; SSE2-NEXT: pand %xmm0, %xmm7
431 ; SSE2-NEXT: packuswb %xmm5, %xmm7
432 ; SSE2-NEXT: packuswb %xmm3, %xmm7
433 ; SSE2-NEXT: movdqu %xmm7, (%rax)
434 ; SSE2-NEXT: movdqu %xmm11, (%rax)
435 ; SSE2-NEXT: movdqu %xmm13, (%rax)
448436 ; SSE2-NEXT: movdqu %xmm1, (%rax)
449 ; SSE2-NEXT: movdqu %xmm2, (%rax)
450 ; SSE2-NEXT: movdqu %xmm12, (%rax)
451 ; SSE2-NEXT: movdqu %xmm14, (%rax)
452 ; SSE2-NEXT: addq $152, %rsp
453437 ; SSE2-NEXT: retq
454438 ;
455439 ; AVX2-LABEL: avg_v64i8:
463447 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
464448 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
465449 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
466 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
467 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
468 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
469 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm12 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
470 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
471 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
472 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
473 ; AVX2-NEXT: vpaddd %ymm15, %ymm7, %ymm7
474 ; AVX2-NEXT: vpaddd %ymm14, %ymm6, %ymm6
475 ; AVX2-NEXT: vpaddd %ymm13, %ymm5, %ymm5
476 ; AVX2-NEXT: vpaddd %ymm12, %ymm4, %ymm4
477 ; AVX2-NEXT: vpaddd %ymm11, %ymm3, %ymm3
478 ; AVX2-NEXT: vpaddd %ymm10, %ymm2, %ymm2
479 ; AVX2-NEXT: vpaddd %ymm9, %ymm1, %ymm1
480450 ; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm0
451 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
452 ; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm1
453 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
454 ; AVX2-NEXT: vpaddd %ymm8, %ymm2, %ymm2
455 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
456 ; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm3
457 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
458 ; AVX2-NEXT: vpaddd %ymm8, %ymm4, %ymm4
459 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
460 ; AVX2-NEXT: vpaddd %ymm8, %ymm5, %ymm5
461 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
462 ; AVX2-NEXT: vpaddd %ymm8, %ymm6, %ymm6
463 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
464 ; AVX2-NEXT: vpaddd %ymm8, %ymm7, %ymm7
481465 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm8
482466 ; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm9
483467 ; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm10
539523 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
540524 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
541525 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
542 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
543 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
544 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
545 ; AVX512F-NEXT: vpaddd %zmm7, %zmm3, %zmm3
546 ; AVX512F-NEXT: vpaddd %zmm6, %zmm2, %zmm2
547 ; AVX512F-NEXT: vpaddd %zmm5, %zmm1, %zmm1
548526 ; AVX512F-NEXT: vpaddd %zmm4, %zmm0, %zmm0
527 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
528 ; AVX512F-NEXT: vpaddd %zmm4, %zmm1, %zmm1
529 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
530 ; AVX512F-NEXT: vpaddd %zmm4, %zmm2, %zmm2
531 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
532 ; AVX512F-NEXT: vpaddd %zmm4, %zmm3, %zmm3
549533 ; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm4
550534 ; AVX512F-NEXT: vpaddd %zmm4, %zmm0, %zmm0
551535 ; AVX512F-NEXT: vpaddd %zmm4, %zmm1, %zmm1
672656 define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) {
673657 ; SSE2-LABEL: avg_v16i16:
674658 ; SSE2: # BB#0:
675 ; SSE2-NEXT: movdqa (%rdi), %xmm4
676 ; SSE2-NEXT: movdqa 16(%rdi), %xmm5
659 ; SSE2-NEXT: movdqa (%rdi), %xmm2
660 ; SSE2-NEXT: movdqa 16(%rdi), %xmm4
677661 ; SSE2-NEXT: movdqa (%rsi), %xmm0
678662 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1
679 ; SSE2-NEXT: pxor %xmm6, %xmm6
680 ; SSE2-NEXT: movdqa %xmm4, %xmm8
681 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
682 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
683 ; SSE2-NEXT: movdqa %xmm5, %xmm7
684 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
685 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
663 ; SSE2-NEXT: pxor %xmm5, %xmm5
664 ; SSE2-NEXT: movdqa %xmm2, %xmm6
665 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
666 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
667 ; SSE2-NEXT: movdqa %xmm4, %xmm7
668 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
669 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
686670 ; SSE2-NEXT: movdqa %xmm0, %xmm3
687 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
688 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
671 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
672 ; SSE2-NEXT: paddd %xmm6, %xmm3
673 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
674 ; SSE2-NEXT: paddd %xmm2, %xmm0
689675 ; SSE2-NEXT: movdqa %xmm1, %xmm2
690 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
691 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
692 ; SSE2-NEXT: paddd %xmm5, %xmm1
676 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
693677 ; SSE2-NEXT: paddd %xmm7, %xmm2
694 ; SSE2-NEXT: paddd %xmm4, %xmm0
695 ; SSE2-NEXT: paddd %xmm8, %xmm3
678 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
679 ; SSE2-NEXT: paddd %xmm4, %xmm1
696680 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1]
697681 ; SSE2-NEXT: paddd %xmm4, %xmm3
698682 ; SSE2-NEXT: paddd %xmm4, %xmm0
754738 define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
755739 ; SSE2-LABEL: avg_v32i16:
756740 ; SSE2: # BB#0:
757 ; SSE2-NEXT: movdqa (%rdi), %xmm10
758 ; SSE2-NEXT: movdqa 16(%rdi), %xmm9
759 ; SSE2-NEXT: movdqa 32(%rdi), %xmm11
741 ; SSE2-NEXT: movdqa (%rdi), %xmm4
742 ; SSE2-NEXT: movdqa 16(%rdi), %xmm11
743 ; SSE2-NEXT: movdqa 32(%rdi), %xmm10
760744 ; SSE2-NEXT: movdqa 48(%rdi), %xmm8
761 ; SSE2-NEXT: movdqa (%rsi), %xmm14
745 ; SSE2-NEXT: movdqa (%rsi), %xmm9
762746 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1
763747 ; SSE2-NEXT: movdqa 32(%rsi), %xmm2
764748 ; SSE2-NEXT: movdqa 48(%rsi), %xmm3
765749 ; SSE2-NEXT: pxor %xmm0, %xmm0
766 ; SSE2-NEXT: movdqa %xmm10, %xmm4
767 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
768 ; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
750 ; SSE2-NEXT: movdqa %xmm4, %xmm6
751 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
752 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
753 ; SSE2-NEXT: movdqa %xmm11, %xmm5
754 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
755 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
756 ; SSE2-NEXT: movdqa %xmm10, %xmm12
757 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
769758 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
770 ; SSE2-NEXT: movdqa %xmm9, %xmm12
771 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
772 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
773 ; SSE2-NEXT: movdqa %xmm11, %xmm15
774 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
775 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
776759 ; SSE2-NEXT: movdqa %xmm8, %xmm13
777760 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
778761 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
779 ; SSE2-NEXT: movdqa %xmm14, %xmm7
762 ; SSE2-NEXT: movdqa %xmm9, %xmm7
780763 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
781 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
764 ; SSE2-NEXT: paddd %xmm6, %xmm7
765 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
766 ; SSE2-NEXT: paddd %xmm4, %xmm9
782767 ; SSE2-NEXT: movdqa %xmm1, %xmm6
783768 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
769 ; SSE2-NEXT: paddd %xmm5, %xmm6
784770 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
771 ; SSE2-NEXT: paddd %xmm11, %xmm1
785772 ; SSE2-NEXT: movdqa %xmm2, %xmm5
786773 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
774 ; SSE2-NEXT: paddd %xmm12, %xmm5
787775 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
776 ; SSE2-NEXT: paddd %xmm10, %xmm2
788777 ; SSE2-NEXT: movdqa %xmm3, %xmm4
789778 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
779 ; SSE2-NEXT: paddd %xmm13, %xmm4
790780 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
791781 ; SSE2-NEXT: paddd %xmm8, %xmm3
792 ; SSE2-NEXT: paddd %xmm13, %xmm4
793 ; SSE2-NEXT: paddd %xmm11, %xmm2
794 ; SSE2-NEXT: paddd %xmm15, %xmm5
795 ; SSE2-NEXT: paddd %xmm9, %xmm1
796 ; SSE2-NEXT: paddd %xmm12, %xmm6
797 ; SSE2-NEXT: paddd %xmm10, %xmm14
798 ; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload
799782 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1]
800783 ; SSE2-NEXT: paddd %xmm0, %xmm7
801 ; SSE2-NEXT: paddd %xmm0, %xmm14
784 ; SSE2-NEXT: paddd %xmm0, %xmm9
802785 ; SSE2-NEXT: paddd %xmm0, %xmm6
803786 ; SSE2-NEXT: paddd %xmm0, %xmm1
804787 ; SSE2-NEXT: paddd %xmm0, %xmm5
805788 ; SSE2-NEXT: paddd %xmm0, %xmm2
806789 ; SSE2-NEXT: paddd %xmm0, %xmm4
807790 ; SSE2-NEXT: paddd %xmm0, %xmm3
808 ; SSE2-NEXT: psrld $1, %xmm14
791 ; SSE2-NEXT: psrld $1, %xmm3
792 ; SSE2-NEXT: psrld $1, %xmm4
793 ; SSE2-NEXT: psrld $1, %xmm2
794 ; SSE2-NEXT: psrld $1, %xmm5
795 ; SSE2-NEXT: psrld $1, %xmm1
796 ; SSE2-NEXT: psrld $1, %xmm6
797 ; SSE2-NEXT: psrld $1, %xmm9
809798 ; SSE2-NEXT: psrld $1, %xmm7
810799 ; SSE2-NEXT: pslld $16, %xmm7
811800 ; SSE2-NEXT: psrad $16, %xmm7
812 ; SSE2-NEXT: pslld $16, %xmm14
813 ; SSE2-NEXT: psrad $16, %xmm14
814 ; SSE2-NEXT: packssdw %xmm7, %xmm14
815 ; SSE2-NEXT: psrld $1, %xmm1
816 ; SSE2-NEXT: psrld $1, %xmm6
801 ; SSE2-NEXT: pslld $16, %xmm9
802 ; SSE2-NEXT: psrad $16, %xmm9
803 ; SSE2-NEXT: packssdw %xmm7, %xmm9
817804 ; SSE2-NEXT: pslld $16, %xmm6
818805 ; SSE2-NEXT: psrad $16, %xmm6
819806 ; SSE2-NEXT: pslld $16, %xmm1
820807 ; SSE2-NEXT: psrad $16, %xmm1
821808 ; SSE2-NEXT: packssdw %xmm6, %xmm1
822 ; SSE2-NEXT: psrld $1, %xmm2
823 ; SSE2-NEXT: psrld $1, %xmm5
824809 ; SSE2-NEXT: pslld $16, %xmm5
825810 ; SSE2-NEXT: psrad $16, %xmm5
826811 ; SSE2-NEXT: pslld $16, %xmm2
827812 ; SSE2-NEXT: psrad $16, %xmm2
828813 ; SSE2-NEXT: packssdw %xmm5, %xmm2
829 ; SSE2-NEXT: psrld $1, %xmm3
830 ; SSE2-NEXT: psrld $1, %xmm4
831814 ; SSE2-NEXT: pslld $16, %xmm4
832815 ; SSE2-NEXT: psrad $16, %xmm4
833816 ; SSE2-NEXT: pslld $16, %xmm3
836819 ; SSE2-NEXT: movdqu %xmm3, (%rax)
837820 ; SSE2-NEXT: movdqu %xmm2, (%rax)
838821 ; SSE2-NEXT: movdqu %xmm1, (%rax)
839 ; SSE2-NEXT: movdqu %xmm14, (%rax)
822 ; SSE2-NEXT: movdqu %xmm9, (%rax)
840823 ; SSE2-NEXT: retq
841824 ;
842825 ; AVX2-LABEL: avg_v32i16:
846829 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
847830 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
848831 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
849 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
850 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
851 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
852 ; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3
853 ; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2
854 ; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1
855832 ; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0
833 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
834 ; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1
835 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
836 ; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2
837 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
838 ; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3
856839 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm4
857840 ; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0
858841 ; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1
883866 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
884867 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
885868 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
886 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
887 ; AVX512F-NEXT: vpaddd %zmm3, %zmm1, %zmm1
888869 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0
870 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
871 ; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1
889872 ; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2
890873 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0
891874 ; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1
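In the updated SSE2 check lines for the avg tests above, each paddd now appears immediately after the punpcklwd/punpckhwd that produces its operand, and the earlier 16-byte stack spill and folded reload of %xmm4 disappear because no widened value stays live long enough to run out of registers. For orientation, the arithmetic these check lines encode (zero-extend, add, add one, shift right by one, pack) corresponds per element to the following scalar sketch; this is inferred from the visible instruction sequence, not code quoted from the test file:

    /* Rounding average as implied by the paddd-with-[1,1,1,1] and psrld $1
       sequence in the checks above: widen, add, add 1, halve. */
    unsigned short avg_round(unsigned short a, unsigned short b) {
        unsigned int wa = a;   /* zero-extend, as punpcklwd with a zero register does */
        unsigned int wb = b;
        return (unsigned short)((wa + wb + 1u) >> 1);
    }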
10461029 define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) {
10471030 ; SSE2-LABEL: avg_v32i8_2:
10481031 ; SSE2: # BB#0:
1049 ; SSE2-NEXT: movdqa (%rdi), %xmm8
1050 ; SSE2-NEXT: movdqa 16(%rdi), %xmm11
1032 ; SSE2-NEXT: movdqa (%rdi), %xmm3
1033 ; SSE2-NEXT: movdqa 16(%rdi), %xmm8
10511034 ; SSE2-NEXT: movdqa (%rsi), %xmm0
10521035 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1
10531036 ; SSE2-NEXT: pxor %xmm4, %xmm4
1037 ; SSE2-NEXT: movdqa %xmm3, %xmm5
1038 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
1039 ; SSE2-NEXT: movdqa %xmm5, %xmm6
1040 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
1041 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
1042 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1043 ; SSE2-NEXT: movdqa %xmm3, %xmm12
1044 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
1045 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
1046 ; SSE2-NEXT: movdqa %xmm8, %xmm7
1047 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
1048 ; SSE2-NEXT: movdqa %xmm7, %xmm11
1049 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
1050 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
1051 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
10541052 ; SSE2-NEXT: movdqa %xmm8, %xmm10
1055 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15]
1056 ; SSE2-NEXT: movdqa %xmm10, %xmm2
1057 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
1058 ; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
1059 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3]
1060 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
1061 ; SSE2-NEXT: movdqa %xmm8, %xmm12
1062 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
1053 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
10631054 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
1064 ; SSE2-NEXT: movdqa %xmm11, %xmm15
1065 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15]
1066 ; SSE2-NEXT: movdqa %xmm15, %xmm14
1067 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7]
1068 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3]
1069 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
1070 ; SSE2-NEXT: movdqa %xmm11, %xmm9
1055 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1056 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
1057 ; SSE2-NEXT: movdqa %xmm2, %xmm9
10711058 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
1072 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3]
1073 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1059 ; SSE2-NEXT: paddd %xmm6, %xmm9
1060 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
1061 ; SSE2-NEXT: paddd %xmm5, %xmm2
1062 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
1063 ; SSE2-NEXT: movdqa %xmm0, %xmm5
1064 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
1065 ; SSE2-NEXT: paddd %xmm12, %xmm5
1066 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
1067 ; SSE2-NEXT: paddd %xmm3, %xmm0
1068 ; SSE2-NEXT: movdqa %xmm1, %xmm3
10741069 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
1075 ; SSE2-NEXT: movdqa %xmm3, %xmm7
1070 ; SSE2-NEXT: movdqa %xmm3, %xmm6
1071 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
1072 ; SSE2-NEXT: paddd %xmm11, %xmm6
1073 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
1074 ; SSE2-NEXT: paddd %xmm7, %xmm3
1075 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
1076 ; SSE2-NEXT: movdqa %xmm1, %xmm7
10761077 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
1077 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
1078 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
1079 ; SSE2-NEXT: movdqa %xmm0, %xmm6
1080 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
1081 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
1082 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1083 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
1084 ; SSE2-NEXT: movdqa %xmm2, %xmm5
1085 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
1086 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
1087 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
1088 ; SSE2-NEXT: movdqa %xmm1, %xmm13
1089 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7]
1078 ; SSE2-NEXT: paddd %xmm10, %xmm7
10901079 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
1091 ; SSE2-NEXT: paddd %xmm11, %xmm1
1092 ; SSE2-NEXT: paddd %xmm9, %xmm13
1093 ; SSE2-NEXT: paddd %xmm15, %xmm2
1094 ; SSE2-NEXT: paddd %xmm14, %xmm5
1095 ; SSE2-NEXT: paddd %xmm8, %xmm0
1096 ; SSE2-NEXT: paddd %xmm12, %xmm6
1097 ; SSE2-NEXT: paddd %xmm10, %xmm3
1098 ; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload
1080 ; SSE2-NEXT: paddd %xmm8, %xmm1
10991081 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1]
1082 ; SSE2-NEXT: paddd %xmm4, %xmm9
1083 ; SSE2-NEXT: paddd %xmm4, %xmm2
1084 ; SSE2-NEXT: paddd %xmm4, %xmm5
1085 ; SSE2-NEXT: paddd %xmm4, %xmm0
1086 ; SSE2-NEXT: paddd %xmm4, %xmm6
1087 ; SSE2-NEXT: paddd %xmm4, %xmm3
11001088 ; SSE2-NEXT: paddd %xmm4, %xmm7
1101 ; SSE2-NEXT: paddd %xmm4, %xmm3
1102 ; SSE2-NEXT: paddd %xmm4, %xmm6
1103 ; SSE2-NEXT: paddd %xmm4, %xmm0
1104 ; SSE2-NEXT: paddd %xmm4, %xmm5
1105 ; SSE2-NEXT: paddd %xmm4, %xmm2
1106 ; SSE2-NEXT: paddd %xmm4, %xmm13
11071089 ; SSE2-NEXT: paddd %xmm4, %xmm1
1090 ; SSE2-NEXT: psrld $1, %xmm1
1091 ; SSE2-NEXT: psrld $1, %xmm7
11081092 ; SSE2-NEXT: psrld $1, %xmm3
1109 ; SSE2-NEXT: psrld $1, %xmm7
1093 ; SSE2-NEXT: psrld $1, %xmm6
1094 ; SSE2-NEXT: psrld $1, %xmm0
1095 ; SSE2-NEXT: psrld $1, %xmm5
1096 ; SSE2-NEXT: psrld $1, %xmm2
1097 ; SSE2-NEXT: psrld $1, %xmm9
11101098 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
1099 ; SSE2-NEXT: pand %xmm4, %xmm9
1100 ; SSE2-NEXT: pand %xmm4, %xmm2
1101 ; SSE2-NEXT: packuswb %xmm9, %xmm2
1102 ; SSE2-NEXT: pand %xmm4, %xmm5
1103 ; SSE2-NEXT: pand %xmm4, %xmm0
1104 ; SSE2-NEXT: packuswb %xmm5, %xmm0
1105 ; SSE2-NEXT: packuswb %xmm2, %xmm0
1106 ; SSE2-NEXT: pand %xmm4, %xmm6
1107 ; SSE2-NEXT: pand %xmm4, %xmm3
1108 ; SSE2-NEXT: packuswb %xmm6, %xmm3
11111109 ; SSE2-NEXT: pand %xmm4, %xmm7
1112 ; SSE2-NEXT: pand %xmm4, %xmm3
1113 ; SSE2-NEXT: packuswb %xmm7, %xmm3
1114 ; SSE2-NEXT: psrld $1, %xmm0
1115 ; SSE2-NEXT: psrld $1, %xmm6
1116 ; SSE2-NEXT: pand %xmm4, %xmm6
1117 ; SSE2-NEXT: pand %xmm4, %xmm0
1118 ; SSE2-NEXT: packuswb %xmm6, %xmm0
1119 ; SSE2-NEXT: packuswb %xmm3, %xmm0
1120 ; SSE2-NEXT: psrld $1, %xmm2
1121 ; SSE2-NEXT: psrld $1, %xmm5
1122 ; SSE2-NEXT: pand %xmm4, %xmm5
1123 ; SSE2-NEXT: pand %xmm4, %xmm2
1124 ; SSE2-NEXT: packuswb %xmm5, %xmm2
1125 ; SSE2-NEXT: psrld $1, %xmm1
1126 ; SSE2-NEXT: psrld $1, %xmm13
1127 ; SSE2-NEXT: pand %xmm4, %xmm13
11281110 ; SSE2-NEXT: pand %xmm4, %xmm1
1129 ; SSE2-NEXT: packuswb %xmm13, %xmm1
1130 ; SSE2-NEXT: packuswb %xmm2, %xmm1
1111 ; SSE2-NEXT: packuswb %xmm7, %xmm1
1112 ; SSE2-NEXT: packuswb %xmm3, %xmm1
11311113 ; SSE2-NEXT: movdqu %xmm1, (%rax)
11321114 ; SSE2-NEXT: movdqu %xmm0, (%rax)
11331115 ; SSE2-NEXT: retq
15111493 define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) {
15121494 ; SSE2-LABEL: avg_v16i16_2:
15131495 ; SSE2: # BB#0:
1514 ; SSE2-NEXT: movdqa (%rdi), %xmm4
1515 ; SSE2-NEXT: movdqa 16(%rdi), %xmm5
1496 ; SSE2-NEXT: movdqa (%rdi), %xmm2
1497 ; SSE2-NEXT: movdqa 16(%rdi), %xmm4
15161498 ; SSE2-NEXT: movdqa (%rsi), %xmm0
15171499 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1
1518 ; SSE2-NEXT: pxor %xmm6, %xmm6
1519 ; SSE2-NEXT: movdqa %xmm4, %xmm8
1520 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
1521 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
1522 ; SSE2-NEXT: movdqa %xmm5, %xmm7
1523 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
1524 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
1500 ; SSE2-NEXT: pxor %xmm5, %xmm5
1501 ; SSE2-NEXT: movdqa %xmm2, %xmm6
1502 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
1503 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
1504 ; SSE2-NEXT: movdqa %xmm4, %xmm7
1505 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
1506 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
15251507 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1526 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
1527 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
1508 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
1509 ; SSE2-NEXT: paddd %xmm6, %xmm3
1510 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
1511 ; SSE2-NEXT: paddd %xmm2, %xmm0
15281512 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1529 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
1530 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
1531 ; SSE2-NEXT: paddd %xmm5, %xmm1
1513 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
15321514 ; SSE2-NEXT: paddd %xmm7, %xmm2
1533 ; SSE2-NEXT: paddd %xmm4, %xmm0
1534 ; SSE2-NEXT: paddd %xmm8, %xmm3
1515 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
1516 ; SSE2-NEXT: paddd %xmm4, %xmm1
15351517 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1]
15361518 ; SSE2-NEXT: paddd %xmm4, %xmm3
15371519 ; SSE2-NEXT: paddd %xmm4, %xmm0
15931575 define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) {
15941576 ; SSE2-LABEL: avg_v32i16_2:
15951577 ; SSE2: # BB#0:
1596 ; SSE2-NEXT: movdqa (%rdi), %xmm10
1597 ; SSE2-NEXT: movdqa 16(%rdi), %xmm9
1598 ; SSE2-NEXT: movdqa 32(%rdi), %xmm11
1578 ; SSE2-NEXT: movdqa (%rdi), %xmm4
1579 ; SSE2-NEXT: movdqa 16(%rdi), %xmm11
1580 ; SSE2-NEXT: movdqa 32(%rdi), %xmm10
15991581 ; SSE2-NEXT: movdqa 48(%rdi), %xmm8
1600 ; SSE2-NEXT: movdqa (%rsi), %xmm14
1582 ; SSE2-NEXT: movdqa (%rsi), %xmm9
16011583 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1
16021584 ; SSE2-NEXT: movdqa 32(%rsi), %xmm2
16031585 ; SSE2-NEXT: movdqa 48(%rsi), %xmm3
16041586 ; SSE2-NEXT: pxor %xmm0, %xmm0
1605 ; SSE2-NEXT: movdqa %xmm10, %xmm4
1606 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
1607 ; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
1587 ; SSE2-NEXT: movdqa %xmm4, %xmm6
1588 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
1589 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
1590 ; SSE2-NEXT: movdqa %xmm11, %xmm5
1591 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
1592 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
1593 ; SSE2-NEXT: movdqa %xmm10, %xmm12
1594 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
16081595 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
1609 ; SSE2-NEXT: movdqa %xmm9, %xmm12
1610 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
1611 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
1612 ; SSE2-NEXT: movdqa %xmm11, %xmm15
1613 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
1614 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
16151596 ; SSE2-NEXT: movdqa %xmm8, %xmm13
16161597 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
16171598 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
1618 ; SSE2-NEXT: movdqa %xmm14, %xmm7
1599 ; SSE2-NEXT: movdqa %xmm9, %xmm7
16191600 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
1620 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
1601 ; SSE2-NEXT: paddd %xmm6, %xmm7
1602 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
1603 ; SSE2-NEXT: paddd %xmm4, %xmm9
16211604 ; SSE2-NEXT: movdqa %xmm1, %xmm6
16221605 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
1606 ; SSE2-NEXT: paddd %xmm5, %xmm6
16231607 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1608 ; SSE2-NEXT: paddd %xmm11, %xmm1
16241609 ; SSE2-NEXT: movdqa %xmm2, %xmm5
16251610 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
1611 ; SSE2-NEXT: paddd %xmm12, %xmm5
16261612 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
1613 ; SSE2-NEXT: paddd %xmm10, %xmm2
16271614 ; SSE2-NEXT: movdqa %xmm3, %xmm4
16281615 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
1616 ; SSE2-NEXT: paddd %xmm13, %xmm4
16291617 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
16301618 ; SSE2-NEXT: paddd %xmm8, %xmm3
1631 ; SSE2-NEXT: paddd %xmm13, %xmm4
1632 ; SSE2-NEXT: paddd %xmm11, %xmm2
1633 ; SSE2-NEXT: paddd %xmm15, %xmm5
1634 ; SSE2-NEXT: paddd %xmm9, %xmm1
1635 ; SSE2-NEXT: paddd %xmm12, %xmm6
1636 ; SSE2-NEXT: paddd %xmm10, %xmm14
1637 ; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload
16381619 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1]
16391620 ; SSE2-NEXT: paddd %xmm0, %xmm7
1640 ; SSE2-NEXT: paddd %xmm0, %xmm14
1621 ; SSE2-NEXT: paddd %xmm0, %xmm9
16411622 ; SSE2-NEXT: paddd %xmm0, %xmm6
16421623 ; SSE2-NEXT: paddd %xmm0, %xmm1
16431624 ; SSE2-NEXT: paddd %xmm0, %xmm5
16441625 ; SSE2-NEXT: paddd %xmm0, %xmm2
16451626 ; SSE2-NEXT: paddd %xmm0, %xmm4
16461627 ; SSE2-NEXT: paddd %xmm0, %xmm3
1647 ; SSE2-NEXT: psrld $1, %xmm14
1628 ; SSE2-NEXT: psrld $1, %xmm3
1629 ; SSE2-NEXT: psrld $1, %xmm4
1630 ; SSE2-NEXT: psrld $1, %xmm2
1631 ; SSE2-NEXT: psrld $1, %xmm5
1632 ; SSE2-NEXT: psrld $1, %xmm1
1633 ; SSE2-NEXT: psrld $1, %xmm6
1634 ; SSE2-NEXT: psrld $1, %xmm9
16481635 ; SSE2-NEXT: psrld $1, %xmm7
16491636 ; SSE2-NEXT: pslld $16, %xmm7
16501637 ; SSE2-NEXT: psrad $16, %xmm7
1651 ; SSE2-NEXT: pslld $16, %xmm14
1652 ; SSE2-NEXT: psrad $16, %xmm14
1653 ; SSE2-NEXT: packssdw %xmm7, %xmm14
1654 ; SSE2-NEXT: psrld $1, %xmm1
1655 ; SSE2-NEXT: psrld $1, %xmm6
1638 ; SSE2-NEXT: pslld $16, %xmm9
1639 ; SSE2-NEXT: psrad $16, %xmm9
1640 ; SSE2-NEXT: packssdw %xmm7, %xmm9
16561641 ; SSE2-NEXT: pslld $16, %xmm6
16571642 ; SSE2-NEXT: psrad $16, %xmm6
16581643 ; SSE2-NEXT: pslld $16, %xmm1
16591644 ; SSE2-NEXT: psrad $16, %xmm1
16601645 ; SSE2-NEXT: packssdw %xmm6, %xmm1
1661 ; SSE2-NEXT: psrld $1, %xmm2
1662 ; SSE2-NEXT: psrld $1, %xmm5
16631646 ; SSE2-NEXT: pslld $16, %xmm5
16641647 ; SSE2-NEXT: psrad $16, %xmm5
16651648 ; SSE2-NEXT: pslld $16, %xmm2
16661649 ; SSE2-NEXT: psrad $16, %xmm2
16671650 ; SSE2-NEXT: packssdw %xmm5, %xmm2
1668 ; SSE2-NEXT: psrld $1, %xmm3
1669 ; SSE2-NEXT: psrld $1, %xmm4
16701651 ; SSE2-NEXT: pslld $16, %xmm4
16711652 ; SSE2-NEXT: psrad $16, %xmm4
16721653 ; SSE2-NEXT: pslld $16, %xmm3
16751656 ; SSE2-NEXT: movdqu %xmm3, (%rax)
16761657 ; SSE2-NEXT: movdqu %xmm2, (%rax)
16771658 ; SSE2-NEXT: movdqu %xmm1, (%rax)
1678 ; SSE2-NEXT: movdqu %xmm14, (%rax)
1659 ; SSE2-NEXT: movdqu %xmm9, (%rax)
16791660 ; SSE2-NEXT: retq
16801661 ;
16811662 ; AVX2-LABEL: avg_v32i16_2:
16851666 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
16861667 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
16871668 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1688 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1689 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1690 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1691 ; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3
1692 ; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2
1693 ; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1
16941669 ; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0
1670 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1671 ; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1
1672 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1673 ; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2
1674 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1675 ; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3
16951676 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm4
16961677 ; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0
16971678 ; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1
17221703 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
17231704 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
17241705 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
1725 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
1726 ; AVX512F-NEXT: vpaddd %zmm3, %zmm1, %zmm1
17271706 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0
1707 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
1708 ; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1
17281709 ; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2
17291710 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0
17301711 ; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1
112112 ; CHECK-NOT: mov
113113 ; CHECK: insertps $48
114114 ; CHECK: insertps $48
115 ; CHECK: vaddps
115116 ; CHECK: insertps $48
116117 ; CHECK: insertps $48
117 ; CHECK: vaddps
118118 ; CHECK: vaddps
119119 ; CHECK: vaddps
120120 ; CHECK-NEXT: ret
1212 ; CHECK: # BB#0: # %entry
1313 ; CHECK-NEXT: vcmpgeps %zmm4, %zmm0, %k0
1414 ; CHECK-NEXT: vcmpgeps %zmm4, %zmm1, %k1
15 ; CHECK-NEXT: vcmpgeps %zmm4, %zmm2, %k2
16 ; CHECK-NEXT: vcmpgeps %zmm4, %zmm3, %k3
1715 ; CHECK-NEXT: korw %k1, %k0, %k0
18 ; CHECK-NEXT: korw %k3, %k2, %k1
16 ; CHECK-NEXT: vcmpgeps %zmm4, %zmm2, %k1
17 ; CHECK-NEXT: vcmpgeps %zmm4, %zmm3, %k2
18 ; CHECK-NEXT: korw %k2, %k1, %k1
1919 ; CHECK-NEXT: korw %k1, %k0, %k0
2020 ; CHECK-NEXT: kmovw %k0, %eax
2121 ; CHECK-NEXT: # kill: %AX %AX %EAX
851851 ; CHECK-NEXT: kxorw %k0, %k0, %k1
852852 ; CHECK-NEXT: vmovaps %zmm1, %zmm3
853853 ; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1}
854 ; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm2
854855 ; CHECK-NEXT: movw $1, %ax
855856 ; CHECK-NEXT: kmovd %eax, %k1
856 ; CHECK-NEXT: vmovaps %zmm1, %zmm4
857 ; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm4 {%k1}
857 ; CHECK-NEXT: vmovaps %zmm1, %zmm3
858 ; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1}
858859 ; CHECK-NEXT: movw $220, %ax
859860 ; CHECK-NEXT: kmovd %eax, %k1
860861 ; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
861 ; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm0
862 ; CHECK-NEXT: vaddps %zmm4, %zmm1, %zmm1
863 ; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
862 ; CHECK-NEXT: vaddps %zmm3, %zmm1, %zmm0
863 ; CHECK-NEXT: vaddps %zmm2, %zmm0, %zmm0
864864 ; CHECK-NEXT: retq
865865 %res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 -1, i32 4)
866866 %res1 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 0, i32 4)
88 ; CHECK-NEXT: vbroadcastss %xmm0, %zmm2
99 ; CHECK-NEXT: kmovw %edi, %k1
1010 ; CHECK-NEXT: vbroadcastss %xmm0, %zmm1 {%k1}
11 ; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm1
1112 ; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
12 ; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm1
1313 ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
1414 ; CHECK-NEXT: retq
1515
2929 ; CHECK-NEXT: vbroadcastsd %xmm0, %zmm2
3030 ; CHECK-NEXT: kmovw %edi, %k1
3131 ; CHECK-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1}
32 ; CHECK-NEXT: vaddpd %zmm1, %zmm2, %zmm1
3233 ; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
33 ; CHECK-NEXT: vaddpd %zmm1, %zmm2, %zmm1
3434 ; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
3535 ; CHECK-NEXT: retq
3636
5050 ; CHECK-NEXT: vpbroadcastd %xmm0, %zmm2
5151 ; CHECK-NEXT: kmovw %edi, %k1
5252 ; CHECK-NEXT: vpbroadcastd %xmm0, %zmm1 {%k1}
53 ; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm1
5354 ; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z}
54 ; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm1
5555 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
5656 ; CHECK-NEXT: retq
5757 %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 -1)
7070 ; CHECK-NEXT: vpbroadcastq %xmm0, %zmm2
7171 ; CHECK-NEXT: kmovw %edi, %k1
7272 ; CHECK-NEXT: vpbroadcastq %xmm0, %zmm1 {%k1}
73 ; CHECK-NEXT: vpaddq %zmm1, %zmm2, %zmm1
7374 ; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
74 ; CHECK-NEXT: vpaddq %zmm1, %zmm2, %zmm1
7575 ; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
7676 ; CHECK-NEXT: retq
7777 %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 -1)
9090 ; CHECK-NEXT: vmovsldup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
9191 ; CHECK-NEXT: kmovw %edi, %k1
9292 ; CHECK-NEXT: vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
93 ; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1
9394 ; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
94 ; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1
9595 ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
9696 ; CHECK-NEXT: retq
9797 %res = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
110110 ; CHECK-NEXT: vmovshdup {{.*#+}} zmm2 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
111111 ; CHECK-NEXT: kmovw %edi, %k1
112112 ; CHECK-NEXT: vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
113 ; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1
113114 ; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
114 ; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1
115115 ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
116116 ; CHECK-NEXT: retq
117117 %res = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
130130 ; CHECK-NEXT: vmovddup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6]
131131 ; CHECK-NEXT: kmovw %edi, %k1
132132 ; CHECK-NEXT: vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
133 ; CHECK-NEXT: vaddpd %zmm2, %zmm1, %zmm1
133134 ; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
134 ; CHECK-NEXT: vaddpd %zmm2, %zmm1, %zmm1
135135 ; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
136136 ; CHECK-NEXT: retq
137137 %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 %x2)
670670 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
671671 ; CHECK-NEXT: kmovw %edi, %k1
672672 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
673 ; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm2
673674 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
674 ; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1
675 ; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
675 ; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
676676 ; CHECK-NEXT: retq
677677 %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
678678 %res1 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
16151615 ; CHECK-NEXT: vshufpd {{.*#+}} zmm3 = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
16161616 ; CHECK-NEXT: kmovw %edi, %k1
16171617 ; CHECK-NEXT: vshufpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
1618 ; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm2
16181619 ; CHECK-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
1619 ; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm1
1620 ; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
1620 ; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0
16211621 ; CHECK-NEXT: retq
16221622 %res = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 %x4)
16231623 %res1 = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1)
20302030 ; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm2
20312031 ; CHECK-NEXT: kmovw %esi, %k1
20322032 ; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm1 {%k1}
2033 ; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
20332034 ; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm0 {%k1} {z}
2034 ; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
20352035 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
20362036 ; CHECK-NEXT: retq
20372037 %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 4, <8 x i64> %x2, i8 %x3)
20502050 ; CHECK-NEXT: vpsrld $4, %zmm0, %zmm2
20512051 ; CHECK-NEXT: kmovw %esi, %k1
20522052 ; CHECK-NEXT: vpsrld $4, %zmm0, %zmm1 {%k1}
2053 ; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
20532054 ; CHECK-NEXT: vpsrld $4, %zmm0, %zmm0 {%k1} {z}
2054 ; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
20552055 ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
20562056 ; CHECK-NEXT: retq
20572057 %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 4, <16 x i32> %x2, i16 %x3)
26502650 ; CHECK-NEXT: kmovw %edi, %k1
26512651 ; CHECK-NEXT: vpermilps {{.*#+}} zmm2 {%k1} = zmm0[2,3,0,1,7,6,5,4,9,8,11,10,12,13,14,15]
26522652 ; CHECK-NEXT: vpermilps {{.*#+}} zmm1 {%k1} {z} = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
2653 ; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm1
26532654 ; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12]
2654 ; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm1
26552655 ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
26562656 ; CHECK-NEXT: retq
26572657 %res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> , <16 x float> %x2, i16 %x3)
29882988 ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm3
29892989 ; CHECK-NEXT: kmovw %edi, %k1
29902990 ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
2991 ; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm2
29912992 ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z}
2992 ; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm1
2993 ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
2993 ; CHECK-NEXT: vaddps %zmm2, %zmm0, %zmm0
29942994 ; CHECK-NEXT: retq
29952995 %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 %x4)
29962996 %res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 -1)
30093009 ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm3
30103010 ; CHECK-NEXT: kmovw %edi, %k1
30113011 ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
3012 ; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm2
30123013 ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z}
3013 ; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm1
3014 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
3014 ; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
30153015 ; CHECK-NEXT: retq
30163016 %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4)
30173017 %res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1)
30293029 ; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm3
30303030 ; CHECK-NEXT: kmovw %edi, %k1
30313031 ; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
3032 ; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm2
30323033 ; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z}
3033 ; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm1
3034 ; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
3034 ; CHECK-NEXT: vaddpd %zmm2, %zmm0, %zmm0
30353035 ; CHECK-NEXT: retq
30363036 %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 %x4)
30373037 %res1 = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 -1)
30493049 ; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3
30503050 ; CHECK-NEXT: kmovw %edi, %k1
30513051 ; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
3052 ; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm2
30523053 ; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z}
3053 ; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1
3054 ; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
3054 ; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
30553055 ; CHECK-NEXT: retq
30563056 %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4)
30573057 %res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1)
478478 define i64 @test_x86_avx512_cvtsd2usi64(<2 x double> %a0) {
479479 ; CHECK-LABEL: test_x86_avx512_cvtsd2usi64:
480480 ; CHECK: ## BB#0:
481 ; CHECK-NEXT: vcvtsd2usi %xmm0, %rcx
482 ; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %rax
483 ; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %rdx
481 ; CHECK-NEXT: vcvtsd2usi %xmm0, %rax
482 ; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %rcx
483 ; CHECK-NEXT: addq %rax, %rcx
484 ; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %rax
484485 ; CHECK-NEXT: addq %rcx, %rax
485 ; CHECK-NEXT: addq %rdx, %rax
486486 ; CHECK-NEXT: retq
487487
488488 %res = call i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double> %a0, i32 4)
497497 define i64 @test_x86_avx512_cvtsd2si64(<2 x double> %a0) {
498498 ; CHECK-LABEL: test_x86_avx512_cvtsd2si64:
499499 ; CHECK: ## BB#0:
500 ; CHECK-NEXT: vcvtsd2si %xmm0, %rcx
501 ; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %rax
502 ; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %rdx
500 ; CHECK-NEXT: vcvtsd2si %xmm0, %rax
501 ; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %rcx
502 ; CHECK-NEXT: addq %rax, %rcx
503 ; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %rax
503504 ; CHECK-NEXT: addq %rcx, %rax
504 ; CHECK-NEXT: addq %rdx, %rax
505505 ; CHECK-NEXT: retq
506506
507507 %res = call i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double> %a0, i32 4)
516516 define i64 @test_x86_avx512_cvtss2usi64(<4 x float> %a0) {
517517 ; CHECK-LABEL: test_x86_avx512_cvtss2usi64:
518518 ; CHECK: ## BB#0:
519 ; CHECK-NEXT: vcvtss2usi %xmm0, %rcx
520 ; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %rax
521 ; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %rdx
519 ; CHECK-NEXT: vcvtss2usi %xmm0, %rax
520 ; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %rcx
521 ; CHECK-NEXT: addq %rax, %rcx
522 ; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %rax
522523 ; CHECK-NEXT: addq %rcx, %rax
523 ; CHECK-NEXT: addq %rdx, %rax
524524 ; CHECK-NEXT: retq
525525
526526 %res = call i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float> %a0, i32 4)
535535 define i64 @test_x86_avx512_cvtss2si64(<4 x float> %a0) {
536536 ; CHECK-LABEL: test_x86_avx512_cvtss2si64:
537537 ; CHECK: ## BB#0:
538 ; CHECK-NEXT: vcvtss2si %xmm0, %rcx
539 ; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %rax
540 ; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %rdx
538 ; CHECK-NEXT: vcvtss2si %xmm0, %rax
539 ; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %rcx
540 ; CHECK-NEXT: addq %rax, %rcx
541 ; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %rax
541542 ; CHECK-NEXT: addq %rcx, %rax
542 ; CHECK-NEXT: addq %rdx, %rax
543543 ; CHECK-NEXT: retq
544544
545545 %res = call i64 @llvm.x86.avx512.vcvtss2si64(<4 x float> %a0, i32 4)
554554 define i32 @test_x86_avx512_cvtsd2usi32(<2 x double> %a0) {
555555 ; CHECK-LABEL: test_x86_avx512_cvtsd2usi32:
556556 ; CHECK: ## BB#0:
557 ; CHECK-NEXT: vcvtsd2usi %xmm0, %ecx
558 ; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %eax
559 ; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %edx
557 ; CHECK-NEXT: vcvtsd2usi %xmm0, %eax
558 ; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %ecx
559 ; CHECK-NEXT: addl %eax, %ecx
560 ; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %eax
560561 ; CHECK-NEXT: addl %ecx, %eax
561 ; CHECK-NEXT: addl %edx, %eax
562562 ; CHECK-NEXT: retq
563563
564564 %res = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 4)
573573 define i32 @test_x86_avx512_cvtsd2si32(<2 x double> %a0) {
574574 ; CHECK-LABEL: test_x86_avx512_cvtsd2si32:
575575 ; CHECK: ## BB#0:
576 ; CHECK-NEXT: vcvtsd2si %xmm0, %ecx
577 ; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %eax
578 ; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %edx
576 ; CHECK-NEXT: vcvtsd2si %xmm0, %eax
577 ; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %ecx
578 ; CHECK-NEXT: addl %eax, %ecx
579 ; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %eax
579580 ; CHECK-NEXT: addl %ecx, %eax
580 ; CHECK-NEXT: addl %edx, %eax
581581 ; CHECK-NEXT: retq
582582
583583 %res = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 4)
592592 define i32 @test_x86_avx512_cvtss2usi32(<4 x float> %a0) {
593593 ; CHECK-LABEL: test_x86_avx512_cvtss2usi32:
594594 ; CHECK: ## BB#0:
595 ; CHECK-NEXT: vcvtss2usi %xmm0, %ecx
596 ; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %eax
597 ; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %edx
595 ; CHECK-NEXT: vcvtss2usi %xmm0, %eax
596 ; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %ecx
597 ; CHECK-NEXT: addl %eax, %ecx
598 ; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %eax
598599 ; CHECK-NEXT: addl %ecx, %eax
599 ; CHECK-NEXT: addl %edx, %eax
600600 ; CHECK-NEXT: retq
601601
602602 %res = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 4)
611611 define i32 @test_x86_avx512_cvtss2si32(<4 x float> %a0) {
612612 ; CHECK-LABEL: test_x86_avx512_cvtss2si32:
613613 ; CHECK: ## BB#0:
614 ; CHECK-NEXT: vcvtss2si %xmm0, %ecx
615 ; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %eax
616 ; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %edx
614 ; CHECK-NEXT: vcvtss2si %xmm0, %eax
615 ; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %ecx
616 ; CHECK-NEXT: addl %eax, %ecx
617 ; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %eax
617618 ; CHECK-NEXT: addl %ecx, %eax
618 ; CHECK-NEXT: addl %edx, %eax
619619 ; CHECK-NEXT: retq
620620
621621 %res = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 4)
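In the scalar conversion tests above, the adds are now interleaved with the vcvtsd2si/vcvtss2si conversions instead of grouped at the end, so only the running sum and the newest result are live at any point and the third scratch register (%rdx or %edx) drops out. A toy C model of that scheduling effect, using plain casts as stand-ins for the conversion intrinsics (illustrative only):

    /* Both functions return the same value; the second keeps at most two
       scalar temporaries live at once, mirroring the rax/rcx pattern above. */
    long sum_late(double x) {
        long t0 = (long)x;
        long t1 = (long)(x * 2.0);
        long t2 = (long)(x * 4.0);
        return t0 + t1 + t2;      /* three results live before any add */
    }

    long sum_eager(double x) {
        long acc = (long)x;
        acc += (long)(x * 2.0);   /* first result dies as soon as it is used */
        acc += (long)(x * 4.0);
        return acc;
    }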
684684 ; CHECK-NEXT: kmovw %edi, %k1
685685 ; CHECK-NEXT: vcvtps2ph $2, %zmm0, %ymm1 {%k1}
686686 ; CHECK-NEXT: vcvtps2ph $2, %zmm0, %ymm2 {%k1} {z}
687 ; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm1
687688 ; CHECK-NEXT: vcvtps2ph $2, %zmm0, (%rsi)
688 ; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm0
689 ; CHECK-NEXT: vmovdqa %ymm1, %ymm0
689690 ; CHECK-NEXT: retq
690691 %res1 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1)
691692 %res2 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 %mask)
43974398 ; CHECK-NEXT: kmovw %esi, %k1
43984399 ; CHECK-NEXT: vprold $3, %zmm0, %zmm1 {%k1}
43994400 ; CHECK-NEXT: vprold $3, %zmm0, %zmm2 {%k1} {z}
4401 ; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
44004402 ; CHECK-NEXT: vprold $3, %zmm0, %zmm0
4401 ; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
44024403 ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
44034404 ; CHECK-NEXT: retq
44044405 %res = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
44174418 ; CHECK-NEXT: kmovw %esi, %k1
44184419 ; CHECK-NEXT: vprolq $3, %zmm0, %zmm1 {%k1}
44194420 ; CHECK-NEXT: vprolq $3, %zmm0, %zmm2 {%k1} {z}
4421 ; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
44204422 ; CHECK-NEXT: vprolq $3, %zmm0, %zmm0
4421 ; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
44224423 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
44234424 ; CHECK-NEXT: retq
44244425 %res = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
45194520 ; CHECK-NEXT: vfixupimmpd $4, %zmm2, %zmm1, %zmm3 {%k1}
45204521 ; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
45214522 ; CHECK-NEXT: vfixupimmpd $5, %zmm2, %zmm1, %zmm4 {%k1} {z}
4523 ; CHECK-NEXT: vaddpd %zmm4, %zmm3, %zmm3
45224524 ; CHECK-NEXT: vfixupimmpd $3, {sae}, %zmm2, %zmm1, %zmm0
4523 ; CHECK-NEXT: vaddpd %zmm4, %zmm3, %zmm1
4524 ; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
4525 ; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
45254526 ; CHECK-NEXT: retq
45264527 %res = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 4, i8 %x4, i32 4)
45274528 %res1 = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> zeroinitializer, <8 x double> %x1, <8 x i64> %x2, i32 5, i8 %x4, i32 4)
45424543 ; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
45434544 ; CHECK-NEXT: vmovapd %zmm0, %zmm5
45444545 ; CHECK-NEXT: vfixupimmpd $5, %zmm4, %zmm1, %zmm5 {%k1} {z}
4546 ; CHECK-NEXT: vaddpd %zmm5, %zmm3, %zmm3
45454547 ; CHECK-NEXT: vfixupimmpd $2, {sae}, %zmm2, %zmm1, %zmm0
4546 ; CHECK-NEXT: vaddpd %zmm5, %zmm3, %zmm1
4547 ; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
4548 ; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
45484549 ; CHECK-NEXT: retq
45494550 %res = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 %x4, i32 4)
45504551 %res1 = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> zeroinitializer, i32 5, i8 %x4, i32 4)
46114612 ; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
46124613 ; CHECK-NEXT: vmovaps %zmm0, %zmm5
46134614 ; CHECK-NEXT: vfixupimmps $5, %zmm4, %zmm1, %zmm5 {%k1}
4615 ; CHECK-NEXT: vaddps %zmm5, %zmm3, %zmm3
46144616 ; CHECK-NEXT: vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0
4615 ; CHECK-NEXT: vaddps %zmm5, %zmm3, %zmm1
4616 ; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
4617 ; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0
46174618 ; CHECK-NEXT: retq
46184619 %res = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 %x4, i32 4)
46194620 %res1 = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> zeroinitializer, i32 5, i16 %x4, i32 4)
88 ; CHECK-NEXT: Lcfi0:
99 ; CHECK-NEXT: .cfi_def_cfa_offset 16
1010 ; CHECK-NEXT: vpcmpnleud %xmm1, %xmm0, %k0
11 ; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
12 ; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
11 ; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
12 ; CHECK-NEXT: korw %k1, %k0, %k0
1313 ; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
1414 ; CHECK-NEXT: callq _f
1515 ; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
16 ; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
17 ; CHECK-NEXT: korw %k1, %k0, %k0
1816 ; CHECK-NEXT: vpmovm2d %k0, %xmm0
1917 ; CHECK-NEXT: popq %rax
2018 ; CHECK-NEXT: retq
3331 ; CHECK-NEXT: Lcfi1:
3432 ; CHECK-NEXT: .cfi_def_cfa_offset 16
3533 ; CHECK-NEXT: vpcmpnleud %ymm1, %ymm0, %k0
36 ; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
37 ; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
34 ; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k1
35 ; CHECK-NEXT: korb %k1, %k0, %k0
3836 ; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
3937 ; CHECK-NEXT: vzeroupper
4038 ; CHECK-NEXT: callq _f
4139 ; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
42 ; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
43 ; CHECK-NEXT: korb %k1, %k0, %k0
4440 ; CHECK-NEXT: vpmovm2w %k0, %xmm0
4541 ; CHECK-NEXT: popq %rax
4642 ; CHECK-NEXT: retq
5955 ; CHECK-NEXT: Lcfi2:
6056 ; CHECK-NEXT: .cfi_def_cfa_offset 16
6157 ; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
62 ; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
63 ; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
58 ; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
59 ; CHECK-NEXT: korw %k1, %k0, %k0
6460 ; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
6561 ; CHECK-NEXT: vzeroupper
6662 ; CHECK-NEXT: callq _f
6763 ; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
68 ; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
69 ; CHECK-NEXT: korw %k1, %k0, %k0
7064 ; CHECK-NEXT: vpmovm2b %k0, %xmm0
7165 ; CHECK-NEXT: popq %rax
7266 ; CHECK-NEXT: retq
8478 ; CHECK-NEXT: Lcfi3:
8579 ; CHECK-NEXT: .cfi_def_cfa_offset 16
8680 ; CHECK-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0
81 ; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1
82 ; CHECK-NEXT: kord %k1, %k0, %k0
8783 ; CHECK-NEXT: kmovd %k0, {{[0-9]+}}(%rsp) ## 4-byte Spill
88 ; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
89 ; CHECK-NEXT: kmovd %k0, (%rsp) ## 4-byte Spill
9084 ; CHECK-NEXT: vzeroupper
9185 ; CHECK-NEXT: callq _f
9286 ; CHECK-NEXT: kmovd {{[0-9]+}}(%rsp), %k0 ## 4-byte Reload
93 ; CHECK-NEXT: kmovd (%rsp), %k1 ## 4-byte Reload
94 ; CHECK-NEXT: kord %k1, %k0, %k0
9587 ; CHECK-NEXT: vpmovm2b %k0, %ymm0
9688 ; CHECK-NEXT: popq %rax
9789 ; CHECK-NEXT: retq
10597 define <64 x i1> @test_64i1(<64 x i8> %a, <64 x i8> %b) {
10698 ; CHECK-LABEL: test_64i1:
10799 ; CHECK: ## BB#0:
108 ; CHECK-NEXT: subq $24, %rsp
100 ; CHECK-NEXT: pushq %rax
109101 ; CHECK-NEXT: Lcfi4:
110 ; CHECK-NEXT: .cfi_def_cfa_offset 32
102 ; CHECK-NEXT: .cfi_def_cfa_offset 16
111103 ; CHECK-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
112 ; CHECK-NEXT: kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Spill
113 ; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k0
114 ; CHECK-NEXT: kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Spill
104 ; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1
105 ; CHECK-NEXT: korq %k1, %k0, %k0
106 ; CHECK-NEXT: kmovq %k0, (%rsp) ## 8-byte Spill
115107 ; CHECK-NEXT: vzeroupper
116108 ; CHECK-NEXT: callq _f
117 ; CHECK-NEXT: kmovq {{[0-9]+}}(%rsp), %k0 ## 8-byte Reload
118 ; CHECK-NEXT: kmovq {{[0-9]+}}(%rsp), %k1 ## 8-byte Reload
119 ; CHECK-NEXT: korq %k1, %k0, %k0
109 ; CHECK-NEXT: kmovq (%rsp), %k0 ## 8-byte Reload
120110 ; CHECK-NEXT: vpmovm2b %k0, %zmm0
121 ; CHECK-NEXT: addq $24, %rsp
111 ; CHECK-NEXT: popq %rax
122112 ; CHECK-NEXT: retq
123113
124114 %cmp_res = icmp ugt <64 x i8> %a, %b
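In the mask-spill tests above, the kor of the two compare results is now emitted before the call, so only the combined mask is live across it; test_64i1 therefore needs a single 8-byte spill slot and a plain pushq/popq of %rax instead of a 24-byte frame with two kmovq spills. A minimal C analogue of keeping one combined value live across a call rather than two separate ones (the external function g is a hypothetical placeholder for the callq _f in the tests):

    int g(void);   /* hypothetical external call; values needed afterwards must survive it */

    int combine_before_call(unsigned a, unsigned b) {
        unsigned m = a | b;   /* combine first: only m is live across g() */
        g();
        return (int)m;
    }

    int combine_after_call(unsigned a, unsigned b) {
        g();                  /* a and b both stay live across g(): two values to preserve */
        return (int)(a | b);
    }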
795795 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm3
796796 ; AVX512BW-NEXT: kmovd %edi, %k1
797797 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 {%k1}
798 ; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2
798799 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z}
799 ; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1
800 ; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
800 ; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
801801 ; AVX512BW-NEXT: retq
802802 ;
803803 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrl_w_512:
805805 ; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm3
806806 ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
807807 ; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 {%k1}
808 ; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2
808809 ; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z}
809 ; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1
810 ; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
810 ; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
811811 ; AVX512F-32-NEXT: retl
812812 %res = call <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3)
813813 %res1 = call <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 -1)
825825 ; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm2
826826 ; AVX512BW-NEXT: kmovd %esi, %k1
827827 ; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm1 {%k1}
828 ; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1
828829 ; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z}
829 ; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1
830830 ; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
831831 ; AVX512BW-NEXT: retq
832832 ;
835835 ; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm2
836836 ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
837837 ; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm1 {%k1}
838 ; AVX512F-32-NEXT: vpaddw %zmm2, %zmm1, %zmm1
838839 ; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z}
839 ; AVX512F-32-NEXT: vpaddw %zmm2, %zmm1, %zmm1
840840 ; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
841841 ; AVX512F-32-NEXT: retl
842842 %res = call <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
21582158 ; AVX512BW-NEXT: kmovd %edi, %k1
21592159 ; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1}
21602160 ; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm3 {%k1} {z}
2161 ; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2
21612162 ; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm0
2162 ; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1
2163 ; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
2163 ; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
21642164 ; AVX512BW-NEXT: retq
21652165 ;
21662166 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_dbpsadbw_512:
21682168 ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
21692169 ; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1}
21702170 ; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm3 {%k1} {z}
2171 ; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2
21712172 ; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm0
2172 ; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1
2173 ; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
2173 ; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
21742174 ; AVX512F-32-NEXT: retl
21752175 %res = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> %x3, i32 %x4)
21762176 %res1 = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> zeroinitializer, i32 %x4)
24102410 ; AVX512BW-NEXT: kmovd %edi, %k1
24112411 ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1}
24122412 ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm3 {%k1} {z}
2413 ; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2
24132414 ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
2414 ; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1
2415 ; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
2415 ; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
24162416 ; AVX512BW-NEXT: retq
24172417 ;
24182418 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_permvar_hi_512:
24202420 ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
24212421 ; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1}
24222422 ; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm3 {%k1} {z}
2423 ; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2
24232424 ; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm0
2424 ; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1
2425 ; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
2425 ; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
24262426 ; AVX512F-32-NEXT: retl
24272427 %res = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
24282428 %res1 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
88 ; CHECK-NEXT: vpbroadcastb %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x78,0xd0]
99 ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
1010 ; CHECK-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x78,0xc8]
11 ; CHECK-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc9]
1112 ; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x78,0xc0]
12 ; CHECK-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc9]
1313 ; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc1]
1414 ; CHECK-NEXT: retq ## encoding: [0xc3]
1515 %res = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 -1)
2828 ; CHECK-NEXT: vpbroadcastb %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x78,0xd0]
2929 ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
3030 ; CHECK-NEXT: vpbroadcastb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x78,0xc8]
31 ; CHECK-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc9]
3132 ; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x78,0xc0]
32 ; CHECK-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc9]
3333 ; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1]
3434 ; CHECK-NEXT: retq ## encoding: [0xc3]
3535 %res = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1)
4848 ; CHECK-NEXT: vpbroadcastw %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0xd0]
4949 ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
5050 ; CHECK-NEXT: vpbroadcastw %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x79,0xc8]
51 ; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc9]
5152 ; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x79,0xc0]
52 ; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc9]
5353 ; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc1]
5454 ; CHECK-NEXT: retq ## encoding: [0xc3]
5555 %res = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 -1)
6868 ; CHECK-NEXT: vpbroadcastw %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0xd0]
6969 ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
7070 ; CHECK-NEXT: vpbroadcastw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x79,0xc8]
71 ; CHECK-NEXT: vpaddw %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc9]
7172 ; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x79,0xc0]
72 ; CHECK-NEXT: vpaddw %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc9]
7373 ; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1]
7474 ; CHECK-NEXT: retq ## encoding: [0xc3]
7575 %res = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1)
8888 ; CHECK-NEXT: vpbroadcastb %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x78,0xd0]
8989 ; CHECK-NEXT: kmovq %rdi, %k1 ## encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
9090 ; CHECK-NEXT: vpbroadcastb %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x78,0xc8]
91 ; CHECK-NEXT: vpaddb %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc9]
9192 ; CHECK-NEXT: vpbroadcastb %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x78,0xc0]
92 ; CHECK-NEXT: vpaddb %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc9]
9393 ; CHECK-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc1]
9494 ; CHECK-NEXT: retq ## encoding: [0xc3]
9595 %res = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 -1)
108108 ; CHECK-NEXT: vpbroadcastw %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x79,0xd0]
109109 ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
110110 ; CHECK-NEXT: vpbroadcastw %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x79,0xc8]
111 ; CHECK-NEXT: vpaddw %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc9]
111112 ; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x79,0xc0]
112 ; CHECK-NEXT: vpaddw %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc9]
113113 ; CHECK-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc1]
114114 ; CHECK-NEXT: retq ## encoding: [0xc3]
115115 %res = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 -1)
14751475 ; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd1,0xd9]
14761476 ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
14771477 ; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd1,0xd1]
1478 ; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xd3]
14781479 ; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd1,0xc1]
1479 ; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xcb]
1480 ; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1]
1480 ; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2]
14811481 ; CHECK-NEXT: retq ## encoding: [0xc3]
14821482 %res = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
14831483 %res1 = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
14951495 ; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0xd9]
14961496 ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
14971497 ; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd1,0xd1]
1498 ; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xd3]
14981499 ; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd1,0xc1]
1499 ; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xcb]
1500 ; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
1500 ; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
15011501 ; CHECK-NEXT: retq ## encoding: [0xc3]
15021502 %res = call <16 x i16> @llvm.x86.avx512.mask.psrl.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3)
15031503 %res1 = call <16 x i16> @llvm.x86.avx512.mask.psrl.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 -1)
15951595 ; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x71,0xd0,0x03]
15961596 ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
15971597 ; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x71,0xd0,0x03]
1598 ; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xca]
15981599 ; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x71,0xd0,0x03]
1599 ; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xca]
16001600 ; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1]
16011601 ; CHECK-NEXT: retq ## encoding: [0xc3]
16021602 %res = call <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3)
16151615 ; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x71,0xd0,0x03]
16161616 ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
16171617 ; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x71,0xd0,0x03]
1618 ; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xca]
16181619 ; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xd0,0x03]
1619 ; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xca]
16201620 ; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
16211621 ; CHECK-NEXT: retq ## encoding: [0xc3]
16221622 %res = call <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3)
88 ; CHECK-NEXT: vplzcntd %xmm0, %xmm2
99 ; CHECK-NEXT: kmovw %edi, %k1
1010 ; CHECK-NEXT: vplzcntd %xmm0, %xmm1 {%k1}
11 ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
1112 ; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z}
12 ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
1313 ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
1414 ; CHECK-NEXT: retq
1515 %res = call <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2)
66 ; CHECK-NEXT: vplzcntd %xmm0, %xmm2
77 ; CHECK-NEXT: kmovw %edi, %k1
88 ; CHECK-NEXT: vplzcntd %xmm0, %xmm1 {%k1}
9 ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
910 ; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z}
10 ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
1111 ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
1212 ; CHECK-NEXT: retq
1313 %1 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %x0, i1 false)
3838 ; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm2
3939 ; CHECK-NEXT: kmovw %edi, %k1
4040 ; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm1 {%k1}
41 ; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1
4142 ; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm0 {%k1} {z}
42 ; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1
4343 ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
4444 ; CHECK-NEXT: retq
4545 %res = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> %x2, i8 %x3)
403403 ; CHECK-NEXT: kmovw %edi, %k1
404404 ; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
405405 ; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
406 ; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1
406407 ; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
407 ; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1
408408 ; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
409409 ; CHECK-NEXT: retq
410410 %res = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x2.512(<4 x float> %x0, <16 x float> %x2, i16 %x3)
423423 ; CHECK-NEXT: kmovw %edi, %k1
424424 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
425425 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
426 ; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
426427 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
427 ; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
428428 ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
429429 ; CHECK-NEXT: retq
430430 %res = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x2.512(<4 x i32> %x0, <16 x i32> %x2, i16 %x3)
15671567 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01]
15681568 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
15691569 ; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x19,0xc1,0x01]
1570 ; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca]
15701571 ; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc0,0x01]
1571 ; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca]
15721572 ; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0]
15731573 ; CHECK-NEXT: retq ## encoding: [0xc3]
15741574 %res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> %x2, i8 %x3)
15871587 ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01]
15881588 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
15891589 ; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x18,0xd1,0x01]
1590 ; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xd3]
15901591 ; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x18,0xc1,0x01]
1591 ; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xcb]
1592 ; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1]
1592 ; CHECK-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc2]
15931593 ; CHECK-NEXT: retq ## encoding: [0xc3]
15941594 %res = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 %x4)
15951595 %res1 = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 -1)
16071607 ; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01]
16081608 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
16091609 ; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x38,0xd1,0x01]
1610 ; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xd3]
16101611 ; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x38,0xc1,0x01]
1611 ; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xcb]
1612 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
1612 ; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0]
16131613 ; CHECK-NEXT: retq ## encoding: [0xc3]
16141614 %res = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 %x4)
16151615 %res1 = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 -1)
634634 ; CHECK-NEXT: ## ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1]
635635 ; CHECK-NEXT: vbroadcastf32x2 %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x19,0xd0]
636636 ; CHECK-NEXT: ## ymm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
637 ; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca]
637638 ; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xc0]
638 ; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca]
639639 ; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0]
640640 ; CHECK-NEXT: retq ## encoding: [0xc3]
641641 %res = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x2.256(<4 x float> %x0, <8 x float> %x2, i8 %x3)
679679 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
680680 ; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x59,0xc8]
681681 ; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x59,0xd0]
682 ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
682683 ; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xc0]
683 ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
684684 ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
685685 ; CHECK-NEXT: retq ## encoding: [0xc3]
686686 %res = call <4 x i32> @llvm.x86.avx512.mask.broadcasti32x2.128(<4 x i32> %x0, <4 x i32> %x2, i8 %x3)
1212 ; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1}
1313 ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
1414 ; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1}
15 ; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
1516 ; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z}
16 ; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
1717 ; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1
1818 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
1919 ; CHECK-NEXT: retq
4040 ; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1} {z}
4141 ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
4242 ; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} {z}
43 ; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
4344 ; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z}
44 ; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
4545 ; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1
4646 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
4747 ; CHECK-NEXT: retq
6868 ; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1}
6969 ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
7070 ; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1}
71 ; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
7172 ; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z}
72 ; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
7373 ; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1
7474 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
7575 ; CHECK-NEXT: retq
9696 ; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1} {z}
9797 ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
9898 ; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1} {z}
99 ; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
99100 ; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z}
100 ; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
101101 ; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1
102102 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
103103 ; CHECK-NEXT: retq
1313 ; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1}
1414 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
1515 ; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1}
16 ; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
1617 ; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z}
17 ; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
1818 ; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1
1919 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
2020 ; CHECK-NEXT: retq
4141 ; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1}
4242 ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2
4343 ; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1}
44 ; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
4445 ; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z}
45 ; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
4646 ; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1
4747 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
4848 ; CHECK-NEXT: retq
6969 ; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1} {z}
7070 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
7171 ; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1} {z}
72 ; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
7273 ; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z}
73 ; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
7474 ; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1
7575 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
7676 ; CHECK-NEXT: retq
9797 ; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1} {z}
9898 ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2
9999 ; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1} {z}
100 ; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
100101 ; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z}
101 ; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
102102 ; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1
103103 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
104104 ; CHECK-NEXT: retq
125125 ; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1}
126126 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
127127 ; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1}
128 ; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
128129 ; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z}
129 ; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
130130 ; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1
131131 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
132132 ; CHECK-NEXT: retq
153153 ; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1}
154154 ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2
155155 ; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1}
156 ; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
156157 ; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z}
157 ; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
158158 ; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1
159159 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
160160 ; CHECK-NEXT: retq
181181 ; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1} {z}
182182 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
183183 ; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1} {z}
184 ; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
184185 ; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z}
185 ; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
186186 ; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1
187187 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
188188 ; CHECK-NEXT: retq
209209 ; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1} {z}
210210 ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2
211211 ; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1} {z}
212 ; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
212213 ; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z}
213 ; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
214214 ; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1
215215 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
216216 ; CHECK-NEXT: retq
2929 ; CHECK-NEXT: vpbroadcastd %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0xd0]
3030 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
3131 ; CHECK-NEXT: vpbroadcastd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x58,0xc8]
32 ; CHECK-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc9]
3233 ; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x58,0xc0]
33 ; CHECK-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc9]
3434 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1]
3535 ; CHECK-NEXT: retq ## encoding: [0xc3]
3636 %res = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1)
4949 ; CHECK-NEXT: vpbroadcastq %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0xd0]
5050 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
5151 ; CHECK-NEXT: vpbroadcastq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x59,0xc8]
52 ; CHECK-NEXT: vpaddq %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc9]
5253 ; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x59,0xc0]
53 ; CHECK-NEXT: vpaddq %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc9]
5454 ; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc1]
5555 ; CHECK-NEXT: retq ## encoding: [0xc3]
5656 %res = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> %x1,i8 -1)
6969 ; CHECK-NEXT: vpbroadcastq %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd0]
7070 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
7171 ; CHECK-NEXT: vpbroadcastq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x59,0xc8]
72 ; CHECK-NEXT: vpaddq %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc9]
7273 ; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x59,0xc0]
73 ; CHECK-NEXT: vpaddq %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc9]
7474 ; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc1]
7575 ; CHECK-NEXT: retq ## encoding: [0xc3]
7676 %res = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> %x1,i8 -1)
8989 ; CHECK-NEXT: vbroadcastsd %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xd0]
9090 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
9191 ; CHECK-NEXT: vbroadcastsd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x19,0xc8]
92 ; CHECK-NEXT: vaddpd %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc9]
9293 ; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x19,0xc0]
93 ; CHECK-NEXT: vaddpd %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc9]
9494 ; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1]
9595 ; CHECK-NEXT: retq ## encoding: [0xc3]
9696 %res = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> zeroinitializer, i8 -1)
109109 ; CHECK-NEXT: vbroadcastss %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0xd0]
110110 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
111111 ; CHECK-NEXT: vbroadcastss %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x18,0xc8]
112 ; CHECK-NEXT: vaddps %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc9]
112113 ; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x18,0xc0]
113 ; CHECK-NEXT: vaddps %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc9]
114114 ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1]
115115 ; CHECK-NEXT: retq ## encoding: [0xc3]
116116 %res = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 -1)
129129 ; CHECK-NEXT: vbroadcastss %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xd0]
130130 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
131131 ; CHECK-NEXT: vbroadcastss %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x18,0xc8]
132 ; CHECK-NEXT: vaddps %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc9]
132133 ; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x18,0xc0]
133 ; CHECK-NEXT: vaddps %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc9]
134134 ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
135135 ; CHECK-NEXT: retq ## encoding: [0xc3]
136136 %res = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1)
151151 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
152152 ; CHECK-NEXT: vmovsldup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x12,0xc8]
153153 ; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0,0,2,2]
154 ; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
154155 ; CHECK-NEXT: vmovsldup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x12,0xc0]
155156 ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0,0,2,2]
156 ; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
157157 ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
158158 ; CHECK-NEXT: retq ## encoding: [0xc3]
159159 %res = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2)
174174 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
175175 ; CHECK-NEXT: vmovsldup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x12,0xc8]
176176 ; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6]
177 ; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca]
177178 ; CHECK-NEXT: vmovsldup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x12,0xc0]
178179 ; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
179 ; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca]
180180 ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1]
181181 ; CHECK-NEXT: retq ## encoding: [0xc3]
182182 %res = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2)
197197 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
198198 ; CHECK-NEXT: vmovshdup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x16,0xc8]
199199 ; CHECK-NEXT: ## xmm1 {%k1} = xmm0[1,1,3,3]
200 ; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
200201 ; CHECK-NEXT: vmovshdup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x16,0xc0]
201202 ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[1,1,3,3]
202 ; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
203203 ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
204204 ; CHECK-NEXT: retq ## encoding: [0xc3]
205205 %res = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2)
220220 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
221221 ; CHECK-NEXT: vmovshdup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x16,0xc8]
222222 ; CHECK-NEXT: ## ymm1 {%k1} = ymm0[1,1,3,3,5,5,7,7]
223 ; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca]
223224 ; CHECK-NEXT: vmovshdup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x16,0xc0]
224225 ; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
225 ; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca]
226226 ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1]
227227 ; CHECK-NEXT: retq ## encoding: [0xc3]
228228 %res = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2)
242242 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
243243 ; CHECK-NEXT: vmovddup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x12,0xc8]
244244 ; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0,0]
245 ; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca]
245246 ; CHECK-NEXT: vmovddup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x12,0xc0]
246247 ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0,0]
247 ; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca]
248248 ; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc1]
249249 ; CHECK-NEXT: retq ## encoding: [0xc3]
250250 %res = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 %x2)
265265 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
266266 ; CHECK-NEXT: vmovddup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x12,0xc8]
267267 ; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,0,2,2]
268 ; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xca]
268269 ; CHECK-NEXT: vmovddup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0xa9,0x12,0xc0]
269270 ; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,0,2,2]
270 ; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xca]
271271 ; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1]
272272 ; CHECK-NEXT: retq ## encoding: [0xc3]
273273 %res = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 %x2)
32083208 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
32093209 ; CHECK-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xc6,0xd1,0x01]
32103210 ; CHECK-NEXT: ## xmm2 {%k1} = xmm0[1],xmm1[0]
3211 ; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xd3]
32113212 ; CHECK-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xc6,0xc1,0x01]
32123213 ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[1],xmm1[0]
3213 ; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xcb]
3214 ; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc1]
3214 ; CHECK-NEXT: vaddpd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc2]
32153215 ; CHECK-NEXT: retq ## encoding: [0xc3]
32163216 %res = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 1, <2 x double> %x3, i8 %x4)
32173217 %res1 = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 1, <2 x double> %x3, i8 -1)
35393539 ; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd3,0xd9]
35403540 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
35413541 ; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xd3,0xd1]
3542 ; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xd3]
35423543 ; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xd3,0xc1]
3543 ; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xcb]
3544 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
3544 ; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0]
35453545 ; CHECK-NEXT: retq ## encoding: [0xc3]
35463546 %res = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
35473547 %res1 = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
35593559 ; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd3,0xd9]
35603560 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
35613561 ; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xd3,0xd1]
3562 ; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xd3]
35623563 ; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xd3,0xc1]
3563 ; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xcb]
3564 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
3564 ; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0]
35653565 ; CHECK-NEXT: retq ## encoding: [0xc3]
35663566 %res = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3)
35673567 %res1 = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 -1)
35793579 ; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd2,0xd9]
35803580 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
35813581 ; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd2,0xd1]
3582 ; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xd3]
35823583 ; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd2,0xc1]
3583 ; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xcb]
3584 ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
3584 ; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
35853585 ; CHECK-NEXT: retq ## encoding: [0xc3]
35863586 %res = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
35873587 %res1 = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
35993599 ; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd2,0xd9]
36003600 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
36013601 ; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd2,0xd1]
3602 ; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xd3]
36023603 ; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd2,0xc1]
3603 ; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xcb]
3604 ; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1]
3604 ; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2]
36053605 ; CHECK-NEXT: retq ## encoding: [0xc3]
36063606 %res = call <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3)
36073607 %res1 = call <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 -1)
37193719 ; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x73,0xd0,0x03]
37203720 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
37213721 ; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x73,0xd0,0x03]
3722 ; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca]
37223723 ; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x73,0xd0,0x03]
3723 ; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca]
37243724 ; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc1]
37253725 ; CHECK-NEXT: retq ## encoding: [0xc3]
37263726 %res = call <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3)
37393739 ; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x73,0xd0,0x03]
37403740 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
37413741 ; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x73,0xd0,0x03]
3742 ; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca]
37423743 ; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x73,0xd0,0x03]
3743 ; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca]
37443744 ; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc1]
37453745 ; CHECK-NEXT: retq ## encoding: [0xc3]
37463746 %res = call <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3)
37593759 ; CHECK-NEXT: vpsrld $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x72,0xd0,0x03]
37603760 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
37613761 ; CHECK-NEXT: vpsrld $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xd0,0x03]
3762 ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
37623763 ; CHECK-NEXT: vpsrld $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x72,0xd0,0x03]
3763 ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
37643764 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1]
37653765 ; CHECK-NEXT: retq ## encoding: [0xc3]
37663766 %res = call <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3)
37793779 ; CHECK-NEXT: vpsrld $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x72,0xd0,0x03]
37803780 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
37813781 ; CHECK-NEXT: vpsrld $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xd0,0x03]
3782 ; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca]
37823783 ; CHECK-NEXT: vpsrld $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x72,0xd0,0x03]
3783 ; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca]
37843784 ; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1]
37853785 ; CHECK-NEXT: retq ## encoding: [0xc3]
37863786 %res = call <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3)
46414641 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
46424642 ; CHECK-NEXT: valignd $2, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x03,0xd1,0x02]
46434643 ; CHECK-NEXT: ## xmm2 {%k1} = xmm1[2,3],xmm0[0,1]
4644 ; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xd3]
46444645 ; CHECK-NEXT: valignd $2, %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0x89,0x03,0xc1,0x02]
46454646 ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm1[2,3],xmm0[0,1]
4646 ; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xcb]
4647 ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
4647 ; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
46484648 ; CHECK-NEXT: retq ## encoding: [0xc3]
46494649 %res = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 2, <4 x i32> %x3, i8 %x4)
46504650 %res1 = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 2, <4 x i32> %x3, i8 -1)
48164816 ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01]
48174817 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
48184818 ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x18,0xd1,0x01]
4819 ; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xd3]
48194820 ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xc1,0x01]
4820 ; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xcb]
4821 ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1]
4821 ; CHECK-NEXT: vaddps %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc2]
48224822 ; CHECK-NEXT: retq ## encoding: [0xc3]
48234823 %res = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 %x4)
48244824 %res1 = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 -1)
48364836 ; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01]
48374837 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
48384838 ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x38,0xd1,0x01]
4839 ; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xd3]
48394840 ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xc1,0x01]
4840 ; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xcb]
4841 ; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1]
4841 ; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2]
48424842 ; CHECK-NEXT: retq ## encoding: [0xc3]
48434843
48444844 %res = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 %x4)
43674367 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
43684368 ; CHECK-NEXT: vprold $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xc8,0x03]
43694369 ; CHECK-NEXT: vprold $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x72,0xc8,0x03]
4370 ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
43704371 ; CHECK-NEXT: vprold $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc8,0x03]
4371 ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
43724372 ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
43734373 ; CHECK-NEXT: retq ## encoding: [0xc3]
43744374 %res = call <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3)
43874387 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
43884388 ; CHECK-NEXT: vprold $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xc8,0x03]
43894389 ; CHECK-NEXT: vprold $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x72,0xc8,0x03]
4390 ; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca]
43904391 ; CHECK-NEXT: vprold $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc8,0x03]
4391 ; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca]
43924392 ; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0]
43934393 ; CHECK-NEXT: retq ## encoding: [0xc3]
43944394 %res = call <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3)
44074407 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
44084408 ; CHECK-NEXT: vprolq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x72,0xc8,0x03]
44094409 ; CHECK-NEXT: vprolq $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0x89,0x72,0xc8,0x03]
4410 ; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca]
44104411 ; CHECK-NEXT: vprolq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc8,0x03]
4411 ; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca]
44124412 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
44134413 ; CHECK-NEXT: retq ## encoding: [0xc3]
44144414 %res = call <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3)
44274427 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
44284428 ; CHECK-NEXT: vprolq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x72,0xc8,0x03]
44294429 ; CHECK-NEXT: vprolq $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xa9,0x72,0xc8,0x03]
4430 ; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca]
44304431 ; CHECK-NEXT: vprolq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc8,0x03]
4431 ; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca]
44324432 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
44334433 ; CHECK-NEXT: retq ## encoding: [0xc3]
44344434 %res = call <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3)
45274527 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
45284528 ; CHECK-NEXT: vprord $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xc0,0x03]
45294529 ; CHECK-NEXT: vprord $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x72,0xc0,0x03]
4530 ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
45304531 ; CHECK-NEXT: vprord $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc0,0x03]
4531 ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
45324532 ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
45334533 ; CHECK-NEXT: retq ## encoding: [0xc3]
45344534 %res = call <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3)
45474547 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
45484548 ; CHECK-NEXT: vprord $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xc0,0x03]
45494549 ; CHECK-NEXT: vprord $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x72,0xc0,0x03]
4550 ; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca]
45504551 ; CHECK-NEXT: vprord $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc0,0x03]
4551 ; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca]
45524552 ; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0]
45534553 ; CHECK-NEXT: retq ## encoding: [0xc3]
45544554 %res = call <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3)
45674567 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
45684568 ; CHECK-NEXT: vprorq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x72,0xc0,0x03]
45694569 ; CHECK-NEXT: vprorq $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0x89,0x72,0xc0,0x03]
4570 ; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca]
45704571 ; CHECK-NEXT: vprorq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc0,0x03]
4571 ; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca]
45724572 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
45734573 ; CHECK-NEXT: retq ## encoding: [0xc3]
45744574 %res = call <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3)
45874587 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
45884588 ; CHECK-NEXT: vprorq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x72,0xc0,0x03]
45894589 ; CHECK-NEXT: vprorq $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xa9,0x72,0xc0,0x03]
4590 ; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca]
45904591 ; CHECK-NEXT: vprorq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc0,0x03]
4591 ; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca]
45924592 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
45934593 ; CHECK-NEXT: retq ## encoding: [0xc3]
45944594 %res = call <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3)
46894689 ; CHECK-NEXT: vfixupimmpd $5, %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf3,0xf5,0x09,0x54,0xda,0x05]
46904690 ; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
46914691 ; CHECK-NEXT: vfixupimmpd $4, %xmm2, %xmm1, %xmm4 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0x89,0x54,0xe2,0x04]
4692 ; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xdc]
46924693 ; CHECK-NEXT: vfixupimmpd $3, %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf3,0xf5,0x08,0x54,0xc2,0x03]
4693 ; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xcc]
4694 ; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0]
4694 ; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xc0]
46954695 ; CHECK-NEXT: retq ## encoding: [0xc3]
46964696 %res = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1,<2 x i64> %x2, i32 5, i8 %x4)
46974697 %res1 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> zeroinitializer, <2 x double> %x1, <2 x i64> %x2, i32 4, i8 %x4)
47314731 ; CHECK-NEXT: vfixupimmpd $4, %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf3,0xf5,0x29,0x54,0xda,0x04]
47324732 ; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4]
47334733 ; CHECK-NEXT: vfixupimmpd $5, %ymm2, %ymm1, %ymm4 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xe2,0x05]
4734 ; CHECK-NEXT: vaddpd %ymm4, %ymm3, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xdc]
47344735 ; CHECK-NEXT: vfixupimmpd $3, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0xf5,0x28,0x54,0xc2,0x03]
4735 ; CHECK-NEXT: vaddpd %ymm4, %ymm3, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xcc]
4736 ; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0]
4736 ; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0]
47374737 ; CHECK-NEXT: retq ## encoding: [0xc3]
47384738 %res = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 4, i8 %x4)
47394739 %res1 = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> zeroinitializer, <4 x double> %x1, <4 x i64> %x2 , i32 5, i8 %x4)
47544754 ; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4]
47554755 ; CHECK-NEXT: vmovapd %ymm0, %ymm5 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xe8]
47564756 ; CHECK-NEXT: vfixupimmpd $4, %ymm4, %ymm1, %ymm5 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xec,0x04]
4757 ; CHECK-NEXT: vaddpd %ymm5, %ymm3, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xdd]
47574758 ; CHECK-NEXT: vfixupimmpd $3, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0xf5,0x28,0x54,0xc2,0x03]
4758 ; CHECK-NEXT: vaddpd %ymm5, %ymm3, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xcd]
4759 ; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0]
4759 ; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0]
47604760 ; CHECK-NEXT: retq ## encoding: [0xc3]
47614761 %res = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 5, i8 %x4)
47624762 %res1 = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> zeroinitializer, i32 4, i8 %x4)
404404 ; AVX1-NEXT: vpsrad $24, %xmm2, %xmm2
405405 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
406406 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
407 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
407408 ; AVX1-NEXT: vpsllq $56, %xmm1, %xmm1
408 ; AVX1-NEXT: vpsrad $31, %xmm1, %xmm4
409 ; AVX1-NEXT: vpsrad $31, %xmm1, %xmm3
409410 ; AVX1-NEXT: vpsrad $24, %xmm1, %xmm1
410411 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
411 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
412 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
412413 ; AVX1-NEXT: vpsllq $56, %xmm0, %xmm0
413 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm4
414 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3
414415 ; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0
415416 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
416 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
417 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
417418 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
418 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1
419 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
419 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
420420 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
421421 ; AVX1-NEXT: andl $1, %eax
422422 ; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
438438 ; AVX2-NEXT: vpsrad $24, %xmm2, %xmm2
439439 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
440440 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3]
441 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
441442 ; AVX2-NEXT: vpsllq $56, %xmm1, %xmm1
442 ; AVX2-NEXT: vpsrad $31, %xmm1, %xmm4
443 ; AVX2-NEXT: vpsrad $31, %xmm1, %xmm3
443444 ; AVX2-NEXT: vpsrad $24, %xmm1, %xmm1
444445 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
445 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3]
446 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3]
446447 ; AVX2-NEXT: vpsllq $56, %xmm0, %xmm0
447 ; AVX2-NEXT: vpsrad $31, %xmm0, %xmm4
448 ; AVX2-NEXT: vpsrad $31, %xmm0, %xmm3
448449 ; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0
449450 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
450 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3]
451 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3]
451452 ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
452 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1
453 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
453 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
454454 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
455455 ; AVX2-NEXT: andl $1, %eax
456456 ; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
558558 ; AVX1-NEXT: vpsrad $16, %xmm2, %xmm2
559559 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
560560 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
561 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
561562 ; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1
562 ; AVX1-NEXT: vpsrad $31, %xmm1, %xmm4
563 ; AVX1-NEXT: vpsrad $31, %xmm1, %xmm3
563564 ; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1
564565 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
565 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
566 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
566567 ; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0
567 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm4
568 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3
568569 ; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0
569570 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
570 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
571 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
571572 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
572 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1
573 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
573 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
574574 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
575575 ; AVX1-NEXT: andl $1, %eax
576576 ; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
592592 ; AVX2-NEXT: vpsrad $16, %xmm2, %xmm2
593593 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
594594 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3]
595 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
595596 ; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1
596 ; AVX2-NEXT: vpsrad $31, %xmm1, %xmm4
597 ; AVX2-NEXT: vpsrad $31, %xmm1, %xmm3
597598 ; AVX2-NEXT: vpsrad $16, %xmm1, %xmm1
598599 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
599 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3]
600 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3]
600601 ; AVX2-NEXT: vpsllq $48, %xmm0, %xmm0
601 ; AVX2-NEXT: vpsrad $31, %xmm0, %xmm4
602 ; AVX2-NEXT: vpsrad $31, %xmm0, %xmm3
602603 ; AVX2-NEXT: vpsrad $16, %xmm0, %xmm0
603604 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
604 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3]
605 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3]
605606 ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
606 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1
607 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
607 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
608608 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
609609 ; AVX2-NEXT: andl $1, %eax
610610 ; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
702702 ; AVX1-NEXT: vpsrad $31, %xmm2, %xmm4
703703 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
704704 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
705 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
705706 ; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
706 ; AVX1-NEXT: vpsrad $31, %xmm1, %xmm4
707 ; AVX1-NEXT: vpsrad $31, %xmm1, %xmm3
707708 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
708 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
709 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
709710 ; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
710 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm4
711 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3
711712 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
712 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
713 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
713714 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
714 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1
715 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
715 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
716716 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
717717 ; AVX1-NEXT: andl $1, %eax
718718 ; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
732732 ; AVX2-NEXT: vpsrad $31, %xmm2, %xmm4
733733 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
734734 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3]
735 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
735736 ; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1
736 ; AVX2-NEXT: vpsrad $31, %xmm1, %xmm4
737 ; AVX2-NEXT: vpsrad $31, %xmm1, %xmm3
737738 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
738 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3]
739 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3]
739740 ; AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
740 ; AVX2-NEXT: vpsrad $31, %xmm0, %xmm4
741 ; AVX2-NEXT: vpsrad $31, %xmm0, %xmm3
741742 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
742 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3]
743 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3]
743744 ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
744 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1
745 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
745 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
746746 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
747747 ; AVX2-NEXT: andl $1, %eax
748748 ; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
891891 ; SSE2-SSSE3-NEXT: psrad $24, %xmm3
892892 ; SSE2-SSSE3-NEXT: pslld $24, %xmm2
893893 ; SSE2-SSSE3-NEXT: psrad $24, %xmm2
894 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
894895 ; SSE2-SSSE3-NEXT: pslld $24, %xmm1
895896 ; SSE2-SSSE3-NEXT: psrad $24, %xmm1
896897 ; SSE2-SSSE3-NEXT: pslld $24, %xmm0
897898 ; SSE2-SSSE3-NEXT: psrad $24, %xmm0
898899 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
899 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
900 ; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2
901 ; SSE2-SSSE3-NEXT: movd %xmm2, %eax
902 ; SSE2-SSSE3-NEXT: andl $1, %eax
903 ; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
904 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
900 ; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
905901 ; SSE2-SSSE3-NEXT: movd %xmm0, %eax
906902 ; SSE2-SSSE3-NEXT: andl $1, %eax
907903 ; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
908 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
909 ; SSE2-SSSE3-NEXT: movd %xmm0, %eax
910 ; SSE2-SSSE3-NEXT: andl $1, %eax
911 ; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
912 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
904 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
905 ; SSE2-SSSE3-NEXT: movd %xmm1, %eax
906 ; SSE2-SSSE3-NEXT: andl $1, %eax
907 ; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
908 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
909 ; SSE2-SSSE3-NEXT: movd %xmm1, %eax
910 ; SSE2-SSSE3-NEXT: andl $1, %eax
911 ; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
912 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
913913 ; SSE2-SSSE3-NEXT: movd %xmm0, %eax
914914 ; SSE2-SSSE3-NEXT: andl $1, %eax
915915 ; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
922922 ; AVX12-NEXT: vpsrad $24, %xmm3, %xmm3
923923 ; AVX12-NEXT: vpslld $24, %xmm2, %xmm2
924924 ; AVX12-NEXT: vpsrad $24, %xmm2, %xmm2
925 ; AVX12-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2
925926 ; AVX12-NEXT: vpslld $24, %xmm1, %xmm1
926927 ; AVX12-NEXT: vpsrad $24, %xmm1, %xmm1
927928 ; AVX12-NEXT: vpslld $24, %xmm0, %xmm0
928929 ; AVX12-NEXT: vpsrad $24, %xmm0, %xmm0
929930 ; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
930 ; AVX12-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm1
931 ; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
931 ; AVX12-NEXT: vpand %xmm2, %xmm0, %xmm0
932932 ; AVX12-NEXT: vpextrd $3, %xmm0, %eax
933933 ; AVX12-NEXT: andl $1, %eax
934934 ; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
974974 ; SSE2-SSSE3-NEXT: psrad $16, %xmm3
975975 ; SSE2-SSSE3-NEXT: pslld $16, %xmm2
976976 ; SSE2-SSSE3-NEXT: psrad $16, %xmm2
977 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
977978 ; SSE2-SSSE3-NEXT: pslld $16, %xmm1
978979 ; SSE2-SSSE3-NEXT: psrad $16, %xmm1
979980 ; SSE2-SSSE3-NEXT: pslld $16, %xmm0
980981 ; SSE2-SSSE3-NEXT: psrad $16, %xmm0
981982 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
982 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
983 ; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2
984 ; SSE2-SSSE3-NEXT: movd %xmm2, %eax
985 ; SSE2-SSSE3-NEXT: andl $1, %eax
986 ; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
987 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
983 ; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
988984 ; SSE2-SSSE3-NEXT: movd %xmm0, %eax
989985 ; SSE2-SSSE3-NEXT: andl $1, %eax
990986 ; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
991 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
992 ; SSE2-SSSE3-NEXT: movd %xmm0, %eax
993 ; SSE2-SSSE3-NEXT: andl $1, %eax
994 ; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
995 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
987 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
988 ; SSE2-SSSE3-NEXT: movd %xmm1, %eax
989 ; SSE2-SSSE3-NEXT: andl $1, %eax
990 ; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
991 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
992 ; SSE2-SSSE3-NEXT: movd %xmm1, %eax
993 ; SSE2-SSSE3-NEXT: andl $1, %eax
994 ; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
995 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
996996 ; SSE2-SSSE3-NEXT: movd %xmm0, %eax
997997 ; SSE2-SSSE3-NEXT: andl $1, %eax
998998 ; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
10051005 ; AVX12-NEXT: vpsrad $16, %xmm3, %xmm3
10061006 ; AVX12-NEXT: vpslld $16, %xmm2, %xmm2
10071007 ; AVX12-NEXT: vpsrad $16, %xmm2, %xmm2
1008 ; AVX12-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2
10081009 ; AVX12-NEXT: vpslld $16, %xmm1, %xmm1
10091010 ; AVX12-NEXT: vpsrad $16, %xmm1, %xmm1
10101011 ; AVX12-NEXT: vpslld $16, %xmm0, %xmm0
10111012 ; AVX12-NEXT: vpsrad $16, %xmm0, %xmm0
10121013 ; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
1013 ; AVX12-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm1
1014 ; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
1014 ; AVX12-NEXT: vpand %xmm2, %xmm0, %xmm0
10151015 ; AVX12-NEXT: vpextrd $3, %xmm0, %eax
10161016 ; AVX12-NEXT: andl $1, %eax
10171017 ; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
10571057 ; SSE2-SSSE3-NEXT: psraw $8, %xmm3
10581058 ; SSE2-SSSE3-NEXT: psllw $8, %xmm2
10591059 ; SSE2-SSSE3-NEXT: psraw $8, %xmm2
1060 ; SSE2-SSSE3-NEXT: pcmpgtw %xmm3, %xmm2
10601061 ; SSE2-SSSE3-NEXT: psllw $8, %xmm1
10611062 ; SSE2-SSSE3-NEXT: psraw $8, %xmm1
10621063 ; SSE2-SSSE3-NEXT: psllw $8, %xmm0
10631064 ; SSE2-SSSE3-NEXT: psraw $8, %xmm0
10641065 ; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
1065 ; SSE2-SSSE3-NEXT: pcmpgtw %xmm3, %xmm2
1066 ; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2
1067 ; SSE2-SSSE3-NEXT: pextrw $7, %xmm2, %eax
1068 ; SSE2-SSSE3-NEXT: andl $1, %eax
1069 ; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
1070 ; SSE2-SSSE3-NEXT: pextrw $6, %xmm2, %eax
1071 ; SSE2-SSSE3-NEXT: andl $1, %eax
1072 ; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
1073 ; SSE2-SSSE3-NEXT: pextrw $5, %xmm2, %eax
1074 ; SSE2-SSSE3-NEXT: andl $1, %eax
1075 ; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
1076 ; SSE2-SSSE3-NEXT: pextrw $4, %xmm2, %eax
1077 ; SSE2-SSSE3-NEXT: andl $1, %eax
1078 ; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
1079 ; SSE2-SSSE3-NEXT: pextrw $3, %xmm2, %eax
1080 ; SSE2-SSSE3-NEXT: andl $1, %eax
1081 ; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
1082 ; SSE2-SSSE3-NEXT: pextrw $2, %xmm2, %eax
1083 ; SSE2-SSSE3-NEXT: andl $1, %eax
1084 ; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
1085 ; SSE2-SSSE3-NEXT: pextrw $1, %xmm2, %eax
1086 ; SSE2-SSSE3-NEXT: andl $1, %eax
1087 ; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
1088 ; SSE2-SSSE3-NEXT: movd %xmm2, %eax
1066 ; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
1067 ; SSE2-SSSE3-NEXT: pextrw $7, %xmm0, %eax
1068 ; SSE2-SSSE3-NEXT: andl $1, %eax
1069 ; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
1070 ; SSE2-SSSE3-NEXT: pextrw $6, %xmm0, %eax
1071 ; SSE2-SSSE3-NEXT: andl $1, %eax
1072 ; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
1073 ; SSE2-SSSE3-NEXT: pextrw $5, %xmm0, %eax
1074 ; SSE2-SSSE3-NEXT: andl $1, %eax
1075 ; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
1076 ; SSE2-SSSE3-NEXT: pextrw $4, %xmm0, %eax
1077 ; SSE2-SSSE3-NEXT: andl $1, %eax
1078 ; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
1079 ; SSE2-SSSE3-NEXT: pextrw $3, %xmm0, %eax
1080 ; SSE2-SSSE3-NEXT: andl $1, %eax
1081 ; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
1082 ; SSE2-SSSE3-NEXT: pextrw $2, %xmm0, %eax
1083 ; SSE2-SSSE3-NEXT: andl $1, %eax
1084 ; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
1085 ; SSE2-SSSE3-NEXT: pextrw $1, %xmm0, %eax
1086 ; SSE2-SSSE3-NEXT: andl $1, %eax
1087 ; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
1088 ; SSE2-SSSE3-NEXT: movd %xmm0, %eax
10891089 ; SSE2-SSSE3-NEXT: andl $1, %eax
10901090 ; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
10911091 ; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
10971097 ; AVX12-NEXT: vpsraw $8, %xmm3, %xmm3
10981098 ; AVX12-NEXT: vpsllw $8, %xmm2, %xmm2
10991099 ; AVX12-NEXT: vpsraw $8, %xmm2, %xmm2
1100 ; AVX12-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm2
11001101 ; AVX12-NEXT: vpsllw $8, %xmm1, %xmm1
11011102 ; AVX12-NEXT: vpsraw $8, %xmm1, %xmm1
11021103 ; AVX12-NEXT: vpsllw $8, %xmm0, %xmm0
11031104 ; AVX12-NEXT: vpsraw $8, %xmm0, %xmm0
11041105 ; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
1105 ; AVX12-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm1
1106 ; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
1106 ; AVX12-NEXT: vpand %xmm2, %xmm0, %xmm0
11071107 ; AVX12-NEXT: vpextrw $7, %xmm0, %eax
11081108 ; AVX12-NEXT: andl $1, %eax
11091109 ; AVX12-NEXT: movb %al, -{{[0-9]+}}(%rsp)
88 define i32 @test1(i32 %x) nounwind {
99 ; CHECK-LABEL: test1:
1010 ; CHECK: # BB#0:
11 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
12 ; CHECK-NEXT: movl %ecx, %edx
13 ; CHECK-NEXT: andl $16711680, %edx # imm = 0xFF0000
14 ; CHECK-NEXT: movl %ecx, %eax
15 ; CHECK-NEXT: orl $-16777216, %eax # imm = 0xFF000000
16 ; CHECK-NEXT: shll $8, %edx
17 ; CHECK-NEXT: shrl $8, %eax
18 ; CHECK-NEXT: bswapl %ecx
19 ; CHECK-NEXT: shrl $16, %ecx
11 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
12 ; CHECK-NEXT: movl %eax, %ecx
13 ; CHECK-NEXT: andl $16711680, %ecx # imm = 0xFF0000
14 ; CHECK-NEXT: movl %eax, %edx
15 ; CHECK-NEXT: orl $-16777216, %edx # imm = 0xFF000000
16 ; CHECK-NEXT: shll $8, %ecx
17 ; CHECK-NEXT: shrl $8, %edx
18 ; CHECK-NEXT: orl %ecx, %edx
19 ; CHECK-NEXT: bswapl %eax
20 ; CHECK-NEXT: shrl $16, %eax
2021 ; CHECK-NEXT: orl %edx, %eax
21 ; CHECK-NEXT: orl %ecx, %eax
2222 ; CHECK-NEXT: retl
2323 ;
2424 ; CHECK64-LABEL: test1:
2525 ; CHECK64: # BB#0:
26 ; CHECK64-NEXT: movl %edi, %eax
27 ; CHECK64-NEXT: andl $16711680, %eax # imm = 0xFF0000
2628 ; CHECK64-NEXT: movl %edi, %ecx
27 ; CHECK64-NEXT: andl $16711680, %ecx # imm = 0xFF0000
28 ; CHECK64-NEXT: movl %edi, %eax
29 ; CHECK64-NEXT: orl $-16777216, %eax # imm = 0xFF000000
30 ; CHECK64-NEXT: shll $8, %ecx
31 ; CHECK64-NEXT: shrl $8, %eax
29 ; CHECK64-NEXT: orl $-16777216, %ecx # imm = 0xFF000000
30 ; CHECK64-NEXT: shll $8, %eax
31 ; CHECK64-NEXT: shrl $8, %ecx
32 ; CHECK64-NEXT: orl %eax, %ecx
3233 ; CHECK64-NEXT: bswapl %edi
3334 ; CHECK64-NEXT: shrl $16, %edi
34 ; CHECK64-NEXT: orl %ecx, %eax
35 ; CHECK64-NEXT: orl %edi, %eax
35 ; CHECK64-NEXT: orl %ecx, %edi
36 ; CHECK64-NEXT: movl %edi, %eax
3637 ; CHECK64-NEXT: retq
3738 %byte0 = and i32 %x, 255 ; 0x000000ff
3839 %byte1 = and i32 %x, 65280 ; 0x0000ff00
116116 ; FMA3_256-NEXT: vsubpd %ymm5, %ymm1, %ymm2
117117 ; FMA3_256-NEXT: vsubpd %ymm4, %ymm0, %ymm3
118118 ; FMA3_256-NEXT: vaddpd %ymm5, %ymm1, %ymm1
119 ; FMA3_256-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3]
119120 ; FMA3_256-NEXT: vaddpd %ymm4, %ymm0, %ymm0
120121 ; FMA3_256-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3]
121 ; FMA3_256-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3]
122122 ; FMA3_256-NEXT: retq
123123 ;
124124 ; FMA3_512-LABEL: mul_subadd_pd512:
136136 ; FMA4-NEXT: vsubpd %ymm5, %ymm1, %ymm2
137137 ; FMA4-NEXT: vsubpd %ymm4, %ymm0, %ymm3
138138 ; FMA4-NEXT: vaddpd %ymm5, %ymm1, %ymm1
139 ; FMA4-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3]
139140 ; FMA4-NEXT: vaddpd %ymm4, %ymm0, %ymm0
140141 ; FMA4-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3]
141 ; FMA4-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3]
142142 ; FMA4-NEXT: retq
143143 entry:
144144 %AB = fmul <8 x double> %A, %B
156156 ; FMA3_256-NEXT: vsubps %ymm5, %ymm1, %ymm2
157157 ; FMA3_256-NEXT: vsubps %ymm4, %ymm0, %ymm3
158158 ; FMA3_256-NEXT: vaddps %ymm5, %ymm1, %ymm1
159 ; FMA3_256-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
159160 ; FMA3_256-NEXT: vaddps %ymm4, %ymm0, %ymm0
160161 ; FMA3_256-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
161 ; FMA3_256-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
162162 ; FMA3_256-NEXT: retq
163163 ;
164164 ; FMA3_512-LABEL: mul_subadd_ps512:
177177 ; FMA4-NEXT: vsubps %ymm5, %ymm1, %ymm2
178178 ; FMA4-NEXT: vsubps %ymm4, %ymm0, %ymm3
179179 ; FMA4-NEXT: vaddps %ymm5, %ymm1, %ymm1
180 ; FMA4-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
180181 ; FMA4-NEXT: vaddps %ymm4, %ymm0, %ymm0
181182 ; FMA4-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
182 ; FMA4-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
183183 ; FMA4-NEXT: retq
184184 entry:
185185 %AB = fmul <16 x float> %A, %B