llvm.org GIT mirror — llvm / ac9ca3b

Avoid false dependencies of undef machine operands

This patch helps avoid false dependencies on undef registers by updating the machine instructions' undef operand to use a register that the instruction is truly dependent on, or a register with clearance higher than Pref.

Pseudo example:

loop:
  xmm0 = ...
  xmm1 = vcvtsi2sdl eax, xmm0<undef>
  ... = inst xmm0
  jmp loop

In this example, selecting xmm0 as the undef register creates a false dependency between loop iterations. This false dependency cannot be solved by inserting an xor before vcvtsi2sdl, because xmm0 is alive at the point of the vcvtsi2sdl instruction. Selecting a different register instead of xmm0, especially a register that is not used in the loop, eliminates this problem.

Differential Revision: https://reviews.llvm.org/D22466

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@278321 91177308-0d34-0410-b5e6-96231b3b80d8

Marina Yatsina
8 changed file(s) with 320 addition(s) and 243 deletion(s).
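For reference, here is a minimal standalone sketch of the clearance-scan half of the heuristic (the other half, reusing a register the instruction truly depends on, is a straightforward operand copy and can be seen in the hunk below). LiveReg and pickUndefReg are hypothetical simplified stand-ins, not the actual ExecutionDepsFix interfaces:

    // Minimal sketch; simplified hypothetical types, not the real pass.
    #include <vector>

    struct LiveReg { int Def; }; // instruction number of this register's last def

    // "Clearance" is the distance, in instructions, from a register's last def
    // to the current instruction (assumes Def <= CurInstr). The larger it is,
    // the more likely any false dependency on that register has already retired.
    unsigned pickUndefReg(unsigned CurInstr, const std::vector<LiveReg> &LiveRegs,
                          unsigned Pref, unsigned OriginalReg) {
      unsigned BestReg = OriginalReg;
      unsigned BestClearance = 0;
      for (unsigned R = 0; R != LiveRegs.size(); ++R) {
        unsigned Clearance = CurInstr - LiveRegs[R].Def;
        if (Clearance <= BestClearance)
          continue;
        BestReg = R;
        BestClearance = Clearance;
        if (BestClearance > Pref)
          break; // good enough; no need to find the global maximum
      }
      return BestReg;
    }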
202202 void processDefs(MachineInstr*, bool Kill);
203203 void visitSoftInstr(MachineInstr*, unsigned mask);
204204 void visitHardInstr(MachineInstr*, unsigned domain);
205 void pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
206 unsigned Pref);
205207 bool shouldBreakDependence(MachineInstr*, unsigned OpIdx, unsigned Pref);
206208 void processUndefReads(MachineBasicBlock*);
207209 };
472474 processDefs(MI, !DomP.first);
473475 }
474476
477 /// \brief Helps avoid false dependencies on undef registers by updating the
478 /// machine instructions' undef operand to use a register that the instruction
479 /// is truly dependent on, or use a register with clearance higher than Pref.
480 void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
481 unsigned Pref) {
482 MachineOperand &MO = MI->getOperand(OpIdx);
483 assert(MO.isUndef() && "Expected undef machine operand");
484
485 unsigned OriginalReg = MO.getReg();
486
487 // Update only undef operands that are mapped to one register.
488 if (AliasMap[OriginalReg].size() != 1)
489 return;
490
491 // Get the undef operand's register class
492 const TargetRegisterClass *OpRC =
493 TII->getRegClass(MI->getDesc(), OpIdx, TRI, *MF);
494
495 // If the instruction has a true dependency, we can hide the false dependency
496 // behind it.
497 for (MachineOperand &CurrMO : MI->operands()) {
498 if (!CurrMO.isReg() || CurrMO.isDef() || CurrMO.isUndef() ||
499 !OpRC->contains(CurrMO.getReg()))
500 continue;
501 // We found a true dependency - replace the undef register with the true
502 // dependency.
503 MO.setReg(CurrMO.getReg());
504 return;
505 }
506
507 // Go over all registers in the register class and find the register with
508 // max clearance or clearance higher than Pref.
509 unsigned MaxClearance = 0;
510 unsigned MaxClearanceReg = OriginalReg;
511 for (unsigned rx = 0; rx < OpRC->getNumRegs(); ++rx) {
512 unsigned Clearance = CurInstr - LiveRegs[rx].Def;
513 if (Clearance <= MaxClearance)
514 continue;
515 MaxClearance = Clearance;
516 MaxClearanceReg = OpRC->getRegister(rx);
517
518 if (MaxClearance > Pref)
519 break;
520 }
521
522 // Update the operand if we found a register with better clearance.
523 if (MaxClearanceReg != OriginalReg)
524 MO.setReg(MaxClearanceReg);
525 }
526
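Note the early break in the loop above once MaxClearance exceeds Pref: the scan settles for the first register that is good enough rather than searching for the global maximum, which keeps the per-instruction cost of the scan low.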
475527 /// \brief Return true if it makes sense to break dependence on a partial def
476528 /// or undef use.
477529 bool ExeDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx,
509561 unsigned OpNum;
510562 unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI);
511563 if (Pref) {
564 pickBestRegisterForUndef(MI, OpNum, Pref);
512565 if (shouldBreakDependence(MI, OpNum, Pref))
513566 UndefReads.push_back(std::make_pair(MI, OpNum));
514567 }
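The ordering in the hunk above matters: pickBestRegisterForUndef runs before shouldBreakDependence, so the clearance test is applied to the (possibly re-selected) undef register. If the chosen register's clearance already exceeds Pref, no dependency-breaking instruction has to be queued at all.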
6767 UndefRegClearance("undef-reg-clearance",
6868 cl::desc("How many idle instructions we would like before "
6969 "certain undef register reads"),
70 cl::init(64), cl::Hidden);
70 cl::init(128), cl::Hidden);
7171
7272 enum {
7373 // Select which memory operand is being unfolded.
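Because the threshold is exposed as a cl::opt (hidden from -help but still parsed), it can be overridden on the llc command line for experiments, e.g. llc -undef-reg-clearance=64 to restore the previous default.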
1515 ; KNL: ## BB#0:
1616 ; KNL-NEXT: vextracti32x4 $3, %zmm0, %xmm1
1717 ; KNL-NEXT: vpextrq $1, %xmm1, %rax
18 ; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
18 ; KNL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
1919 ; KNL-NEXT: vmovq %xmm1, %rax
20 ; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
20 ; KNL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
2121 ; KNL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
2222 ; KNL-NEXT: vextracti32x4 $2, %zmm0, %xmm2
2323 ; KNL-NEXT: vpextrq $1, %xmm2, %rax
24 ; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm3
24 ; KNL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3
2525 ; KNL-NEXT: vmovq %xmm2, %rax
26 ; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
26 ; KNL-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2
2727 ; KNL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
2828 ; KNL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
2929 ; KNL-NEXT: vextracti32x4 $1, %zmm0, %xmm2
3030 ; KNL-NEXT: vpextrq $1, %xmm2, %rax
31 ; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm3
31 ; KNL-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm3
3232 ; KNL-NEXT: vmovq %xmm2, %rax
33 ; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
33 ; KNL-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2
3434 ; KNL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
3535 ; KNL-NEXT: vpextrq $1, %xmm0, %rax
36 ; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm3
36 ; KNL-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm3
3737 ; KNL-NEXT: vmovq %xmm0, %rax
38 ; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0
39 ; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
38 ; KNL-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm0
4039 ; KNL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
4140 ; KNL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
4241 ; KNL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
5554 ; KNL: ## BB#0:
5655 ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
5756 ; KNL-NEXT: vpextrq $1, %xmm1, %rax
58 ; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
57 ; KNL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
5958 ; KNL-NEXT: vmovq %xmm1, %rax
60 ; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
59 ; KNL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
6160 ; KNL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
6261 ; KNL-NEXT: vpextrq $1, %xmm0, %rax
63 ; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
62 ; KNL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2
6463 ; KNL-NEXT: vmovq %xmm0, %rax
65 ; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0
66 ; KNL-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
64 ; KNL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
6765 ; KNL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
6866 ; KNL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
6967 ; KNL-NEXT: retq
8078 ; KNL-LABEL: sltof2f32:
8179 ; KNL: ## BB#0:
8280 ; KNL-NEXT: vpextrq $1, %xmm0, %rax
83 ; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
81 ; KNL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
8482 ; KNL-NEXT: vmovq %xmm0, %rax
85 ; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0
86 ; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
83 ; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
8784 ; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
88 ; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
85 ; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
8986 ; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
9087 ; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
9188 ; KNL-NEXT: retq
104101 ; KNL: ## BB#0:
105102 ; KNL-NEXT: vmovdqu (%rdi), %ymm0
106103 ; KNL-NEXT: vpextrq $1, %xmm0, %rax
107 ; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
104 ; KNL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
108105 ; KNL-NEXT: vmovq %xmm0, %rax
109 ; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
106 ; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
110107 ; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
111108 ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
112109 ; KNL-NEXT: vmovq %xmm0, %rax
113 ; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
110 ; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
114111 ; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
115112 ; KNL-NEXT: vpextrq $1, %xmm0, %rax
116 ; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0
117 ; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
113 ; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
118114 ; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
119115 ; KNL-NEXT: retq
120116 ;
185181 ; KNL-LABEL: sltof432:
186182 ; KNL: ## BB#0:
187183 ; KNL-NEXT: vpextrq $1, %xmm0, %rax
188 ; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
184 ; KNL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
189185 ; KNL-NEXT: vmovq %xmm0, %rax
190 ; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
186 ; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
191187 ; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
192188 ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
193189 ; KNL-NEXT: vmovq %xmm0, %rax
194 ; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
190 ; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
195191 ; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
196192 ; KNL-NEXT: vpextrq $1, %xmm0, %rax
197 ; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0
198 ; KNL-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
193 ; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
199194 ; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
200195 ; KNL-NEXT: retq
201196 ;
883878 ; KNL-NEXT: movl $-1, %eax
884879 ; KNL-NEXT: movl $0, %edx
885880 ; KNL-NEXT: cmovnel %eax, %edx
886 ; KNL-NEXT: vcvtsi2ssl %edx, %xmm0, %xmm1
881 ; KNL-NEXT: vcvtsi2ssl %edx, %xmm2, %xmm1
887882 ; KNL-NEXT: vmovq %xmm0, %rdx
888883 ; KNL-NEXT: testb $1, %dl
889884 ; KNL-NEXT: cmovnel %eax, %ecx
890 ; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0
891 ; KNL-NEXT: vcvtsi2ssl %ecx, %xmm0, %xmm0
885 ; KNL-NEXT: vcvtsi2ssl %ecx, %xmm2, %xmm0
892886 ; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
893887 ; KNL-NEXT: retq
894888 ;
10901084 ; KNL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
10911085 ; KNL-NEXT: vpextrq $1, %xmm0, %rax
10921086 ; KNL-NEXT: andl $1, %eax
1093 ; KNL-NEXT: vcvtsi2ssl %eax, %xmm0, %xmm1
1087 ; KNL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm1
10941088 ; KNL-NEXT: vmovq %xmm0, %rax
10951089 ; KNL-NEXT: andl $1, %eax
1096 ; KNL-NEXT: vxorps %xmm0, %xmm0, %xmm0
1097 ; KNL-NEXT: vcvtsi2ssl %eax, %xmm0, %xmm0
1090 ; KNL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0
10981091 ; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
10991092 ; KNL-NEXT: retq
11001093 ;
125125 %i = phi i64 [ 1, %entry ], [ %inc, %loop ]
126126 %s1 = phi i64 [ %vx, %entry ], [ %s2, %loop ]
127127 %fi = sitofp i64 %i to double
128 tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"()
128129 %vy = load double, double* %y
129130 %fipy = fadd double %fi, %vy
130131 %iipy = fptosi double %fipy to i64
173174 store double %mul11, double* %arrayidx13, align 8
174175 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
175176 %exitcond = icmp eq i64 %indvars.iv.next, 1024
177 tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"()
176178 br i1 %exitcond, label %for.inc14, label %for.body3
177179
178180 for.inc14: ; preds = %for.body3
192194 ;SSE-NEXT: movsd [[XMM0]],
193195 ;AVX-LABEL:@loopdep3
194196 ;AVX: vxorps [[XMM0:%xmm[0-9]+]], [[XMM0]]
195 ;AVX-NEXT: vcvtsi2sdl {{.*}}, [[XMM0]], [[XMM0]]
197 ;AVX-NEXT: vcvtsi2sdl {{.*}}, [[XMM0]], {{%xmm[0-9]+}}
196198 ;AVX-NEXT: vmulsd {{.*}}, [[XMM0]], [[XMM0]]
197199 ;AVX-NEXT: vmulsd {{.*}}, [[XMM0]], [[XMM0]]
198200 ;AVX-NEXT: vmulsd {{.*}}, [[XMM0]], [[XMM0]]
201203
202204 define double @inlineasmdep(i64 %arg) {
203205 top:
204 tail call void asm sideeffect "", "~{xmm0},~{dirflag},~{fpsr},~{flags}"()
206 tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"()
207 tail call void asm sideeffect "", "~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{dirflag},~{fpsr},~{flags}"()
208 tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"()
209 tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"()
205210 %tmp1 = sitofp i64 %arg to double
206211 ret double %tmp1
207212 ;AVX-LABEL:@inlineasmdep
208213 ;AVX: vxorps [[XMM0:%xmm[0-9]+]], [[XMM0]], [[XMM0]]
209214 ;AVX-NEXT: vcvtsi2sdq {{.*}}, [[XMM0]], {{%xmm[0-9]+}}
210215 }
216
217 ; Make sure we are making a smart choice regarding undef registers and
218 ; hiding the false dependency behind a true dependency
219 define double @truedeps(float %arg) {
220 top:
221 tail call void asm sideeffect "", "~{xmm6},~{dirflag},~{fpsr},~{flags}"()
222 tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"()
223 tail call void asm sideeffect "", "~{xmm4},~{xmm5},~{xmm7},~{dirflag},~{fpsr},~{flags}"()
224 tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"()
225 tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"()
226 %tmp1 = fpext float %arg to double
227 ret double %tmp1
228 ;AVX-LABEL:@truedeps
229 ;AVX-NOT: vxorps
230 ;AVX: vcvtss2sd [[XMM0:%xmm[0-9]+]], [[XMM0]], {{%xmm[0-9]+}}
231 }
232
233 ; Make sure we are making a smart choice regarding undef registers and
234 ; choosing the register with the highest clearance
235 define double @clearence(i64 %arg) {
236 top:
237 tail call void asm sideeffect "", "~{xmm6},~{dirflag},~{fpsr},~{flags}"()
238 tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"()
239 tail call void asm sideeffect "", "~{xmm4},~{xmm5},~{xmm7},~{dirflag},~{fpsr},~{flags}"()
240 tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"()
241 tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"()
242 %tmp1 = sitofp i64 %arg to double
243 ret double %tmp1
244 ;AVX-LABEL:@clearence
245 ;AVX: vxorps [[XMM6:%xmm6]], [[XMM6]], [[XMM6]]
246 ;AVX-NEXT: vcvtsi2sdq {{.*}}, [[XMM6]], {{%xmm[0-9]+}}
247 }
248
249 ; Make sure we are making a smart choice regarding undef registers in order to
250 ; avoid a cyclic dependence on a write to the same register in a previous
251 ; iteration, especially when we cannot zero out the undef register because it
252 ; is alive.
253 define i64 @loopclearence(i64* nocapture %x, double* nocapture %y) nounwind {
254 entry:
255 %vx = load i64, i64* %x
256 br label %loop
257 loop:
258 %i = phi i64 [ 1, %entry ], [ %inc, %loop ]
259 %s1 = phi i64 [ %vx, %entry ], [ %s2, %loop ]
260 %fi = sitofp i64 %i to double
261 tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"()
262 tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"()
263 tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"()
264 %vy = load double, double* %y
265 %fipy = fadd double %fi, %vy
266 %iipy = fptosi double %fipy to i64
267 %s2 = add i64 %s1, %iipy
268 %inc = add nsw i64 %i, 1
269 %exitcond = icmp eq i64 %inc, 156250000
270 br i1 %exitcond, label %ret, label %loop
271 ret:
272 ret i64 %s2
273 ;AVX-LABEL:@loopclearence
274 ;Registers 4-7 are not used and therefore one of them should be chosen
275 ;AVX-NOT: {{%xmm[4-7]}}
276 ;AVX: vcvtsi2sdq {{.*}}, [[XMM4_7:%xmm[4-7]]], {{%xmm[0-9]+}}
277 ;AVX-NOT: [[XMM4_7]]
278 }
2525 ; Copy the result in a temporary.
2626 ; Note: Technically the regalloc could have been smarter and this move not required,
2727 ; which would have hidden the bug.
28 ; CHECK-NEXT: vmovapd %xmm0, [[TMP:%xmm[0-9]+]]
28 ; CHECK: vmovapd %xmm0, [[TMP:%xmm[0-9]+]]
2929 ; Crush xmm0.
3030 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
3131 ; CHECK: movl $339772768, %e[[INDIRECT_CALL2:[a-z]+]]
3636 define double @foo(i64 %arg) {
3737 top:
3838 %tmp = call double inttoptr (i64 339752784 to double (double, double)*)(double 1.000000e+00, double 0.000000e+00)
39 tail call void asm sideeffect "", "x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"(double %tmp)
3940 %tmp1 = sitofp i64 %arg to double
4041 call void inttoptr (i64 339772768 to void (double, double)*)(double %tmp, double %tmp1)
4142 %tmp3 = fadd double %tmp1, %tmp
298298 ; CHECK-F16C-NEXT: movswl (%rsi), %eax
299299 ; CHECK-F16C-NEXT: vmovd %eax, %xmm0
300300 ; CHECK-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
301 ; CHECK-F16C-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm1
301 ; CHECK-F16C-NEXT: vcvtsi2ssl %edi, %xmm1, %xmm1
302302 ; CHECK-F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1
303303 ; CHECK-F16C-NEXT: vcvtph2ps %xmm1, %xmm1
304304 ; CHECK-F16C-NEXT: vaddss %xmm1, %xmm0, %xmm0
3838 ; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
3939 ; AVX1-NEXT: vcmpltpd %xmm1, %xmm0, %xmm2
4040 ; AVX1-NEXT: vpextrq $1, %xmm2, %rax
41 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm3
41 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3
4242 ; AVX1-NEXT: vmovq %xmm2, %rax
43 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
43 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2
4444 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
4545 ; AVX1-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
4646 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
47 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
47 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm1
4848 ; AVX1-NEXT: vmovq %xmm0, %rax
49 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
50 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
49 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm0
5150 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
5251 ; AVX1-NEXT: vsubpd %xmm0, %xmm2, %xmm0
5352 ; AVX1-NEXT: vmovapd %xmm0, (%rdi)
5958 ; AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1
6059 ; AVX2-NEXT: vcmpltpd %xmm1, %xmm0, %xmm2
6160 ; AVX2-NEXT: vpextrq $1, %xmm2, %rax
62 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm3
61 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3
6362 ; AVX2-NEXT: vmovq %xmm2, %rax
64 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
63 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2
6564 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
6665 ; AVX2-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
6766 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
68 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
67 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm1
6968 ; AVX2-NEXT: vmovq %xmm0, %rax
70 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
71 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
69 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm0
7270 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
7371 ; AVX2-NEXT: vsubpd %xmm0, %xmm2, %xmm0
7472 ; AVX2-NEXT: vmovapd %xmm0, (%rdi)
2727 ; AVX-LABEL: sitofp_2i64_to_2f64:
2828 ; AVX: # BB#0:
2929 ; AVX-NEXT: vpextrq $1, %xmm0, %rax
30 ; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
30 ; AVX-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1
3131 ; AVX-NEXT: vmovq %xmm0, %rax
32 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
33 ; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
32 ; AVX-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0
3433 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
3534 ; AVX-NEXT: retq
3635 %cvt = sitofp <2 x i64> %a to <2 x double>
208207 ; AVX1: # BB#0:
209208 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
210209 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax
211 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
210 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
212211 ; AVX1-NEXT: vmovq %xmm1, %rax
213 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
212 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
214213 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
215214 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
216 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
215 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2
217216 ; AVX1-NEXT: vmovq %xmm0, %rax
218 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
219 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
217 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
220218 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
221219 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
222220 ; AVX1-NEXT: retq
225223 ; AVX2: # BB#0:
226224 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
227225 ; AVX2-NEXT: vpextrq $1, %xmm1, %rax
228 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
226 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
229227 ; AVX2-NEXT: vmovq %xmm1, %rax
230 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
228 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
231229 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
232230 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
233 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
231 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2
234232 ; AVX2-NEXT: vmovq %xmm0, %rax
235 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
236 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
233 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
237234 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
238235 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
239236 ; AVX2-NEXT: retq
242239 ; AVX512: # BB#0:
243240 ; AVX512-NEXT: vextracti32x4 $1, %ymm0, %xmm1
244241 ; AVX512-NEXT: vpextrq $1, %xmm1, %rax
245 ; AVX512-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
242 ; AVX512-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
246243 ; AVX512-NEXT: vmovq %xmm1, %rax
247 ; AVX512-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
244 ; AVX512-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
248245 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
249246 ; AVX512-NEXT: vpextrq $1, %xmm0, %rax
250 ; AVX512-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
247 ; AVX512-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2
251248 ; AVX512-NEXT: vmovq %xmm0, %rax
252 ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
253 ; AVX512-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
249 ; AVX512-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
254250 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
255251 ; AVX512-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0
256252 ; AVX512-NEXT: retq
940936 ; AVX-LABEL: sitofp_2i64_to_4f32:
941937 ; AVX: # BB#0:
942938 ; AVX-NEXT: vpextrq $1, %xmm0, %rax
943 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
939 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
944940 ; AVX-NEXT: vmovq %xmm0, %rax
945 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
946 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
941 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
947942 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
948 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
943 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
949944 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
950945 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
951946 ; AVX-NEXT: retq
973968 ; AVX-LABEL: sitofp_4i64_to_4f32_undef:
974969 ; AVX: # BB#0:
975970 ; AVX-NEXT: vpextrq $1, %xmm0, %rax
976 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
971 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
977972 ; AVX-NEXT: vmovq %xmm0, %rax
978 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
979 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
973 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
980974 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
981 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
975 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
982976 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
983977 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
984978 ; AVX-NEXT: retq
11391133 ; AVX1-LABEL: sitofp_4i64_to_4f32:
11401134 ; AVX1: # BB#0:
11411135 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
1142 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
1136 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
11431137 ; AVX1-NEXT: vmovq %xmm0, %rax
1144 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
1138 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
11451139 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
11461140 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
11471141 ; AVX1-NEXT: vmovq %xmm0, %rax
1148 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
1142 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
11491143 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
11501144 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
1151 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
1152 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
1145 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
11531146 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
11541147 ; AVX1-NEXT: vzeroupper
11551148 ; AVX1-NEXT: retq
11571150 ; AVX2-LABEL: sitofp_4i64_to_4f32:
11581151 ; AVX2: # BB#0:
11591152 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
1160 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
1153 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
11611154 ; AVX2-NEXT: vmovq %xmm0, %rax
1162 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
1155 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
11631156 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
11641157 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
11651158 ; AVX2-NEXT: vmovq %xmm0, %rax
1166 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
1159 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
11671160 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
11681161 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
1169 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
1170 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
1162 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
11711163 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
11721164 ; AVX2-NEXT: vzeroupper
11731165 ; AVX2-NEXT: retq
11751167 ; AVX512-LABEL: sitofp_4i64_to_4f32:
11761168 ; AVX512: # BB#0:
11771169 ; AVX512-NEXT: vpextrq $1, %xmm0, %rax
1178 ; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
1170 ; AVX512-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
11791171 ; AVX512-NEXT: vmovq %xmm0, %rax
1180 ; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
1172 ; AVX512-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
11811173 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
11821174 ; AVX512-NEXT: vextracti32x4 $1, %ymm0, %xmm0
11831175 ; AVX512-NEXT: vmovq %xmm0, %rax
1184 ; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
1176 ; AVX512-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
11851177 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
11861178 ; AVX512-NEXT: vpextrq $1, %xmm0, %rax
1187 ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
1188 ; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
1179 ; AVX512-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
11891180 ; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
11901181 ; AVX512-NEXT: retq
11911182 %cvt = sitofp <4 x i64> %a to <4 x float>
13761367 ; VEX-NEXT: testq %rax, %rax
13771368 ; VEX-NEXT: js .LBB38_1
13781369 ; VEX-NEXT: # BB#2:
1379 ; VEX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
1370 ; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
13801371 ; VEX-NEXT: jmp .LBB38_3
13811372 ; VEX-NEXT: .LBB38_1:
13821373 ; VEX-NEXT: shrq %rax
13831374 ; VEX-NEXT: orq %rax, %rcx
1384 ; VEX-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
1375 ; VEX-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1
13851376 ; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1
13861377 ; VEX-NEXT: .LBB38_3:
13871378 ; VEX-NEXT: vmovq %xmm0, %rax
13901381 ; VEX-NEXT: testq %rax, %rax
13911382 ; VEX-NEXT: js .LBB38_4
13921383 ; VEX-NEXT: # BB#5:
1393 ; VEX-NEXT: vxorps %xmm0, %xmm0, %xmm0
1394 ; VEX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
1384 ; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
13951385 ; VEX-NEXT: jmp .LBB38_6
13961386 ; VEX-NEXT: .LBB38_4:
13971387 ; VEX-NEXT: shrq %rax
13981388 ; VEX-NEXT: orq %rax, %rcx
1399 ; VEX-NEXT: vxorps %xmm0, %xmm0, %xmm0
1400 ; VEX-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
1389 ; VEX-NEXT: vcvtsi2ssq %rcx, %xmm2, %xmm0
14011390 ; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0
14021391 ; VEX-NEXT: .LBB38_6:
14031392 ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
14051394 ; VEX-NEXT: testq %rax, %rax
14061395 ; VEX-NEXT: js .LBB38_8
14071396 ; VEX-NEXT: # BB#7:
1408 ; VEX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
1397 ; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
14091398 ; VEX-NEXT: .LBB38_8:
14101399 ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
14111400 ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
14841473 ; VEX-NEXT: testq %rax, %rax
14851474 ; VEX-NEXT: js .LBB39_1
14861475 ; VEX-NEXT: # BB#2:
1487 ; VEX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
1476 ; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
14881477 ; VEX-NEXT: jmp .LBB39_3
14891478 ; VEX-NEXT: .LBB39_1:
14901479 ; VEX-NEXT: shrq %rax
14911480 ; VEX-NEXT: orq %rax, %rcx
1492 ; VEX-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
1481 ; VEX-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1
14931482 ; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1
14941483 ; VEX-NEXT: .LBB39_3:
14951484 ; VEX-NEXT: vmovq %xmm0, %rax
14981487 ; VEX-NEXT: testq %rax, %rax
14991488 ; VEX-NEXT: js .LBB39_4
15001489 ; VEX-NEXT: # BB#5:
1501 ; VEX-NEXT: vxorps %xmm0, %xmm0, %xmm0
1502 ; VEX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
1490 ; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
15031491 ; VEX-NEXT: jmp .LBB39_6
15041492 ; VEX-NEXT: .LBB39_4:
15051493 ; VEX-NEXT: shrq %rax
15061494 ; VEX-NEXT: orq %rax, %rcx
1507 ; VEX-NEXT: vxorps %xmm0, %xmm0, %xmm0
1508 ; VEX-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
1495 ; VEX-NEXT: vcvtsi2ssq %rcx, %xmm2, %xmm0
15091496 ; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0
15101497 ; VEX-NEXT: .LBB39_6:
15111498 ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
15131500 ; VEX-NEXT: testq %rax, %rax
15141501 ; VEX-NEXT: js .LBB39_8
15151502 ; VEX-NEXT: # BB#7:
1516 ; VEX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
1503 ; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
15171504 ; VEX-NEXT: .LBB39_8:
15181505 ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
15191506 ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
17811768 ; AVX1-NEXT: testq %rax, %rax
17821769 ; AVX1-NEXT: js .LBB45_1
17831770 ; AVX1-NEXT: # BB#2:
1784 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
1771 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
17851772 ; AVX1-NEXT: jmp .LBB45_3
17861773 ; AVX1-NEXT: .LBB45_1:
17871774 ; AVX1-NEXT: shrq %rax
17881775 ; AVX1-NEXT: orq %rax, %rcx
1789 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
1776 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1
17901777 ; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1
17911778 ; AVX1-NEXT: .LBB45_3:
17921779 ; AVX1-NEXT: vmovq %xmm0, %rax
17951782 ; AVX1-NEXT: testq %rax, %rax
17961783 ; AVX1-NEXT: js .LBB45_4
17971784 ; AVX1-NEXT: # BB#5:
1798 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
1785 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
17991786 ; AVX1-NEXT: jmp .LBB45_6
18001787 ; AVX1-NEXT: .LBB45_4:
18011788 ; AVX1-NEXT: shrq %rax
18021789 ; AVX1-NEXT: orq %rax, %rcx
1803 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
1790 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm2, %xmm2
18041791 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
18051792 ; AVX1-NEXT: .LBB45_6:
18061793 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
18111798 ; AVX1-NEXT: testq %rax, %rax
18121799 ; AVX1-NEXT: js .LBB45_7
18131800 ; AVX1-NEXT: # BB#8:
1814 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
1801 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
18151802 ; AVX1-NEXT: jmp .LBB45_9
18161803 ; AVX1-NEXT: .LBB45_7:
18171804 ; AVX1-NEXT: shrq %rax
18181805 ; AVX1-NEXT: orq %rax, %rcx
1819 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
1806 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm2
18201807 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
18211808 ; AVX1-NEXT: .LBB45_9:
18221809 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
18261813 ; AVX1-NEXT: testq %rax, %rax
18271814 ; AVX1-NEXT: js .LBB45_10
18281815 ; AVX1-NEXT: # BB#11:
1829 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
1830 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
1816 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
18311817 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
18321818 ; AVX1-NEXT: vzeroupper
18331819 ; AVX1-NEXT: retq
18341820 ; AVX1-NEXT: .LBB45_10:
18351821 ; AVX1-NEXT: shrq %rax
18361822 ; AVX1-NEXT: orq %rax, %rcx
1837 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
1838 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
1823 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm0
18391824 ; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0
18401825 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
18411826 ; AVX1-NEXT: vzeroupper
18491834 ; AVX2-NEXT: testq %rax, %rax
18501835 ; AVX2-NEXT: js .LBB45_1
18511836 ; AVX2-NEXT: # BB#2:
1852 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
1837 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
18531838 ; AVX2-NEXT: jmp .LBB45_3
18541839 ; AVX2-NEXT: .LBB45_1:
18551840 ; AVX2-NEXT: shrq %rax
18561841 ; AVX2-NEXT: orq %rax, %rcx
1857 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
1842 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1
18581843 ; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1
18591844 ; AVX2-NEXT: .LBB45_3:
18601845 ; AVX2-NEXT: vmovq %xmm0, %rax
18631848 ; AVX2-NEXT: testq %rax, %rax
18641849 ; AVX2-NEXT: js .LBB45_4
18651850 ; AVX2-NEXT: # BB#5:
1866 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
1851 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
18671852 ; AVX2-NEXT: jmp .LBB45_6
18681853 ; AVX2-NEXT: .LBB45_4:
18691854 ; AVX2-NEXT: shrq %rax
18701855 ; AVX2-NEXT: orq %rax, %rcx
1871 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
1856 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm2, %xmm2
18721857 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
18731858 ; AVX2-NEXT: .LBB45_6:
18741859 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
18791864 ; AVX2-NEXT: testq %rax, %rax
18801865 ; AVX2-NEXT: js .LBB45_7
18811866 ; AVX2-NEXT: # BB#8:
1882 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
1867 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
18831868 ; AVX2-NEXT: jmp .LBB45_9
18841869 ; AVX2-NEXT: .LBB45_7:
18851870 ; AVX2-NEXT: shrq %rax
18861871 ; AVX2-NEXT: orq %rax, %rcx
1887 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
1872 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm2
18881873 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
18891874 ; AVX2-NEXT: .LBB45_9:
18901875 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
18941879 ; AVX2-NEXT: testq %rax, %rax
18951880 ; AVX2-NEXT: js .LBB45_10
18961881 ; AVX2-NEXT: # BB#11:
1897 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
1898 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
1882 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
18991883 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
19001884 ; AVX2-NEXT: vzeroupper
19011885 ; AVX2-NEXT: retq
19021886 ; AVX2-NEXT: .LBB45_10:
19031887 ; AVX2-NEXT: shrq %rax
19041888 ; AVX2-NEXT: orq %rax, %rcx
1905 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
1906 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
1889 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm0
19071890 ; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0
19081891 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
19091892 ; AVX2-NEXT: vzeroupper
21172100 ; VEX: # BB#0:
21182101 ; VEX-NEXT: vmovdqa (%rdi), %xmm0
21192102 ; VEX-NEXT: vpextrq $1, %xmm0, %rax
2120 ; VEX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
2103 ; VEX-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1
21212104 ; VEX-NEXT: vmovq %xmm0, %rax
2122 ; VEX-NEXT: vxorps %xmm0, %xmm0, %xmm0
2123 ; VEX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
2105 ; VEX-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0
21242106 ; VEX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
21252107 ; VEX-NEXT: retq
21262108 ;
21282110 ; AVX512: # BB#0:
21292111 ; AVX512-NEXT: vmovdqa64 (%rdi), %xmm0
21302112 ; AVX512-NEXT: vpextrq $1, %xmm0, %rax
2131 ; AVX512-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
2113 ; AVX512-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1
21322114 ; AVX512-NEXT: vmovq %xmm0, %rax
2133 ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
2134 ; AVX512-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
2115 ; AVX512-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0
21352116 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
21362117 ; AVX512-NEXT: retq
21372118 %ld = load <2 x i64>, <2 x i64> *%a
22302211 ; AVX1-NEXT: vmovaps (%rdi), %ymm0
22312212 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
22322213 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax
2233 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
2214 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
22342215 ; AVX1-NEXT: vmovq %xmm1, %rax
2235 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
2216 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
22362217 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
22372218 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
2238 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
2219 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2
22392220 ; AVX1-NEXT: vmovq %xmm0, %rax
2240 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
2241 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
2221 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
22422222 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
22432223 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
22442224 ; AVX1-NEXT: retq
22482228 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
22492229 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
22502230 ; AVX2-NEXT: vpextrq $1, %xmm1, %rax
2251 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
2231 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
22522232 ; AVX2-NEXT: vmovq %xmm1, %rax
2253 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
2233 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
22542234 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
22552235 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
2256 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
2236 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2
22572237 ; AVX2-NEXT: vmovq %xmm0, %rax
2258 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
2259 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
2238 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
22602239 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
22612240 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
22622241 ; AVX2-NEXT: retq
22662245 ; AVX512-NEXT: vmovdqa64 (%rdi), %ymm0
22672246 ; AVX512-NEXT: vextracti32x4 $1, %ymm0, %xmm1
22682247 ; AVX512-NEXT: vpextrq $1, %xmm1, %rax
2269 ; AVX512-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
2248 ; AVX512-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
22702249 ; AVX512-NEXT: vmovq %xmm1, %rax
2271 ; AVX512-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
2250 ; AVX512-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
22722251 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
22732252 ; AVX512-NEXT: vpextrq $1, %xmm0, %rax
2274 ; AVX512-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
2253 ; AVX512-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2
22752254 ; AVX512-NEXT: vmovq %xmm0, %rax
2276 ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
2277 ; AVX512-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
2255 ; AVX512-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
22782256 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
22792257 ; AVX512-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0
22802258 ; AVX512-NEXT: retq
27552733 ; AVX1: # BB#0:
27562734 ; AVX1-NEXT: vmovdqa (%rdi), %ymm0
27572735 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
2758 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
2736 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
27592737 ; AVX1-NEXT: vmovq %xmm0, %rax
2760 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
2738 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
27612739 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
27622740 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
27632741 ; AVX1-NEXT: vmovq %xmm0, %rax
2764 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
2742 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
27652743 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
27662744 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
2767 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
2768 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
2745 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
27692746 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
27702747 ; AVX1-NEXT: vzeroupper
27712748 ; AVX1-NEXT: retq
27742751 ; AVX2: # BB#0:
27752752 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
27762753 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
2777 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
2754 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
27782755 ; AVX2-NEXT: vmovq %xmm0, %rax
2779 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
2756 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
27802757 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
27812758 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
27822759 ; AVX2-NEXT: vmovq %xmm0, %rax
2783 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
2760 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
27842761 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
27852762 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
2786 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
2787 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
2763 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
27882764 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
27892765 ; AVX2-NEXT: vzeroupper
27902766 ; AVX2-NEXT: retq
27932769 ; AVX512: # BB#0:
27942770 ; AVX512-NEXT: vmovdqa64 (%rdi), %ymm0
27952771 ; AVX512-NEXT: vpextrq $1, %xmm0, %rax
2796 ; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
2772 ; AVX512-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
27972773 ; AVX512-NEXT: vmovq %xmm0, %rax
2798 ; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
2774 ; AVX512-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
27992775 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
28002776 ; AVX512-NEXT: vextracti32x4 $1, %ymm0, %xmm0
28012777 ; AVX512-NEXT: vmovq %xmm0, %rax
2802 ; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
2778 ; AVX512-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
28032779 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
28042780 ; AVX512-NEXT: vpextrq $1, %xmm0, %rax
2805 ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
2806 ; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
2781 ; AVX512-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
28072782 ; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
28082783 ; AVX512-NEXT: retq
28092784 %ld = load <4 x i64>, <4 x i64> *%a
29112886 ; AVX1-NEXT: vmovdqa (%rdi), %ymm0
29122887 ; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1
29132888 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax
2914 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
2889 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
29152890 ; AVX1-NEXT: vmovq %xmm1, %rax
2916 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
2891 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
29172892 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
29182893 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
29192894 ; AVX1-NEXT: vmovq %xmm1, %rax
2920 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
2895 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
29212896 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
29222897 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax
2923 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
2898 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm1
29242899 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
29252900 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
2926 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
2901 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2
29272902 ; AVX1-NEXT: vmovq %xmm0, %rax
2928 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
2903 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
29292904 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
29302905 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
29312906 ; AVX1-NEXT: vmovq %xmm0, %rax
2932 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
2907 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
29332908 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
29342909 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
2935 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
2936 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
2910 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm0
29372911 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
29382912 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
29392913 ; AVX1-NEXT: retq
29432917 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
29442918 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
29452919 ; AVX2-NEXT: vpextrq $1, %xmm1, %rax
2946 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
2920 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
29472921 ; AVX2-NEXT: vmovq %xmm1, %rax
2948 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
2922 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
29492923 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
29502924 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
29512925 ; AVX2-NEXT: vmovq %xmm1, %rax
2952 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
2926 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
29532927 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
29542928 ; AVX2-NEXT: vpextrq $1, %xmm1, %rax
2955 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
2929 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm1
29562930 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
29572931 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
2958 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
2932 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2
29592933 ; AVX2-NEXT: vmovq %xmm0, %rax
2960 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
2934 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
29612935 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
29622936 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
29632937 ; AVX2-NEXT: vmovq %xmm0, %rax
2964 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
2938 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
29652939 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
29662940 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
2967 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
2968 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
2941 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm0
29692942 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
29702943 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
29712944 ; AVX2-NEXT: retq
29752948 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
29762949 ; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm1
29772950 ; AVX512-NEXT: vpextrq $1, %xmm1, %rax
2978 ; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
2951 ; AVX512-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
29792952 ; AVX512-NEXT: vmovq %xmm1, %rax
2980 ; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
2953 ; AVX512-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1
29812954 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
29822955 ; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm2
29832956 ; AVX512-NEXT: vmovq %xmm2, %rax
2984 ; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
2957 ; AVX512-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
29852958 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
29862959 ; AVX512-NEXT: vpextrq $1, %xmm2, %rax
2987 ; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
2960 ; AVX512-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2
29882961 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
29892962 ; AVX512-NEXT: vpextrq $1, %xmm0, %rax
2990 ; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
2963 ; AVX512-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2
29912964 ; AVX512-NEXT: vmovq %xmm0, %rax
2992 ; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
2965 ; AVX512-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
29932966 ; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
29942967 ; AVX512-NEXT: vextracti32x4 $1, %zmm0, %xmm0
29952968 ; AVX512-NEXT: vmovq %xmm0, %rax
2996 ; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
2969 ; AVX512-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
29972970 ; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
29982971 ; AVX512-NEXT: vpextrq $1, %xmm0, %rax
2999 ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
3000 ; AVX512-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
2972 ; AVX512-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm0
30012973 ; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
30022974 ; AVX512-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0
30032975 ; AVX512-NEXT: retq
31853157 ; AVX1-NEXT: testq %rax, %rax
31863158 ; AVX1-NEXT: js .LBB74_1
31873159 ; AVX1-NEXT: # BB#2:
3188 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
3160 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
31893161 ; AVX1-NEXT: jmp .LBB74_3
31903162 ; AVX1-NEXT: .LBB74_1:
31913163 ; AVX1-NEXT: shrq %rax
31923164 ; AVX1-NEXT: orq %rax, %rcx
3193 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
3165 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1
31943166 ; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1
31953167 ; AVX1-NEXT: .LBB74_3:
31963168 ; AVX1-NEXT: vmovq %xmm0, %rax
31993171 ; AVX1-NEXT: testq %rax, %rax
32003172 ; AVX1-NEXT: js .LBB74_4
32013173 ; AVX1-NEXT: # BB#5:
3202 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
3174 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
32033175 ; AVX1-NEXT: jmp .LBB74_6
32043176 ; AVX1-NEXT: .LBB74_4:
32053177 ; AVX1-NEXT: shrq %rax
32063178 ; AVX1-NEXT: orq %rax, %rcx
3207 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
3179 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm2, %xmm2
32083180 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
32093181 ; AVX1-NEXT: .LBB74_6:
32103182 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
32153187 ; AVX1-NEXT: testq %rax, %rax
32163188 ; AVX1-NEXT: js .LBB74_7
32173189 ; AVX1-NEXT: # BB#8:
3218 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
3190 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
32193191 ; AVX1-NEXT: jmp .LBB74_9
32203192 ; AVX1-NEXT: .LBB74_7:
32213193 ; AVX1-NEXT: shrq %rax
32223194 ; AVX1-NEXT: orq %rax, %rcx
3223 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
3195 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm2
32243196 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
32253197 ; AVX1-NEXT: .LBB74_9:
32263198 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
32303202 ; AVX1-NEXT: testq %rax, %rax
32313203 ; AVX1-NEXT: js .LBB74_10
32323204 ; AVX1-NEXT: # BB#11:
3233 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
3234 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
3205 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
32353206 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
32363207 ; AVX1-NEXT: vzeroupper
32373208 ; AVX1-NEXT: retq
32383209 ; AVX1-NEXT: .LBB74_10:
32393210 ; AVX1-NEXT: shrq %rax
32403211 ; AVX1-NEXT: orq %rax, %rcx
3241 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
3242 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
3212 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm0
32433213 ; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0
32443214 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
32453215 ; AVX1-NEXT: vzeroupper
32543224 ; AVX2-NEXT: testq %rax, %rax
32553225 ; AVX2-NEXT: js .LBB74_1
32563226 ; AVX2-NEXT: # BB#2:
3257 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
3227 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
32583228 ; AVX2-NEXT: jmp .LBB74_3
32593229 ; AVX2-NEXT: .LBB74_1:
32603230 ; AVX2-NEXT: shrq %rax
32613231 ; AVX2-NEXT: orq %rax, %rcx
3262 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
3232 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1
32633233 ; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1
32643234 ; AVX2-NEXT: .LBB74_3:
32653235 ; AVX2-NEXT: vmovq %xmm0, %rax
32683238 ; AVX2-NEXT: testq %rax, %rax
32693239 ; AVX2-NEXT: js .LBB74_4
32703240 ; AVX2-NEXT: # BB#5:
3271 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
3241 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
32723242 ; AVX2-NEXT: jmp .LBB74_6
32733243 ; AVX2-NEXT: .LBB74_4:
32743244 ; AVX2-NEXT: shrq %rax
32753245 ; AVX2-NEXT: orq %rax, %rcx
3276 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
3246 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm2, %xmm2
32773247 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
32783248 ; AVX2-NEXT: .LBB74_6:
32793249 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
32843254 ; AVX2-NEXT: testq %rax, %rax
32853255 ; AVX2-NEXT: js .LBB74_7
32863256 ; AVX2-NEXT: # BB#8:
3287 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
3257 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
32883258 ; AVX2-NEXT: jmp .LBB74_9
32893259 ; AVX2-NEXT: .LBB74_7:
32903260 ; AVX2-NEXT: shrq %rax
32913261 ; AVX2-NEXT: orq %rax, %rcx
3292 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
3262 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm2
32933263 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
32943264 ; AVX2-NEXT: .LBB74_9:
32953265 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
32993269 ; AVX2-NEXT: testq %rax, %rax
33003270 ; AVX2-NEXT: js .LBB74_10
33013271 ; AVX2-NEXT: # BB#11:
3302 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
3303 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
3272 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
33043273 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
33053274 ; AVX2-NEXT: vzeroupper
33063275 ; AVX2-NEXT: retq
33073276 ; AVX2-NEXT: .LBB74_10:
33083277 ; AVX2-NEXT: shrq %rax
33093278 ; AVX2-NEXT: orq %rax, %rcx
3310 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
3311 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
3279 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm0
33123280 ; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0
33133281 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
33143282 ; AVX2-NEXT: vzeroupper
35803548 ; AVX1-NEXT: testq %rax, %rax
35813549 ; AVX1-NEXT: js .LBB78_1
35823550 ; AVX1-NEXT: # BB#2:
3583 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
3551 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
35843552 ; AVX1-NEXT: jmp .LBB78_3
35853553 ; AVX1-NEXT: .LBB78_1:
35863554 ; AVX1-NEXT: shrq %rax
35873555 ; AVX1-NEXT: orq %rax, %rcx
3588 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
3556 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1
35893557 ; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1
35903558 ; AVX1-NEXT: .LBB78_3:
35913559 ; AVX1-NEXT: vmovq %xmm2, %rax
35943562 ; AVX1-NEXT: testq %rax, %rax
35953563 ; AVX1-NEXT: js .LBB78_4
35963564 ; AVX1-NEXT: # BB#5:
3597 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
3565 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
35983566 ; AVX1-NEXT: jmp .LBB78_6
35993567 ; AVX1-NEXT: .LBB78_4:
36003568 ; AVX1-NEXT: shrq %rax
36013569 ; AVX1-NEXT: orq %rax, %rcx
3602 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm3
3570 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm3
36033571 ; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3
36043572 ; AVX1-NEXT: .LBB78_6:
36053573 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
36093577 ; AVX1-NEXT: testq %rax, %rax
36103578 ; AVX1-NEXT: js .LBB78_7
36113579 ; AVX1-NEXT: # BB#8:
3612 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm4
3580 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4
36133581 ; AVX1-NEXT: jmp .LBB78_9
36143582 ; AVX1-NEXT: .LBB78_7:
36153583 ; AVX1-NEXT: shrq %rax
36163584 ; AVX1-NEXT: orq %rax, %rcx
3617 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm4
3585 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm4, %xmm4
36183586 ; AVX1-NEXT: vaddss %xmm4, %xmm4, %xmm4
36193587 ; AVX1-NEXT: .LBB78_9:
36203588 ; AVX1-NEXT: vpextrq $1, %xmm2, %rax
36233591 ; AVX1-NEXT: testq %rax, %rax
36243592 ; AVX1-NEXT: js .LBB78_10
36253593 ; AVX1-NEXT: # BB#11:
3626 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
3594 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2
36273595 ; AVX1-NEXT: jmp .LBB78_12
36283596 ; AVX1-NEXT: .LBB78_10:
36293597 ; AVX1-NEXT: shrq %rax
36303598 ; AVX1-NEXT: orq %rax, %rcx
3631 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
3599 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm5, %xmm2
36323600 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
36333601 ; AVX1-NEXT: .LBB78_12:
36343602 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
36373605 ; AVX1-NEXT: testq %rax, %rax
36383606 ; AVX1-NEXT: js .LBB78_13
36393607 ; AVX1-NEXT: # BB#14:
3640 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm5
3608 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5
36413609 ; AVX1-NEXT: jmp .LBB78_15
36423610 ; AVX1-NEXT: .LBB78_13:
36433611 ; AVX1-NEXT: shrq %rax
36443612 ; AVX1-NEXT: orq %rax, %rcx
3645 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm5
3613 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm5, %xmm5
36463614 ; AVX1-NEXT: vaddss %xmm5, %xmm5, %xmm5
36473615 ; AVX1-NEXT: .LBB78_15:
36483616 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3]
36523620 ; AVX1-NEXT: testq %rax, %rax
36533621 ; AVX1-NEXT: js .LBB78_16
36543622 ; AVX1-NEXT: # BB#17:
3655 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
3623 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3
36563624 ; AVX1-NEXT: jmp .LBB78_18
36573625 ; AVX1-NEXT: .LBB78_16:
36583626 ; AVX1-NEXT: shrq %rax
36593627 ; AVX1-NEXT: orq %rax, %rcx
3660 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm3
3628 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm6, %xmm3
36613629 ; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3
36623630 ; AVX1-NEXT: .LBB78_18:
36633631 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3]
36693637 ; AVX1-NEXT: testq %rax, %rax
36703638 ; AVX1-NEXT: js .LBB78_19
36713639 ; AVX1-NEXT: # BB#20:
3672 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
3673 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm5
3640 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm5
36743641 ; AVX1-NEXT: jmp .LBB78_21
36753642 ; AVX1-NEXT: .LBB78_19:
36763643 ; AVX1-NEXT: shrq %rax
36773644 ; AVX1-NEXT: orq %rax, %rcx
3678 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
3679 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
3645 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm6, %xmm0
36803646 ; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm5
36813647 ; AVX1-NEXT: .LBB78_21:
36823648 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0]
36873653 ; AVX1-NEXT: testq %rax, %rax
36883654 ; AVX1-NEXT: js .LBB78_22
36893655 ; AVX1-NEXT: # BB#23:
3690 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
3656 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2
36913657 ; AVX1-NEXT: jmp .LBB78_24
36923658 ; AVX1-NEXT: .LBB78_22:
36933659 ; AVX1-NEXT: shrq %rax
36943660 ; AVX1-NEXT: orq %rax, %rcx
3695 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
3661 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm6, %xmm2
36963662 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
36973663 ; AVX1-NEXT: .LBB78_24:
36983664 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
37093675 ; AVX2-NEXT: testq %rax, %rax
37103676 ; AVX2-NEXT: js .LBB78_1
37113677 ; AVX2-NEXT: # BB#2:
3712 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
3678 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
37133679 ; AVX2-NEXT: jmp .LBB78_3
37143680 ; AVX2-NEXT: .LBB78_1:
37153681 ; AVX2-NEXT: shrq %rax
37163682 ; AVX2-NEXT: orq %rax, %rcx
3717 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
3683 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm1, %xmm1
37183684 ; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1
37193685 ; AVX2-NEXT: .LBB78_3:
37203686 ; AVX2-NEXT: vmovq %xmm2, %rax
37233689 ; AVX2-NEXT: testq %rax, %rax
37243690 ; AVX2-NEXT: js .LBB78_4
37253691 ; AVX2-NEXT: # BB#5:
3726 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
3692 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
37273693 ; AVX2-NEXT: jmp .LBB78_6
37283694 ; AVX2-NEXT: .LBB78_4:
37293695 ; AVX2-NEXT: shrq %rax
37303696 ; AVX2-NEXT: orq %rax, %rcx
3731 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm3
3697 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm3, %xmm3
37323698 ; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3
37333699 ; AVX2-NEXT: .LBB78_6:
37343700 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
37383704 ; AVX2-NEXT: testq %rax, %rax
37393705 ; AVX2-NEXT: js .LBB78_7
37403706 ; AVX2-NEXT: # BB#8:
3741 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm4
3707 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4
37423708 ; AVX2-NEXT: jmp .LBB78_9
37433709 ; AVX2-NEXT: .LBB78_7:
37443710 ; AVX2-NEXT: shrq %rax
37453711 ; AVX2-NEXT: orq %rax, %rcx
3746 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm4
3712 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm4, %xmm4
37473713 ; AVX2-NEXT: vaddss %xmm4, %xmm4, %xmm4
37483714 ; AVX2-NEXT: .LBB78_9:
37493715 ; AVX2-NEXT: vpextrq $1, %xmm2, %rax
37523718 ; AVX2-NEXT: testq %rax, %rax
37533719 ; AVX2-NEXT: js .LBB78_10
37543720 ; AVX2-NEXT: # BB#11:
3755 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
3721 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2
37563722 ; AVX2-NEXT: jmp .LBB78_12
37573723 ; AVX2-NEXT: .LBB78_10:
37583724 ; AVX2-NEXT: shrq %rax
37593725 ; AVX2-NEXT: orq %rax, %rcx
3760 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
3726 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm5, %xmm2
37613727 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
37623728 ; AVX2-NEXT: .LBB78_12:
37633729 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
37663732 ; AVX2-NEXT: testq %rax, %rax
37673733 ; AVX2-NEXT: js .LBB78_13
37683734 ; AVX2-NEXT: # BB#14:
3769 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm5
3735 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5
37703736 ; AVX2-NEXT: jmp .LBB78_15
37713737 ; AVX2-NEXT: .LBB78_13:
37723738 ; AVX2-NEXT: shrq %rax
37733739 ; AVX2-NEXT: orq %rax, %rcx
3774 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm5
3740 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm5, %xmm5
37753741 ; AVX2-NEXT: vaddss %xmm5, %xmm5, %xmm5
37763742 ; AVX2-NEXT: .LBB78_15:
37773743 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3]
37813747 ; AVX2-NEXT: testq %rax, %rax
37823748 ; AVX2-NEXT: js .LBB78_16
37833749 ; AVX2-NEXT: # BB#17:
3784 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
3750 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3
37853751 ; AVX2-NEXT: jmp .LBB78_18
37863752 ; AVX2-NEXT: .LBB78_16:
37873753 ; AVX2-NEXT: shrq %rax
37883754 ; AVX2-NEXT: orq %rax, %rcx
3789 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm3
3755 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm6, %xmm3
37903756 ; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3
37913757 ; AVX2-NEXT: .LBB78_18:
37923758 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3]
37983764 ; AVX2-NEXT: testq %rax, %rax
37993765 ; AVX2-NEXT: js .LBB78_19
38003766 ; AVX2-NEXT: # BB#20:
3801 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
3802 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm5
3767 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm5
38033768 ; AVX2-NEXT: jmp .LBB78_21
38043769 ; AVX2-NEXT: .LBB78_19:
38053770 ; AVX2-NEXT: shrq %rax
38063771 ; AVX2-NEXT: orq %rax, %rcx
3807 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
3808 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
3772 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm6, %xmm0
38093773 ; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm5
38103774 ; AVX2-NEXT: .LBB78_21:
38113775 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0]
38163780 ; AVX2-NEXT: testq %rax, %rax
38173781 ; AVX2-NEXT: js .LBB78_22
38183782 ; AVX2-NEXT: # BB#23:
3819 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
3783 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2
38203784 ; AVX2-NEXT: jmp .LBB78_24
38213785 ; AVX2-NEXT: .LBB78_22:
38223786 ; AVX2-NEXT: shrq %rax
38233787 ; AVX2-NEXT: orq %rax, %rcx
3824 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
3788 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm6, %xmm2
38253789 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
38263790 ; AVX2-NEXT: .LBB78_24:
38273791 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]