llvm.org GIT mirror llvm / dc4f56d
[X86][SSE] Lower BUILD_VECTOR with repeated elts as BUILD_VECTOR + VECTOR_SHUFFLE

It can be costly to transfer from GPRs to XMM registers, and it can prevent loads from being merged.

This patch splits vXi16/vXi32/vXi64 BUILD_VECTORs that use the same operand in multiple elements into a BUILD_VECTOR with only a single insertion of each of those elements, and then performs a unary shuffle to duplicate the values.

There are a couple of minor regressions this patch unearths due to some missing MOVDDUP/BROADCAST folds that I will address in a future patch.

Note: Now that vector shuffle lowering and combining is pretty good, we should be reusing that instead of duplicating so much in LowerBUILD_VECTOR - this is the first of several patches to address this.

Differential Revision: https://reviews.llvm.org/D31373

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@299387 91177308-0d34-0410-b5e6-96231b3b80d8

Simon Pilgrim, 2 years ago
12 changed files with 144 additions and 105 deletions.
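To make the motivation concrete, here is a minimal, hypothetical C++ sketch (the function name, file name and build flags are illustrative, not taken from the commit or its tests) of the kind of source that hits this path: splatting a 64-bit value on a 32-bit x86 target, where the value arrives in two GPRs.

// Hypothetical illustration only - not part of this commit.
// Compile for 32-bit x86 with AVX, e.g.: clang++ -m32 -mavx -O2 -c splat64.cpp
#include <immintrin.h>

// Broadcast a 64-bit value loaded from memory across a 256-bit vector.
// On 32-bit targets the i64 is split across two GPRs, so the old lowering
// rebuilt the splat with a VMOVD plus three VPINSRDs; with this patch the
// repeated halves are inserted once and then duplicated with a single
// shuffle/broadcast (see @QQ64 and test_mm256_set1_epi64x below).
__m256i splat64(const long long *p) {
  return _mm256_set1_epi64x(*p);
}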
   return SDValue();
 }
 
+// Attempt to lower a build vector of repeated elts as a build vector of unique
+// ops followed by a shuffle.
+static SDValue
+lowerBuildVectorWithRepeatedEltsUsingShuffle(SDValue V, SelectionDAG &DAG,
+                                             const X86Subtarget &Subtarget) {
+  MVT VT = V.getSimpleValueType();
+  unsigned NumElts = VT.getVectorNumElements();
+
+  // TODO - vXi8 insertions+shuffles often cause PSHUFBs which can lead to
+  // excessive/bulky shuffle mask creation.
+  if (VT.getScalarSizeInBits() < 16)
+    return SDValue();
+
+  // Create a list of unique operands to be passed to a build vector and a
+  // shuffle mask describing the repetitions.
+  // TODO - we currently insert the first occurrences in place - sometimes it
+  // might be better to insert them in other locations for shuffle efficiency.
+  bool HasRepeatedElts = false;
+  SmallVector<int, 16> Mask(NumElts, SM_SentinelUndef);
+  SmallVector<SDValue, 16> Uniques(V->op_begin(), V->op_end());
+  for (unsigned i = 0; i != NumElts; ++i) {
+    SDValue Op = Uniques[i];
+    if (Op.isUndef())
+      continue;
+    Mask[i] = i;
+
+    // Zeros can be efficiently repeated, so don't shuffle these.
+    if (X86::isZeroNode(Op))
+      continue;
+
+    // If any repeated operands are found then mark the build vector entry as
+    // undef and setup a copy in the shuffle mask.
+    for (unsigned j = i + 1; j != NumElts; ++j)
+      if (Op == Uniques[j]) {
+        HasRepeatedElts = true;
+        Mask[j] = i;
+        Uniques[j] = DAG.getUNDEF(VT.getScalarType());
+      }
+  }
+
+  if (!HasRepeatedElts)
+    return SDValue();
+
+  SDLoc DL(V);
+  return DAG.getVectorShuffle(VT, DL, DAG.getBuildVector(VT, DL, Uniques),
+                              DAG.getUNDEF(VT), Mask);
+}
+
 /// Custom lower build_vector of v16i8.
 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
                                      unsigned NumNonZero, unsigned NumZero,
   if (IsAllConstants)
     return SDValue();
 
-  // See if we can use a vector load to get all of the elements.
   if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
+    // See if we can use a vector load to get all of the elements.
     SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
     if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, false))
       return LD;
+
+    // Attempt to lower a build vector of repeated elts as single insertions
+    // followed by a shuffle.
+    if (SDValue V =
+            lowerBuildVectorWithRepeatedEltsUsingShuffle(Op, DAG, Subtarget))
+      return V;
   }
 
   // For AVX-length vectors, build the individual 128-bit pieces and use
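As a rough standalone illustration of what the new lowerBuildVectorWithRepeatedEltsUsingShuffle computes (plain containers instead of the SelectionDAG APIs; all names here are hypothetical, and the zero-operand special case above is omitted): for build-vector operands <a, b, a, b> the unique-ops list becomes <a, b, undef, undef> and the shuffle mask becomes [0, 1, 0, 1].

// Hypothetical sketch of the dedup step, not LLVM code: keep the first
// occurrence of each operand, replace later repeats with "undef", and record
// in a mask which lane each repeat should be copied from by the shuffle.
#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

constexpr int SentinelUndef = -1; // stands in for an undef shuffle-mask entry

struct Lowered {
  std::vector<std::string> Uniques; // "" stands in for an undef operand
  std::vector<int> Mask;
};

Lowered dedupBuildVector(const std::vector<std::string> &Ops) {
  Lowered R{Ops, std::vector<int>(Ops.size(), SentinelUndef)};
  for (std::size_t i = 0; i != Ops.size(); ++i) {
    if (R.Uniques[i].empty())
      continue;                // undef operand: leave its mask entry undef
    R.Mask[i] = int(i);        // first occurrence stays in place
    for (std::size_t j = i + 1; j != Ops.size(); ++j)
      if (R.Uniques[j] == R.Uniques[i]) {
        R.Mask[j] = int(i);    // shuffle copies lane j from lane i
        R.Uniques[j].clear();  // drop the repeated insertion
      }
  }
  return R;
}

int main() {
  Lowered R = dedupBuildVector({"a", "b", "a", "b"});
  assert((R.Mask == std::vector<int>{0, 1, 0, 1}));
  assert(R.Uniques[2].empty() && R.Uniques[3].empty());
  return 0;
}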
 define <4 x i64> @test_mm256_set1_epi64x(i64 %a0) nounwind {
 ; X32-LABEL: test_mm256_set1_epi64x:
 ; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: vmovd %ecx, %xmm0
-; X32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; X32-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
-; X32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; X32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; X32-NEXT: retl
 ;
 ; X32-LABEL: A:
 ; X32: ## BB#0: ## %entry
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl (%eax), %ecx
-; X32-NEXT: movl 4(%eax), %eax
-; X32-NEXT: vmovd %ecx, %xmm0
-; X32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; X32-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
-; X32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; X32-NEXT: retl
 ;
 define <4 x i64> @A2(i64* %ptr, i64* %ptr2) nounwind uwtable readnone ssp {
 ; X32-LABEL: A2:
 ; X32: ## BB#0: ## %entry
+; X32-NEXT: pushl %esi
+; X32-NEXT: Lcfi0:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: Lcfi1:
+; X32-NEXT: .cfi_offset %esi, -8
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT: movl (%ecx), %edx
-; X32-NEXT: movl 4(%ecx), %ecx
-; X32-NEXT: movl %ecx, 4(%eax)
+; X32-NEXT: movl 4(%ecx), %esi
+; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: movl %esi, 4(%eax)
 ; X32-NEXT: movl %edx, (%eax)
-; X32-NEXT: vmovd %edx, %xmm0
-; X32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
-; X32-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
-; X32-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0
+; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: popl %esi
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: A2:
 ; X32-LABEL: G:
 ; X32: ## BB#0: ## %entry
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl (%eax), %ecx
-; X32-NEXT: movl 4(%eax), %eax
-; X32-NEXT: vmovd %ecx, %xmm0
-; X32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; X32-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
-; X32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; X32-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: G:
 define <2 x i64> @G2(i64* %ptr, i64* %ptr2) nounwind uwtable readnone ssp {
 ; X32-LABEL: G2:
 ; X32: ## BB#0: ## %entry
+; X32-NEXT: pushl %esi
+; X32-NEXT: Lcfi2:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: Lcfi3:
+; X32-NEXT: .cfi_offset %esi, -8
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT: movl (%ecx), %edx
-; X32-NEXT: movl 4(%ecx), %ecx
-; X32-NEXT: movl %ecx, 4(%eax)
+; X32-NEXT: movl 4(%ecx), %esi
+; X32-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: movl %esi, 4(%eax)
 ; X32-NEXT: movl %edx, (%eax)
-; X32-NEXT: vmovd %edx, %xmm0
-; X32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
-; X32-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
-; X32-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0
+; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; X32-NEXT: popl %esi
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: G2:
 ; X32-LABEL: Q64:
 ; X32: ## BB#0: ## %entry
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl (%eax), %ecx
-; X32-NEXT: movl 4(%eax), %eax
-; X32-NEXT: vmovd %ecx, %xmm0
-; X32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; X32-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
-; X32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; X32-NEXT: vpbroadcastq (%eax), %xmm0
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: Q64:
 ; X32-LABEL: QQ64:
 ; X32: ## BB#0: ## %entry
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl (%eax), %ecx
-; X32-NEXT: movl 4(%eax), %eax
-; X32-NEXT: vmovd %ecx, %xmm0
-; X32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; X32-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
-; X32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
-; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: vbroadcastsd %xmm0, %ymm0
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: QQ64:
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
 ; X32-NEXT: vmovaps %xmm0, (%esp)
-; X32-NEXT: movl (%eax), %ecx
-; X32-NEXT: movl 4(%eax), %eax
-; X32-NEXT: vmovd %ecx, %xmm1
-; X32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
-; X32-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
-; X32-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
+; X32-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; X32-NEXT: vpbroadcastq %xmm1, %xmm1
 ; X32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
 ; X32-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp)
 ; X32-NEXT: addl $60, %esp
 ; X32-NEXT: movl 8(%ebp), %eax
 ; X32-NEXT: vxorps %ymm0, %ymm0, %ymm0
 ; X32-NEXT: vmovaps %ymm0, (%esp)
-; X32-NEXT: movl (%eax), %ecx
-; X32-NEXT: movl 4(%eax), %eax
-; X32-NEXT: vmovd %ecx, %xmm1
-; X32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
-; X32-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
-; X32-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
-; X32-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1
+; X32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X32-NEXT: vbroadcastsd %xmm1, %ymm1
 ; X32-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp)
-; X32-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%esp)
+; X32-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)
 ; X32-NEXT: movl %ebp, %esp
 ; X32-NEXT: popl %ebp
 ; X32-NEXT: vzeroupper
 ;
 
 define <4 x float> @merge_4f32_f32_X0YY(float* %ptr0, float* %ptr1) nounwind uwtable noinline ssp {
-; SSE-LABEL: merge_4f32_f32_X0YY:
-; SSE: # BB#0:
-; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
-; SSE-NEXT: retq
+; SSE2-LABEL: merge_4f32_f32_X0YY:
+; SSE2: # BB#0:
+; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: merge_4f32_f32_X0YY:
+; SSE41: # BB#0:
+; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,mem[0],zero
+; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,2,2]
+; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: merge_4f32_f32_X0YY:
 ; AVX: # BB#0:
 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0]
-; AVX-NEXT: retq
-;
-; X32-SSE-LABEL: merge_4f32_f32_X0YY:
-; X32-SSE: # BB#0:
-; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
-; X32-SSE-NEXT: retl
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,mem[0],zero
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,2]
+; AVX-NEXT: retq
+;
+; X32-SSE1-LABEL: merge_4f32_f32_X0YY:
+; X32-SSE1: # BB#0:
+; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
+; X32-SSE1-NEXT: retl
+;
+; X32-SSE41-LABEL: merge_4f32_f32_X0YY:
+; X32-SSE41: # BB#0:
+; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,mem[0],zero
+; X32-SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,2,2]
+; X32-SSE41-NEXT: retl
 %val0 = load float, float* %ptr0, align 4
 %val1 = load float, float* %ptr1, align 4
 %res0 = insertelement <4 x float> undef, float %val0, i32 0
 ; X32-LABEL: test_mm_set1_epi64x:
 ; X32: # BB#0:
 ; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
 ; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
 ; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test_mm_set1_epi64x:
 ; VEX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
 ; VEX-NEXT: vcvttsd2si %xmm0, %rax
 ; VEX-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
-; VEX-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; VEX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
 ; VEX-NEXT: retq
 ;
 ; AVX512F-LABEL: fptoui_4f64_to_2i32:
 ; SSE-NEXT: movd %xmm0, %rax
 ; SSE-NEXT: xorps %xmm0, %xmm0
 ; SSE-NEXT: cvtsi2ssq %rax, %xmm0
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,2,2]
 ; SSE-NEXT: movaps %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1
 ; SSE-NEXT: addss %xmm1, %xmm1
 ; SSE-NEXT: .LBB41_8:
-; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,2,2]
 ; SSE-NEXT: retq
 ;
 ; VEX-LABEL: uitofp_4i64_to_4f32_undef:
 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-SSE41-NEXT: movzbl (%eax), %eax
 ; X32-SSE41-NEXT: movl %eax, %ecx
-; X32-SSE41-NEXT: shll $31, %ecx
+; X32-SSE41-NEXT: shll $30, %ecx
 ; X32-SSE41-NEXT: sarl $31, %ecx
-; X32-SSE41-NEXT: movd %ecx, %xmm0
-; X32-SSE41-NEXT: pinsrd $1, %ecx, %xmm0
-; X32-SSE41-NEXT: shll $30, %eax
+; X32-SSE41-NEXT: shll $31, %eax
 ; X32-SSE41-NEXT: sarl $31, %eax
-; X32-SSE41-NEXT: pinsrd $2, %eax, %xmm0
-; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm0
+; X32-SSE41-NEXT: movd %eax, %xmm0
+; X32-SSE41-NEXT: pinsrd $2, %ecx, %xmm0
+; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
 ; X32-SSE41-NEXT: retl
 entry:
 %X = load <2 x i1>, <2 x i1>* %ptr
 ret <4 x i32> %res3
 }
 
-; FIXME: Duplicated load in i686
 define void @buildvector_v4f32_0404(float %a, float %b, <4 x float>* %ptr) {
 ; X32-LABEL: buildvector_v4f32_0404:
 ; X32: # BB#0:
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; X32-NEXT: vmovaps %xmm0, (%eax)
+; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; X32-NEXT: vmovapd %xmm0, (%eax)
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: buildvector_v4f32_0404:
 ; X64: # BB#0:
-; X64-NEXT: vpermil2ps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[0],xmm1[0]
-; X64-NEXT: vmovaps %xmm0, (%rdi)
+; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; X64-NEXT: vmovapd %xmm0, (%rdi)
 ; X64-NEXT: retq
 %v0 = insertelement <4 x float> undef, float %a, i32 0
 %v1 = insertelement <4 x float> %v0, float %b, i32 1
 ; X32-LABEL: shift1b:
 ; X32: # BB#0: # %entry
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; X32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
-; X32-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X32-NEXT: psllq %xmm2, %xmm0
+; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; X32-NEXT: psllq %xmm1, %xmm0
 ; X32-NEXT: movdqa %xmm0, (%eax)
 ; X32-NEXT: retl
 ;
 ; X32-LABEL: shift1b:
 ; X32: # BB#0: # %entry
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; X32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
-; X32-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X32-NEXT: psrlq %xmm2, %xmm0
+; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; X32-NEXT: psrlq %xmm1, %xmm0
 ; X32-NEXT: movdqa %xmm0, (%eax)
 ; X32-NEXT: retl
 ;