[TargetLowering] SimplifyDemandedBits - call SimplifyMultipleUseDemandedBits for ISD::VECTOR_SHUFFLE

In particular, this helps the SSE vector shift cvttps2dq+add+shl pattern by avoiding the need for zeros in shuffle-style extensions to vXi32 types, as we'll be shifting out those bits anyway.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@368155 91177308-0d34-0410-b5e6-96231b3b80d8

Simon Pilgrim, 3 months ago
18 changed files with 571 additions and 599 deletions.
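For illustration, here is a minimal IR sketch of the kind of pattern this change lets SimplifyDemandedBits see through (a hypothetical example, not taken from this diff; the function name is made up). The i16 shift amounts are widened to vXi32 by a shuffle that interleaves with zeros, then shifted left by 23 for the cvttps2dq+add trick; since shl by 23 discards the upper 16 bits of each i32 lane, the zero operand of the shuffle is never demanded, and SimplifyMultipleUseDemandedBits can replace it with any convenient register (the x86 tests below drop a pxor and reuse an existing xmm for the punpcklwd/punpckhwd):

define <4 x i32> @shl_of_zext_shuffle(<8 x i16> %amt) {
  ; Zero-extend the low four i16 lanes of %amt by interleaving with zeros.
  %z = shufflevector <8 x i16> %amt, <8 x i16> zeroinitializer,
                     <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
  %v = bitcast <8 x i16> %z to <4 x i32>
  ; shl by 23 keeps only bits 0..8 of each lane, all of which came from
  ; %amt, so the zeroinitializer bits above are shifted out unobserved.
  %s = shl <4 x i32> %v, <i32 23, i32 23, i32 23, i32 23>
  ret <4 x i32> %s
}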
936936 }
937937
938938 if (!!DemandedLHS || !!DemandedRHS) {
939 SDValue Op0 = Op.getOperand(0);
940 SDValue Op1 = Op.getOperand(1);
941
939942 Known.Zero.setAllBits();
940943 Known.One.setAllBits();
941944 if (!!DemandedLHS) {
942 if (SimplifyDemandedBits(Op.getOperand(0), DemandedBits, DemandedLHS,
943 Known2, TLO, Depth + 1))
945 if (SimplifyDemandedBits(Op0, DemandedBits, DemandedLHS, Known2, TLO,
946 Depth + 1))
944947 return true;
945948 Known.One &= Known2.One;
946949 Known.Zero &= Known2.Zero;
947950 }
948951 if (!!DemandedRHS) {
949 if (SimplifyDemandedBits(Op.getOperand(1), DemandedBits, DemandedRHS,
950 Known2, TLO, Depth + 1))
952 if (SimplifyDemandedBits(Op1, DemandedBits, DemandedRHS, Known2, TLO,
953 Depth + 1))
951954 return true;
952955 Known.One &= Known2.One;
953956 Known.Zero &= Known2.Zero;
957 }
958
959 // Attempt to avoid multi-use ops if we don't need anything from them.
960 SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
961 Op0, DemandedBits, DemandedLHS, TLO.DAG, Depth + 1);
962 SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
963 Op1, DemandedBits, DemandedRHS, TLO.DAG, Depth + 1);
964 if (DemandedOp0 || DemandedOp1) {
965 Op0 = DemandedOp0 ? DemandedOp0 : Op0;
966 Op1 = DemandedOp1 ? DemandedOp1 : Op1;
967 SDValue NewOp = TLO.DAG.getVectorShuffle(VT, dl, Op0, Op1, ShuffleMask);
968 return TLO.CombineTo(Op, NewOp);
954969 }
955970 }
956971 break;
171171 ;
172172 ; X86-AVX1-LABEL: trunc_ashr_v4i64_demandedelts:
173173 ; X86-AVX1: # %bb.0:
174 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
175 ; X86-AVX1-NEXT: vpsllq $63, %xmm1, %xmm2
176 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
177 ; X86-AVX1-NEXT: vpsllq $63, %xmm0, %xmm2
174 ; X86-AVX1-NEXT: vpsllq $63, %xmm0, %xmm1
175 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
176 ; X86-AVX1-NEXT: vpsllq $63, %xmm2, %xmm2
177 ; X86-AVX1-NEXT: vpsrlq $63, %xmm2, %xmm2
178 ; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,0,0,2147483648]
179 ; X86-AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
180 ; X86-AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2
178181 ; X86-AVX1-NEXT: vpsrlq $63, %xmm1, %xmm1
179 ; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,0,0,2147483648]
180 ; X86-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
181 ; X86-AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
182 ; X86-AVX1-NEXT: vpsrlq $63, %xmm2, %xmm2
183 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
182 ; X86-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
184183 ; X86-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
185184 ; X86-AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0
186 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
185 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
187186 ; X86-AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
188187 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
189188 ; X86-AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
224223 ;
225224 ; X64-AVX1-LABEL: trunc_ashr_v4i64_demandedelts:
226225 ; X64-AVX1: # %bb.0:
227 ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
228 ; X64-AVX1-NEXT: vpsllq $63, %xmm1, %xmm2
229 ; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
230 ; X64-AVX1-NEXT: vpsllq $63, %xmm0, %xmm2
226 ; X64-AVX1-NEXT: vpsllq $63, %xmm0, %xmm1
227 ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
228 ; X64-AVX1-NEXT: vpsllq $63, %xmm2, %xmm2
229 ; X64-AVX1-NEXT: vpsrlq $63, %xmm2, %xmm2
230 ; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,9223372036854775808]
231 ; X64-AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
232 ; X64-AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2
231233 ; X64-AVX1-NEXT: vpsrlq $63, %xmm1, %xmm1
232 ; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,9223372036854775808]
233 ; X64-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
234 ; X64-AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
235 ; X64-AVX1-NEXT: vpsrlq $63, %xmm2, %xmm2
236 ; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
234 ; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
237235 ; X64-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
238236 ; X64-AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0
239 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
237 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
240238 ; X64-AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
241239 ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
242240 ; X64-AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
11181118 define <4 x i32> @mul_v4i64_zero_lower(<4 x i32> %val1, <4 x i64> %val2) {
11191119 ; SSE2-LABEL: mul_v4i64_zero_lower:
11201120 ; SSE2: # %bb.0: # %entry
1121 ; SSE2-NEXT: pxor %xmm4, %xmm4
1122 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1123 ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
1124 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
1121 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,1,3]
1122 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
11251123 ; SSE2-NEXT: psrlq $32, %xmm2
11261124 ; SSE2-NEXT: pmuludq %xmm0, %xmm2
11271125 ; SSE2-NEXT: psrlq $32, %xmm1
11191119 ; CHECK-SSE2-NEXT: psrld $2, %xmm2
11201120 ; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
11211121 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3]
1122 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,14,1,100]
1123 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
1124 ; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5
1125 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3]
1126 ; CHECK-SSE2-NEXT: movaps %xmm0, %xmm5
1127 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[3,0]
1128 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm5[0,2]
1129 ; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2
1122 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [5,14,1,100]
1123 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
1124 ; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm4
1125 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
1126 ; CHECK-SSE2-NEXT: movaps %xmm0, %xmm4
1127 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[3,0]
1128 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,2]
1129 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
11301130 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
11311131 ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
11321132 ; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
19131913 ; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2
19141914 ; CHECK-SSE2-NEXT: psrld $2, %xmm2
19151915 ; CHECK-SSE2-NEXT: psrld $31, %xmm1
1916 ; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3
1917 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3]
1918 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [14,4294967295,1,14]
1919 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
1920 ; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5
1921 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3]
1922 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0]
1923 ; CHECK-SSE2-NEXT: movaps %xmm0, %xmm5
1924 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm2[3,0]
1925 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[0,2]
1926 ; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
1927 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1928 ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
1929 ; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
1916 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[3,3]
1917 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [14,4294967295,1,14]
1918 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
1919 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4
1920 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
1921 ; CHECK-SSE2-NEXT: movaps %xmm0, %xmm4
1922 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[3,0]
1923 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,2]
1924 ; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
1925 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1926 ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1927 ; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
19301928 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
19311929 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
19321930 ; CHECK-SSE2-NEXT: psrld $31, %xmm0
20292027 ; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
20302028 ; CHECK-SSE2-NEXT: psrld $31, %xmm3
20312029 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3]
2032 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,4294967295,1,100]
2033 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
2034 ; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5
2035 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3]
2036 ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm5
2037 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[3,0]
2030 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [5,4294967295,1,100]
2031 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
2032 ; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm4
2033 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
20382034 ; CHECK-SSE2-NEXT: psrld $2, %xmm2
2039 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm5[0,2]
2040 ; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2
2035 ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4
2036 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[3,0]
2037 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,2]
2038 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
20412039 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
20422040 ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
20432041 ; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
21392137 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
21402138 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
21412139 ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2142 ; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
2143 ; CHECK-SSE2-NEXT: psrld $2, %xmm1
2144 ; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
2145 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3]
2146 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,16,1,5]
2147 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
2148 ; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5
2149 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3]
2150 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0]
2151 ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm5
2152 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[3,0]
2153 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm5[0,2]
2154 ; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2
2155 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
2156 ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
2157 ; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
2140 ; CHECK-SSE2-NEXT: psrld $2, %xmm2
2141 ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
2142 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0]
2143 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3]
2144 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
2145 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [5,16,1,5]
2146 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
2147 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2148 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
2149 ; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1
2150 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2151 ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2152 ; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
21582153 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
21592154 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
21602155 ; CHECK-SSE2-NEXT: psrld $31, %xmm0
22472242 ; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1
22482243 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
22492244 ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
2250 ; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2
2251 ; CHECK-SSE2-NEXT: psrld $2, %xmm2
2252 ; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3
2253 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3]
2254 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [14,16,1,14]
2255 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
2256 ; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5
2257 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3]
2258 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0]
2259 ; CHECK-SSE2-NEXT: movaps %xmm0, %xmm5
2260 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm2[3,0]
2261 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[0,2]
2262 ; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
2245 ; CHECK-SSE2-NEXT: psrld $2, %xmm1
2246 ; CHECK-SSE2-NEXT: movaps %xmm0, %xmm2
2247 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0]
2248 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[3,3]
2249 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
2250 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [14,16,1,14]
2251 ; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
22632252 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2264 ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
2253 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
2254 ; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2
2255 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2256 ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
22652257 ; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
22662258 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
22672259 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
23582350 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
23592351 ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
23602352 ; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1
2361 ; CHECK-SSE2-NEXT: psrld $5, %xmm1
2362 ; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3
2363 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3]
2353 ; CHECK-SSE2-NEXT: psrld $2, %xmm1
2354 ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4
2355 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[3,0]
2356 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0,2]
23642357 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,16,1,100]
2365 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
2366 ; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5
2367 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3]
2368 ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm5
2369 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[3,0]
2370 ; CHECK-SSE2-NEXT: psrld $2, %xmm2
2371 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm5[0,2]
2372 ; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2
2373 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
2374 ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
2358 ; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1
2359 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2360 ; CHECK-SSE2-NEXT: psrld $5, %xmm2
2361 ; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3]
2362 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
2363 ; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2
2364 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2365 ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
23752366 ; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0
23762367 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
23772368 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
530530 ; SSE2-NEXT: psrlw $1, %xmm1
531531 ; SSE2-NEXT: pand %xmm3, %xmm1
532532 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
533 ; SSE2-NEXT: pxor %xmm3, %xmm3
534 ; SSE2-NEXT: movdqa %xmm2, %xmm5
535 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
536 ; SSE2-NEXT: pslld $23, %xmm5
533 ; SSE2-NEXT: movdqa %xmm2, %xmm3
534 ; SSE2-NEXT: pxor %xmm5, %xmm5
535 ; SSE2-NEXT: pcmpeqw %xmm2, %xmm5
536 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
537 ; SSE2-NEXT: pslld $23, %xmm2
537538 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
538 ; SSE2-NEXT: paddd %xmm6, %xmm5
539 ; SSE2-NEXT: cvttps2dq %xmm5, %xmm5
540 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
541 ; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7]
542 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
543 ; SSE2-NEXT: movdqa %xmm2, %xmm7
544 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
545 ; SSE2-NEXT: pslld $23, %xmm7
546 ; SSE2-NEXT: paddd %xmm6, %xmm7
547 ; SSE2-NEXT: cvttps2dq %xmm7, %xmm6
548 ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7]
549 ; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7]
550 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
551 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm5[0]
552 ; SSE2-NEXT: pmullw %xmm0, %xmm6
553 ; SSE2-NEXT: por %xmm4, %xmm6
554 ; SSE2-NEXT: por %xmm1, %xmm6
555 ; SSE2-NEXT: pcmpeqw %xmm3, %xmm2
556 ; SSE2-NEXT: pand %xmm2, %xmm0
557 ; SSE2-NEXT: pandn %xmm6, %xmm2
558 ; SSE2-NEXT: por %xmm2, %xmm0
539 ; SSE2-NEXT: paddd %xmm6, %xmm2
540 ; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
541 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
542 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
543 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
544 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
545 ; SSE2-NEXT: pslld $23, %xmm3
546 ; SSE2-NEXT: paddd %xmm6, %xmm3
547 ; SSE2-NEXT: cvttps2dq %xmm3, %xmm3
548 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
549 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
550 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
551 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
552 ; SSE2-NEXT: pmullw %xmm0, %xmm3
553 ; SSE2-NEXT: por %xmm4, %xmm3
554 ; SSE2-NEXT: por %xmm1, %xmm3
555 ; SSE2-NEXT: pand %xmm5, %xmm0
556 ; SSE2-NEXT: pandn %xmm3, %xmm5
557 ; SSE2-NEXT: por %xmm5, %xmm0
559558 ; SSE2-NEXT: retq
560559 ;
561560 ; SSE41-LABEL: var_funnnel_v8i16:
587586 ; SSE41-NEXT: paddw %xmm4, %xmm4
588587 ; SSE41-NEXT: movdqa %xmm4, %xmm0
589588 ; SSE41-NEXT: pblendvb %xmm0, %xmm5, %xmm1
589 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
590590 ; SSE41-NEXT: pxor %xmm0, %xmm0
591 ; SSE41-NEXT: movdqa %xmm2, %xmm4
592 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
593 ; SSE41-NEXT: pslld $23, %xmm4
591 ; SSE41-NEXT: pcmpeqw %xmm2, %xmm0
592 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
593 ; SSE41-NEXT: pslld $23, %xmm2
594594 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
595 ; SSE41-NEXT: paddd %xmm5, %xmm4
596 ; SSE41-NEXT: cvttps2dq %xmm4, %xmm6
597 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
595 ; SSE41-NEXT: paddd %xmm5, %xmm2
596 ; SSE41-NEXT: cvttps2dq %xmm2, %xmm6
598597 ; SSE41-NEXT: pslld $23, %xmm4
599598 ; SSE41-NEXT: paddd %xmm5, %xmm4
600 ; SSE41-NEXT: cvttps2dq %xmm4, %xmm4
601 ; SSE41-NEXT: packusdw %xmm6, %xmm4
602 ; SSE41-NEXT: pmullw %xmm3, %xmm4
603 ; SSE41-NEXT: por %xmm1, %xmm4
604 ; SSE41-NEXT: pcmpeqw %xmm2, %xmm0
605 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm4
606 ; SSE41-NEXT: movdqa %xmm4, %xmm0
599 ; SSE41-NEXT: cvttps2dq %xmm4, %xmm2
600 ; SSE41-NEXT: packusdw %xmm6, %xmm2
601 ; SSE41-NEXT: pmullw %xmm3, %xmm2
602 ; SSE41-NEXT: por %xmm1, %xmm2
603 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
604 ; SSE41-NEXT: movdqa %xmm2, %xmm0
607605 ; SSE41-NEXT: retq
608606 ;
609607 ; AVX1-LABEL: var_funnnel_v8i16:
625623 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm3
626624 ; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm4
627625 ; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm1, %xmm1
626 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
627 ; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
628 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
629 ; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
630 ; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
631 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
632 ; AVX1-NEXT: vpslld $23, %xmm5, %xmm5
633 ; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4
634 ; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
635 ; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3
636 ; AVX1-NEXT: vpmullw %xmm3, %xmm0, %xmm3
637 ; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
628638 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
629 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
630 ; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
631 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
632 ; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
633 ; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
634 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
635 ; AVX1-NEXT: vpslld $23, %xmm6, %xmm6
636 ; AVX1-NEXT: vpaddd %xmm5, %xmm6, %xmm5
637 ; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5
638 ; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4
639 ; AVX1-NEXT: vpmullw %xmm4, %xmm0, %xmm4
640 ; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
641639 ; AVX1-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
642640 ; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
643641 ; AVX1-NEXT: retq
800798 ; X32-SSE-NEXT: psrlw $1, %xmm1
801799 ; X32-SSE-NEXT: pand %xmm3, %xmm1
802800 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
803 ; X32-SSE-NEXT: pxor %xmm3, %xmm3
804 ; X32-SSE-NEXT: movdqa %xmm2, %xmm5
805 ; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
806 ; X32-SSE-NEXT: pslld $23, %xmm5
801 ; X32-SSE-NEXT: movdqa %xmm2, %xmm3
802 ; X32-SSE-NEXT: pxor %xmm5, %xmm5
803 ; X32-SSE-NEXT: pcmpeqw %xmm2, %xmm5
804 ; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
805 ; X32-SSE-NEXT: pslld $23, %xmm2
807806 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
808 ; X32-SSE-NEXT: paddd %xmm6, %xmm5
809 ; X32-SSE-NEXT: cvttps2dq %xmm5, %xmm5
810 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
811 ; X32-SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7]
812 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
813 ; X32-SSE-NEXT: movdqa %xmm2, %xmm7
814 ; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
815 ; X32-SSE-NEXT: pslld $23, %xmm7
816 ; X32-SSE-NEXT: paddd %xmm6, %xmm7
817 ; X32-SSE-NEXT: cvttps2dq %xmm7, %xmm6
818 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7]
819 ; X32-SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7]
820 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
821 ; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm5[0]
822 ; X32-SSE-NEXT: pmullw %xmm0, %xmm6
823 ; X32-SSE-NEXT: por %xmm4, %xmm6
824 ; X32-SSE-NEXT: por %xmm1, %xmm6
825 ; X32-SSE-NEXT: pcmpeqw %xmm3, %xmm2
826 ; X32-SSE-NEXT: pand %xmm2, %xmm0
827 ; X32-SSE-NEXT: pandn %xmm6, %xmm2
828 ; X32-SSE-NEXT: por %xmm2, %xmm0
807 ; X32-SSE-NEXT: paddd %xmm6, %xmm2
808 ; X32-SSE-NEXT: cvttps2dq %xmm2, %xmm2
809 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
810 ; X32-SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
811 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
812 ; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
813 ; X32-SSE-NEXT: pslld $23, %xmm3
814 ; X32-SSE-NEXT: paddd %xmm6, %xmm3
815 ; X32-SSE-NEXT: cvttps2dq %xmm3, %xmm3
816 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
817 ; X32-SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
818 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
819 ; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
820 ; X32-SSE-NEXT: pmullw %xmm0, %xmm3
821 ; X32-SSE-NEXT: por %xmm4, %xmm3
822 ; X32-SSE-NEXT: por %xmm1, %xmm3
823 ; X32-SSE-NEXT: pand %xmm5, %xmm0
824 ; X32-SSE-NEXT: pandn %xmm3, %xmm5
825 ; X32-SSE-NEXT: por %xmm5, %xmm0
829826 ; X32-SSE-NEXT: retl
830827 %res = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
831828 ret <8 x i16> %res
402402 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm5
403403 ; AVX1-NEXT: vpaddw %xmm6, %xmm6, %xmm6
404404 ; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm1, %xmm1
405 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm8
406 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
407 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
408 ; AVX1-NEXT: vpslld $23, %xmm5, %xmm5
409 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
410 ; AVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm5
405 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
406 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
407 ; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
408 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
409 ; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
410 ; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
411 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
412 ; AVX1-NEXT: vpslld $23, %xmm6, %xmm6
413 ; AVX1-NEXT: vpaddd %xmm5, %xmm6, %xmm6
414 ; AVX1-NEXT: vcvttps2dq %xmm6, %xmm6
415 ; AVX1-NEXT: vpackusdw %xmm4, %xmm6, %xmm4
416 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
417 ; AVX1-NEXT: vpmullw %xmm4, %xmm6, %xmm4
418 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
419 ; AVX1-NEXT: vpslld $23, %xmm6, %xmm6
420 ; AVX1-NEXT: vpaddd %xmm5, %xmm6, %xmm6
421 ; AVX1-NEXT: vcvttps2dq %xmm6, %xmm6
422 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
423 ; AVX1-NEXT: vpslld $23, %xmm7, %xmm7
424 ; AVX1-NEXT: vpaddd %xmm5, %xmm7, %xmm5
411425 ; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5
412 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
413 ; AVX1-NEXT: vpslld $23, %xmm7, %xmm7
414 ; AVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm7
415 ; AVX1-NEXT: vcvttps2dq %xmm7, %xmm7
416 ; AVX1-NEXT: vpackusdw %xmm5, %xmm7, %xmm5
417 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
418 ; AVX1-NEXT: vpmullw %xmm5, %xmm7, %xmm5
419 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
420 ; AVX1-NEXT: vpslld $23, %xmm7, %xmm7
421 ; AVX1-NEXT: vpaddd %xmm6, %xmm7, %xmm7
422 ; AVX1-NEXT: vcvttps2dq %xmm7, %xmm7
423 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
424 ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
425 ; AVX1-NEXT: vpaddd %xmm6, %xmm1, %xmm1
426 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
427 ; AVX1-NEXT: vpackusdw %xmm7, %xmm1, %xmm1
428 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm1
429 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
430 ; AVX1-NEXT: vorps %ymm8, %ymm1, %ymm1
426 ; AVX1-NEXT: vpackusdw %xmm6, %xmm5, %xmm5
427 ; AVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm5
428 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
429 ; AVX1-NEXT: vorps %ymm1, %ymm4, %ymm1
430 ; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4
431431 ; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3
432432 ; AVX1-NEXT: vpcmpeqw %xmm4, %xmm2, %xmm2
433433 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
275275 ; SSE2-LABEL: var_funnnel_v8i16:
276276 ; SSE2: # %bb.0:
277277 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
278 ; SSE2-NEXT: pxor %xmm2, %xmm2
279 ; SSE2-NEXT: movdqa %xmm1, %xmm3
280 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
281 ; SSE2-NEXT: pslld $23, %xmm3
282 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
283 ; SSE2-NEXT: paddd %xmm4, %xmm3
284 ; SSE2-NEXT: cvttps2dq %xmm3, %xmm3
285 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
286 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
287 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
288 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
278 ; SSE2-NEXT: movdqa %xmm1, %xmm2
279 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
280 ; SSE2-NEXT: pslld $23, %xmm2
281 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
282 ; SSE2-NEXT: paddd %xmm3, %xmm2
283 ; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
284 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
285 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
286 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
287 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
289288 ; SSE2-NEXT: pslld $23, %xmm1
290 ; SSE2-NEXT: paddd %xmm4, %xmm1
289 ; SSE2-NEXT: paddd %xmm3, %xmm1
291290 ; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
292291 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
293292 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
294293 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
295 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
294 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
296295 ; SSE2-NEXT: movdqa %xmm0, %xmm2
297296 ; SSE2-NEXT: pmulhuw %xmm1, %xmm2
298297 ; SSE2-NEXT: pmullw %xmm1, %xmm0
416415 ; X32-SSE-LABEL: var_funnnel_v8i16:
417416 ; X32-SSE: # %bb.0:
418417 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
419 ; X32-SSE-NEXT: pxor %xmm2, %xmm2
420 ; X32-SSE-NEXT: movdqa %xmm1, %xmm3
421 ; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
422 ; X32-SSE-NEXT: pslld $23, %xmm3
423 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
424 ; X32-SSE-NEXT: paddd %xmm4, %xmm3
425 ; X32-SSE-NEXT: cvttps2dq %xmm3, %xmm3
426 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
427 ; X32-SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
428 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
429 ; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
418 ; X32-SSE-NEXT: movdqa %xmm1, %xmm2
419 ; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
420 ; X32-SSE-NEXT: pslld $23, %xmm2
421 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
422 ; X32-SSE-NEXT: paddd %xmm3, %xmm2
423 ; X32-SSE-NEXT: cvttps2dq %xmm2, %xmm2
424 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
425 ; X32-SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
426 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
427 ; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
430428 ; X32-SSE-NEXT: pslld $23, %xmm1
431 ; X32-SSE-NEXT: paddd %xmm4, %xmm1
429 ; X32-SSE-NEXT: paddd %xmm3, %xmm1
432430 ; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
433431 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
434432 ; X32-SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
435433 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
436 ; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
434 ; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
437435 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2
438436 ; X32-SSE-NEXT: pmulhuw %xmm1, %xmm2
439437 ; X32-SSE-NEXT: pmullw %xmm1, %xmm0
209209 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
210210 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
211211 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
212 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
213 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
214 ; AVX1-NEXT: vpslld $23, %xmm5, %xmm5
215 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
216 ; AVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm5
217 ; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5
212 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
213 ; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
214 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
215 ; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
216 ; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
218217 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
219218 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
220 ; AVX1-NEXT: vpaddd %xmm6, %xmm2, %xmm2
219 ; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2
221220 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
222 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
223 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
224 ; AVX1-NEXT: vpmulhuw %xmm2, %xmm5, %xmm7
225 ; AVX1-NEXT: vpmullw %xmm2, %xmm5, %xmm2
226 ; AVX1-NEXT: vpor %xmm7, %xmm2, %xmm2
221 ; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
222 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
223 ; AVX1-NEXT: vpmulhuw %xmm2, %xmm4, %xmm6
224 ; AVX1-NEXT: vpmullw %xmm2, %xmm4, %xmm2
225 ; AVX1-NEXT: vpor %xmm6, %xmm2, %xmm2
227226 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
228 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
227 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
229228 ; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
230 ; AVX1-NEXT: vpaddd %xmm6, %xmm3, %xmm3
229 ; AVX1-NEXT: vpaddd %xmm5, %xmm3, %xmm3
231230 ; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
232231 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
233232 ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
234 ; AVX1-NEXT: vpaddd %xmm6, %xmm1, %xmm1
233 ; AVX1-NEXT: vpaddd %xmm5, %xmm1, %xmm1
235234 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
236235 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
237236 ; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm3
504504 ; SSE2-LABEL: var_funnnel_v8i16:
505505 ; SSE2: # %bb.0:
506506 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
507 ; SSE2-NEXT: movdqa %xmm2, %xmm4
508 ; SSE2-NEXT: psllw $12, %xmm4
509 ; SSE2-NEXT: movdqa %xmm4, %xmm3
510 ; SSE2-NEXT: psraw $15, %xmm3
511 ; SSE2-NEXT: movdqa %xmm1, %xmm5
512 ; SSE2-NEXT: psrlw $8, %xmm5
513 ; SSE2-NEXT: pand %xmm3, %xmm5
514 ; SSE2-NEXT: pandn %xmm1, %xmm3
515 ; SSE2-NEXT: por %xmm5, %xmm3
516 ; SSE2-NEXT: paddw %xmm4, %xmm4
517 ; SSE2-NEXT: movdqa %xmm4, %xmm5
518 ; SSE2-NEXT: psraw $15, %xmm5
519 ; SSE2-NEXT: movdqa %xmm5, %xmm6
520 ; SSE2-NEXT: pandn %xmm3, %xmm6
521 ; SSE2-NEXT: psrlw $4, %xmm3
522 ; SSE2-NEXT: pand %xmm5, %xmm3
523 ; SSE2-NEXT: por %xmm6, %xmm3
524 ; SSE2-NEXT: paddw %xmm4, %xmm4
525 ; SSE2-NEXT: movdqa %xmm4, %xmm5
526 ; SSE2-NEXT: psraw $15, %xmm5
527 ; SSE2-NEXT: movdqa %xmm5, %xmm6
528 ; SSE2-NEXT: pandn %xmm3, %xmm6
529 ; SSE2-NEXT: psrlw $2, %xmm3
530 ; SSE2-NEXT: pand %xmm5, %xmm3
531 ; SSE2-NEXT: por %xmm6, %xmm3
532 ; SSE2-NEXT: paddw %xmm4, %xmm4
533 ; SSE2-NEXT: psraw $15, %xmm4
534 ; SSE2-NEXT: movdqa %xmm4, %xmm5
535 ; SSE2-NEXT: pandn %xmm3, %xmm5
536 ; SSE2-NEXT: psrlw $1, %xmm3
537 ; SSE2-NEXT: pand %xmm4, %xmm3
538507 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
539508 ; SSE2-NEXT: psubw %xmm2, %xmm4
540 ; SSE2-NEXT: pxor %xmm8, %xmm8
541 ; SSE2-NEXT: movdqa %xmm4, %xmm7
542 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
543 ; SSE2-NEXT: pslld $23, %xmm7
544 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
545 ; SSE2-NEXT: paddd %xmm6, %xmm7
546 ; SSE2-NEXT: cvttps2dq %xmm7, %xmm7
547 ; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7]
548 ; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7]
549 ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
550 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3]
509 ; SSE2-NEXT: pxor %xmm3, %xmm3
510 ; SSE2-NEXT: pcmpeqw %xmm2, %xmm3
511 ; SSE2-NEXT: psllw $12, %xmm2
512 ; SSE2-NEXT: movdqa %xmm2, %xmm5
513 ; SSE2-NEXT: psraw $15, %xmm5
514 ; SSE2-NEXT: movdqa %xmm1, %xmm6
515 ; SSE2-NEXT: psrlw $8, %xmm6
516 ; SSE2-NEXT: pand %xmm5, %xmm6
517 ; SSE2-NEXT: pandn %xmm1, %xmm5
518 ; SSE2-NEXT: por %xmm6, %xmm5
519 ; SSE2-NEXT: paddw %xmm2, %xmm2
520 ; SSE2-NEXT: movdqa %xmm2, %xmm6
521 ; SSE2-NEXT: psraw $15, %xmm6
522 ; SSE2-NEXT: movdqa %xmm6, %xmm7
523 ; SSE2-NEXT: pandn %xmm5, %xmm7
524 ; SSE2-NEXT: psrlw $4, %xmm5
525 ; SSE2-NEXT: pand %xmm6, %xmm5
526 ; SSE2-NEXT: por %xmm7, %xmm5
527 ; SSE2-NEXT: paddw %xmm2, %xmm2
528 ; SSE2-NEXT: movdqa %xmm2, %xmm6
529 ; SSE2-NEXT: psraw $15, %xmm6
530 ; SSE2-NEXT: movdqa %xmm6, %xmm7
531 ; SSE2-NEXT: pandn %xmm5, %xmm7
532 ; SSE2-NEXT: psrlw $2, %xmm5
533 ; SSE2-NEXT: pand %xmm6, %xmm5
534 ; SSE2-NEXT: por %xmm7, %xmm5
535 ; SSE2-NEXT: paddw %xmm2, %xmm2
536 ; SSE2-NEXT: psraw $15, %xmm2
537 ; SSE2-NEXT: movdqa %xmm2, %xmm6
538 ; SSE2-NEXT: pandn %xmm5, %xmm6
539 ; SSE2-NEXT: psrlw $1, %xmm5
540 ; SSE2-NEXT: pand %xmm2, %xmm5
541 ; SSE2-NEXT: movdqa %xmm4, %xmm2
542 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
543 ; SSE2-NEXT: pslld $23, %xmm2
544 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [1065353216,1065353216,1065353216,1065353216]
545 ; SSE2-NEXT: paddd %xmm7, %xmm2
546 ; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
547 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
548 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
549 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
550 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
551551 ; SSE2-NEXT: pslld $23, %xmm4
552 ; SSE2-NEXT: paddd %xmm6, %xmm4
552 ; SSE2-NEXT: paddd %xmm7, %xmm4
553553 ; SSE2-NEXT: cvttps2dq %xmm4, %xmm4
554554 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
555555 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
556556 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
557 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0]
557 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
558558 ; SSE2-NEXT: pmullw %xmm0, %xmm4
559 ; SSE2-NEXT: por %xmm6, %xmm4
559560 ; SSE2-NEXT: por %xmm5, %xmm4
560 ; SSE2-NEXT: por %xmm3, %xmm4
561 ; SSE2-NEXT: pcmpeqw %xmm8, %xmm2
562 ; SSE2-NEXT: pand %xmm2, %xmm1
563 ; SSE2-NEXT: pandn %xmm4, %xmm2
564 ; SSE2-NEXT: por %xmm1, %xmm2
565 ; SSE2-NEXT: movdqa %xmm2, %xmm0
561 ; SSE2-NEXT: pand %xmm3, %xmm1
562 ; SSE2-NEXT: pandn %xmm4, %xmm3
563 ; SSE2-NEXT: por %xmm1, %xmm3
564 ; SSE2-NEXT: movdqa %xmm3, %xmm0
566565 ; SSE2-NEXT: retq
567566 ;
568567 ; SSE41-LABEL: var_funnnel_v8i16:
569568 ; SSE41: # %bb.0:
570 ; SSE41-NEXT: movdqa %xmm0, %xmm8
569 ; SSE41-NEXT: movdqa %xmm0, %xmm3
571570 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
572571 ; SSE41-NEXT: movdqa %xmm2, %xmm0
573572 ; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
574573 ; SSE41-NEXT: psubw %xmm2, %xmm5
575574 ; SSE41-NEXT: pxor %xmm4, %xmm4
576 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
577 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
578575 ; SSE41-NEXT: pcmpeqw %xmm2, %xmm4
579576 ; SSE41-NEXT: psllw $12, %xmm2
580577 ; SSE41-NEXT: psllw $4, %xmm0
581578 ; SSE41-NEXT: por %xmm2, %xmm0
582579 ; SSE41-NEXT: movdqa %xmm0, %xmm2
583580 ; SSE41-NEXT: paddw %xmm0, %xmm2
581 ; SSE41-NEXT: movdqa %xmm1, %xmm6
582 ; SSE41-NEXT: psrlw $8, %xmm6
584583 ; SSE41-NEXT: movdqa %xmm1, %xmm7
585 ; SSE41-NEXT: psrlw $8, %xmm7
586 ; SSE41-NEXT: movdqa %xmm1, %xmm3
587 ; SSE41-NEXT: pblendvb %xmm0, %xmm7, %xmm3
588 ; SSE41-NEXT: movdqa %xmm3, %xmm7
589 ; SSE41-NEXT: psrlw $4, %xmm7
584 ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm7
585 ; SSE41-NEXT: movdqa %xmm7, %xmm6
586 ; SSE41-NEXT: psrlw $4, %xmm6
590587 ; SSE41-NEXT: movdqa %xmm2, %xmm0
591 ; SSE41-NEXT: pblendvb %xmm0, %xmm7, %xmm3
592 ; SSE41-NEXT: movdqa %xmm3, %xmm7
593 ; SSE41-NEXT: psrlw $2, %xmm7
588 ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm7
589 ; SSE41-NEXT: movdqa %xmm7, %xmm6
590 ; SSE41-NEXT: psrlw $2, %xmm6
594591 ; SSE41-NEXT: paddw %xmm2, %xmm2
595592 ; SSE41-NEXT: movdqa %xmm2, %xmm0
596 ; SSE41-NEXT: pblendvb %xmm0, %xmm7, %xmm3
597 ; SSE41-NEXT: movdqa %xmm3, %xmm7
598 ; SSE41-NEXT: psrlw $1, %xmm7
593 ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm7
594 ; SSE41-NEXT: movdqa %xmm7, %xmm6
595 ; SSE41-NEXT: psrlw $1, %xmm6
599596 ; SSE41-NEXT: paddw %xmm2, %xmm2
600597 ; SSE41-NEXT: movdqa %xmm2, %xmm0
601 ; SSE41-NEXT: pblendvb %xmm0, %xmm7, %xmm3
598 ; SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm7
599 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
600 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
602601 ; SSE41-NEXT: pslld $23, %xmm5
603 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [1065353216,1065353216,1065353216,1065353216]
604 ; SSE41-NEXT: paddd %xmm0, %xmm5
605 ; SSE41-NEXT: cvttps2dq %xmm5, %xmm2
606 ; SSE41-NEXT: pslld $23, %xmm6
607 ; SSE41-NEXT: paddd %xmm0, %xmm6
608 ; SSE41-NEXT: cvttps2dq %xmm6, %xmm0
609 ; SSE41-NEXT: packusdw %xmm2, %xmm0
610 ; SSE41-NEXT: pmullw %xmm0, %xmm8
611 ; SSE41-NEXT: por %xmm3, %xmm8
602 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216]
603 ; SSE41-NEXT: paddd %xmm2, %xmm5
604 ; SSE41-NEXT: cvttps2dq %xmm5, %xmm5
605 ; SSE41-NEXT: pslld $23, %xmm0
606 ; SSE41-NEXT: paddd %xmm2, %xmm0
607 ; SSE41-NEXT: cvttps2dq %xmm0, %xmm0
608 ; SSE41-NEXT: packusdw %xmm5, %xmm0
609 ; SSE41-NEXT: pmullw %xmm0, %xmm3
610 ; SSE41-NEXT: por %xmm7, %xmm3
612611 ; SSE41-NEXT: movdqa %xmm4, %xmm0
613 ; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm8
614 ; SSE41-NEXT: movdqa %xmm8, %xmm0
612 ; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm3
613 ; SSE41-NEXT: movdqa %xmm3, %xmm0
615614 ; SSE41-NEXT: retq
616615 ;
617616 ; AVX1-LABEL: var_funnnel_v8i16:
633632 ; AVX1-NEXT: vpblendvb %xmm4, %xmm5, %xmm3, %xmm3
634633 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
635634 ; AVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm4
636 ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
637 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
638 ; AVX1-NEXT: vpslld $23, %xmm6, %xmm6
639 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1065353216,1065353216,1065353216,1065353216]
640 ; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
641 ; AVX1-NEXT: vcvttps2dq %xmm6, %xmm6
635 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
636 ; AVX1-NEXT: vpslld $23, %xmm5, %xmm5
637 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
638 ; AVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm5
639 ; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5
642640 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
643641 ; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
644 ; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm4
642 ; AVX1-NEXT: vpaddd %xmm6, %xmm4, %xmm4
645643 ; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
646 ; AVX1-NEXT: vpackusdw %xmm6, %xmm4, %xmm4
644 ; AVX1-NEXT: vpackusdw %xmm5, %xmm4, %xmm4
647645 ; AVX1-NEXT: vpmullw %xmm4, %xmm0, %xmm0
648646 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
649 ; AVX1-NEXT: vpcmpeqw %xmm5, %xmm2, %xmm2
647 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
648 ; AVX1-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
650649 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
651650 ; AVX1-NEXT: retq
652651 ;
777776 ;
778777 ; X32-SSE-LABEL: var_funnnel_v8i16:
779778 ; X32-SSE: # %bb.0:
780 ; X32-SSE-NEXT: subl $28, %esp
781 ; X32-SSE-NEXT: movups %xmm0, (%esp) # 16-byte Spill
782779 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
783 ; X32-SSE-NEXT: movdqa %xmm2, %xmm4
784 ; X32-SSE-NEXT: psllw $12, %xmm4
785 ; X32-SSE-NEXT: movdqa %xmm4, %xmm3
786 ; X32-SSE-NEXT: psraw $15, %xmm3
787 ; X32-SSE-NEXT: movdqa %xmm1, %xmm5
788 ; X32-SSE-NEXT: psrlw $8, %xmm5
789 ; X32-SSE-NEXT: pand %xmm3, %xmm5
790 ; X32-SSE-NEXT: pandn %xmm1, %xmm3
791 ; X32-SSE-NEXT: por %xmm5, %xmm3
792 ; X32-SSE-NEXT: paddw %xmm4, %xmm4
793 ; X32-SSE-NEXT: movdqa %xmm4, %xmm5
794 ; X32-SSE-NEXT: psraw $15, %xmm5
795 ; X32-SSE-NEXT: movdqa %xmm5, %xmm6
796 ; X32-SSE-NEXT: pandn %xmm3, %xmm6
797 ; X32-SSE-NEXT: psrlw $4, %xmm3
798 ; X32-SSE-NEXT: pand %xmm5, %xmm3
799 ; X32-SSE-NEXT: por %xmm6, %xmm3
800 ; X32-SSE-NEXT: paddw %xmm4, %xmm4
801 ; X32-SSE-NEXT: movdqa %xmm4, %xmm5
802 ; X32-SSE-NEXT: psraw $15, %xmm5
803 ; X32-SSE-NEXT: movdqa %xmm5, %xmm6
804 ; X32-SSE-NEXT: pandn %xmm3, %xmm6
805 ; X32-SSE-NEXT: psrlw $2, %xmm3
806 ; X32-SSE-NEXT: pand %xmm5, %xmm3
807 ; X32-SSE-NEXT: por %xmm6, %xmm3
808 ; X32-SSE-NEXT: paddw %xmm4, %xmm4
809 ; X32-SSE-NEXT: psraw $15, %xmm4
810 ; X32-SSE-NEXT: movdqa %xmm4, %xmm5
811 ; X32-SSE-NEXT: pandn %xmm3, %xmm5
812 ; X32-SSE-NEXT: psrlw $1, %xmm3
813 ; X32-SSE-NEXT: pand %xmm4, %xmm3
814780 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
815781 ; X32-SSE-NEXT: psubw %xmm2, %xmm4
816 ; X32-SSE-NEXT: pxor %xmm6, %xmm6
817 ; X32-SSE-NEXT: movdqa %xmm4, %xmm7
818 ; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
819 ; X32-SSE-NEXT: pslld $23, %xmm7
820 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [1065353216,1065353216,1065353216,1065353216]
821 ; X32-SSE-NEXT: paddd %xmm0, %xmm7
822 ; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
782 ; X32-SSE-NEXT: pxor %xmm3, %xmm3
783 ; X32-SSE-NEXT: pcmpeqw %xmm2, %xmm3
784 ; X32-SSE-NEXT: psllw $12, %xmm2
785 ; X32-SSE-NEXT: movdqa %xmm2, %xmm5
786 ; X32-SSE-NEXT: psraw $15, %xmm5
787 ; X32-SSE-NEXT: movdqa %xmm1, %xmm6
788 ; X32-SSE-NEXT: psrlw $8, %xmm6
789 ; X32-SSE-NEXT: pand %xmm5, %xmm6
790 ; X32-SSE-NEXT: pandn %xmm1, %xmm5
791 ; X32-SSE-NEXT: por %xmm6, %xmm5
792 ; X32-SSE-NEXT: paddw %xmm2, %xmm2
793 ; X32-SSE-NEXT: movdqa %xmm2, %xmm6
794 ; X32-SSE-NEXT: psraw $15, %xmm6
795 ; X32-SSE-NEXT: movdqa %xmm6, %xmm7
796 ; X32-SSE-NEXT: pandn %xmm5, %xmm7
797 ; X32-SSE-NEXT: psrlw $4, %xmm5
798 ; X32-SSE-NEXT: pand %xmm6, %xmm5
799 ; X32-SSE-NEXT: por %xmm7, %xmm5
800 ; X32-SSE-NEXT: paddw %xmm2, %xmm2
801 ; X32-SSE-NEXT: movdqa %xmm2, %xmm6
802 ; X32-SSE-NEXT: psraw $15, %xmm6
803 ; X32-SSE-NEXT: movdqa %xmm6, %xmm7
804 ; X32-SSE-NEXT: pandn %xmm5, %xmm7
805 ; X32-SSE-NEXT: psrlw $2, %xmm5
806 ; X32-SSE-NEXT: pand %xmm6, %xmm5
807 ; X32-SSE-NEXT: por %xmm7, %xmm5
808 ; X32-SSE-NEXT: paddw %xmm2, %xmm2
809 ; X32-SSE-NEXT: psraw $15, %xmm2
810 ; X32-SSE-NEXT: movdqa %xmm2, %xmm6
811 ; X32-SSE-NEXT: pandn %xmm5, %xmm6
812 ; X32-SSE-NEXT: psrlw $1, %xmm5
813 ; X32-SSE-NEXT: pand %xmm2, %xmm5
814 ; X32-SSE-NEXT: movdqa %xmm4, %xmm2
815 ; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
816 ; X32-SSE-NEXT: pslld $23, %xmm2
817 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm7 = [1065353216,1065353216,1065353216,1065353216]
818 ; X32-SSE-NEXT: paddd %xmm7, %xmm2
819 ; X32-SSE-NEXT: cvttps2dq %xmm2, %xmm2
820 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
821 ; X32-SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
822 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
823 ; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
823824 ; X32-SSE-NEXT: pslld $23, %xmm4
824 ; X32-SSE-NEXT: paddd %xmm0, %xmm4
825 ; X32-SSE-NEXT: cvttps2dq %xmm7, %xmm0
826 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
827 ; X32-SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
828 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
825 ; X32-SSE-NEXT: paddd %xmm7, %xmm4
829826 ; X32-SSE-NEXT: cvttps2dq %xmm4, %xmm4
830827 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
831828 ; X32-SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
832829 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
833 ; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0]
834 ; X32-SSE-NEXT: movdqu (%esp), %xmm0 # 16-byte Reload
830 ; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
835831 ; X32-SSE-NEXT: pmullw %xmm0, %xmm4
832 ; X32-SSE-NEXT: por %xmm6, %xmm4
836833 ; X32-SSE-NEXT: por %xmm5, %xmm4
837 ; X32-SSE-NEXT: por %xmm3, %xmm4
838 ; X32-SSE-NEXT: pcmpeqw %xmm6, %xmm2
839 ; X32-SSE-NEXT: pand %xmm2, %xmm1
840 ; X32-SSE-NEXT: pandn %xmm4, %xmm2
841 ; X32-SSE-NEXT: por %xmm1, %xmm2
842 ; X32-SSE-NEXT: movdqa %xmm2, %xmm0
843 ; X32-SSE-NEXT: addl $28, %esp
834 ; X32-SSE-NEXT: pand %xmm3, %xmm1
835 ; X32-SSE-NEXT: pandn %xmm4, %xmm3
836 ; X32-SSE-NEXT: por %xmm1, %xmm3
837 ; X32-SSE-NEXT: movdqa %xmm3, %xmm0
844838 ; X32-SSE-NEXT: retl
845839 %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
846840 ret <8 x i16> %res
402402 ; AVX1-NEXT: vpaddw %xmm6, %xmm6, %xmm6
403403 ; AVX1-NEXT: vpblendvb %xmm6, %xmm7, %xmm5, %xmm5
404404 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm8
405 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [16,16,16,16,16,16,16,16]
406 ; AVX1-NEXT: vpsubw %xmm3, %xmm9, %xmm6
407 ; AVX1-NEXT: vpxor %xmm10, %xmm10, %xmm10
408 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7]
409 ; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
410 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
411 ; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
412 ; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
413 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
414 ; AVX1-NEXT: vpslld $23, %xmm6, %xmm6
415 ; AVX1-NEXT: vpaddd %xmm5, %xmm6, %xmm6
416 ; AVX1-NEXT: vcvttps2dq %xmm6, %xmm6
417 ; AVX1-NEXT: vpackusdw %xmm4, %xmm6, %xmm4
418 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
419 ; AVX1-NEXT: vpmullw %xmm4, %xmm6, %xmm4
420 ; AVX1-NEXT: vpsubw %xmm2, %xmm9, %xmm6
421 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7]
405 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
406 ; AVX1-NEXT: vpsubw %xmm3, %xmm5, %xmm6
407 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
422408 ; AVX1-NEXT: vpslld $23, %xmm7, %xmm7
423 ; AVX1-NEXT: vpaddd %xmm5, %xmm7, %xmm7
409 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
410 ; AVX1-NEXT: vpaddd %xmm4, %xmm7, %xmm7
424411 ; AVX1-NEXT: vcvttps2dq %xmm7, %xmm7
425412 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
426413 ; AVX1-NEXT: vpslld $23, %xmm6, %xmm6
427 ; AVX1-NEXT: vpaddd %xmm5, %xmm6, %xmm5
428 ; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5
429 ; AVX1-NEXT: vpackusdw %xmm7, %xmm5, %xmm5
430 ; AVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm0
431 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
414 ; AVX1-NEXT: vpaddd %xmm4, %xmm6, %xmm6
415 ; AVX1-NEXT: vcvttps2dq %xmm6, %xmm6
416 ; AVX1-NEXT: vpackusdw %xmm7, %xmm6, %xmm6
417 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
418 ; AVX1-NEXT: vpmullw %xmm6, %xmm7, %xmm6
419 ; AVX1-NEXT: vpsubw %xmm2, %xmm5, %xmm5
420 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
421 ; AVX1-NEXT: vpslld $23, %xmm7, %xmm7
422 ; AVX1-NEXT: vpaddd %xmm4, %xmm7, %xmm7
423 ; AVX1-NEXT: vcvttps2dq %xmm7, %xmm7
424 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
425 ; AVX1-NEXT: vpslld $23, %xmm5, %xmm5
426 ; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4
427 ; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
428 ; AVX1-NEXT: vpackusdw %xmm7, %xmm4, %xmm4
429 ; AVX1-NEXT: vpmullw %xmm4, %xmm0, %xmm0
430 ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
432431 ; AVX1-NEXT: vorps %ymm8, %ymm0, %ymm0
433 ; AVX1-NEXT: vpcmpeqw %xmm10, %xmm3, %xmm3
434 ; AVX1-NEXT: vpcmpeqw %xmm10, %xmm2, %xmm2
432 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
433 ; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3
434 ; AVX1-NEXT: vpcmpeqw %xmm4, %xmm2, %xmm2
435435 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
436436 ; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
437437 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
289289 ; SSE2-LABEL: var_funnnel_v8i16:
290290 ; SSE2: # %bb.0:
291291 ; SSE2-NEXT: pxor %xmm2, %xmm2
292 ; SSE2-NEXT: pxor %xmm3, %xmm3
293 ; SSE2-NEXT: psubw %xmm1, %xmm3
294 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
295 ; SSE2-NEXT: movdqa %xmm3, %xmm1
296 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
292 ; SSE2-NEXT: psubw %xmm1, %xmm2
293 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
294 ; SSE2-NEXT: movdqa %xmm2, %xmm1
295 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
297296 ; SSE2-NEXT: pslld $23, %xmm1
298 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
299 ; SSE2-NEXT: paddd %xmm4, %xmm1
297 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
298 ; SSE2-NEXT: paddd %xmm3, %xmm1
300299 ; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
301300 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
302301 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
303302 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
304 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
305 ; SSE2-NEXT: pslld $23, %xmm3
306 ; SSE2-NEXT: paddd %xmm4, %xmm3
307 ; SSE2-NEXT: cvttps2dq %xmm3, %xmm2
303 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
304 ; SSE2-NEXT: pslld $23, %xmm2
305 ; SSE2-NEXT: paddd %xmm3, %xmm2
306 ; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
308307 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
309308 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
310309 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
318317 ; SSE41-LABEL: var_funnnel_v8i16:
319318 ; SSE41: # %bb.0:
320319 ; SSE41-NEXT: pxor %xmm2, %xmm2
321 ; SSE41-NEXT: pxor %xmm3, %xmm3
322 ; SSE41-NEXT: psubw %xmm1, %xmm3
323 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
324 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
325 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
326 ; SSE41-NEXT: pslld $23, %xmm3
327 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216]
328 ; SSE41-NEXT: paddd %xmm2, %xmm3
329 ; SSE41-NEXT: cvttps2dq %xmm3, %xmm3
320 ; SSE41-NEXT: psubw %xmm1, %xmm2
321 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
322 ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
323 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
324 ; SSE41-NEXT: pslld $23, %xmm2
325 ; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
326 ; SSE41-NEXT: paddd %xmm3, %xmm2
327 ; SSE41-NEXT: cvttps2dq %xmm2, %xmm2
330328 ; SSE41-NEXT: pslld $23, %xmm1
331 ; SSE41-NEXT: paddd %xmm2, %xmm1
329 ; SSE41-NEXT: paddd %xmm3, %xmm1
332330 ; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
333 ; SSE41-NEXT: packusdw %xmm3, %xmm1
331 ; SSE41-NEXT: packusdw %xmm2, %xmm1
334332 ; SSE41-NEXT: movdqa %xmm0, %xmm2
335333 ; SSE41-NEXT: pmulhuw %xmm1, %xmm2
336334 ; SSE41-NEXT: pmullw %xmm1, %xmm0
342340 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
343341 ; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1
344342 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
345 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
343 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
346344 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
347345 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
348346 ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
449447 ; X32-SSE-LABEL: var_funnnel_v8i16:
450448 ; X32-SSE: # %bb.0:
451449 ; X32-SSE-NEXT: pxor %xmm2, %xmm2
452 ; X32-SSE-NEXT: pxor %xmm3, %xmm3
453 ; X32-SSE-NEXT: psubw %xmm1, %xmm3
454 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm3
455 ; X32-SSE-NEXT: movdqa %xmm3, %xmm1
456 ; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
450 ; X32-SSE-NEXT: psubw %xmm1, %xmm2
451 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm2
452 ; X32-SSE-NEXT: movdqa %xmm2, %xmm1
453 ; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
457454 ; X32-SSE-NEXT: pslld $23, %xmm1
458 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
459 ; X32-SSE-NEXT: paddd %xmm4, %xmm1
455 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
456 ; X32-SSE-NEXT: paddd %xmm3, %xmm1
460457 ; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
461458 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
462459 ; X32-SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
463460 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
464 ; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
465 ; X32-SSE-NEXT: pslld $23, %xmm3
466 ; X32-SSE-NEXT: paddd %xmm4, %xmm3
467 ; X32-SSE-NEXT: cvttps2dq %xmm3, %xmm2
461 ; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
462 ; X32-SSE-NEXT: pslld $23, %xmm2
463 ; X32-SSE-NEXT: paddd %xmm3, %xmm2
464 ; X32-SSE-NEXT: cvttps2dq %xmm2, %xmm2
468465 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
469466 ; X32-SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
470467 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
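
Every hunk above exercises the same SSE idiom: pslld $23 followed by paddd with 1065353216 (the bit pattern of 1.0f, 0x3F800000) and cvttps2dq materializes 2^n in each i32 lane by building the float exponent field directly. A minimal C sketch of one lane under that reading (pow2_via_float is an illustrative name, not from the tests):

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Model of one 32-bit lane of the pslld $23 / paddd / cvttps2dq idiom:
 * place n in the float exponent field, bias with the bit pattern of 1.0f
 * (1065353216 == 0x3F800000), then truncate back to integer to get 2^n. */
static uint32_t pow2_via_float(uint32_t lane) {
    uint32_t bits = (lane << 23) + 0x3F800000u; /* pslld $23 ; paddd */
    float f;
    memcpy(&f, &bits, sizeof f);                /* lane reinterpreted as float */
    return (uint32_t)f;                         /* cvttps2dq: trunc(2.0^n) */
}

int main(void) {
    /* Anything above bit 8 of the lane is shifted out by << 23, so junk in
     * the upper half of the widened lane cannot affect the result. */
    for (uint32_t n = 0; n < 16; ++n)
        assert(pow2_via_float(0xABCD0000u | n) == (1u << n));
    return 0;
}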
226226 ; AVX1-NEXT: vpsubw %xmm2, %xmm3, %xmm2
227227 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15]
228228 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
229 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
229 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
230230 ; AVX1-NEXT: vpslld $23, %xmm5, %xmm5
231231 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
232232 ; AVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm5
242242 ; AVX1-NEXT: vpor %xmm7, %xmm2, %xmm2
243243 ; AVX1-NEXT: vpsubw %xmm1, %xmm3, %xmm1
244244 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
245 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
245 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
246246 ; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
247247 ; AVX1-NEXT: vpaddd %xmm6, %xmm3, %xmm3
248248 ; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
261261 ; SSE2-LABEL: var_rotate_v8i16:
262262 ; SSE2: # %bb.0:
263263 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
264 ; SSE2-NEXT: pxor %xmm2, %xmm2
265 ; SSE2-NEXT: movdqa %xmm1, %xmm3
266 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
267 ; SSE2-NEXT: pslld $23, %xmm3
268 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
269 ; SSE2-NEXT: paddd %xmm4, %xmm3
270 ; SSE2-NEXT: cvttps2dq %xmm3, %xmm3
271 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
272 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
273 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
274 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
264 ; SSE2-NEXT: movdqa %xmm1, %xmm2
265 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
266 ; SSE2-NEXT: pslld $23, %xmm2
267 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
268 ; SSE2-NEXT: paddd %xmm3, %xmm2
269 ; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
270 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
271 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
272 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
273 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
275274 ; SSE2-NEXT: pslld $23, %xmm1
276 ; SSE2-NEXT: paddd %xmm4, %xmm1
275 ; SSE2-NEXT: paddd %xmm3, %xmm1
277276 ; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
278277 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
279278 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
280279 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
281 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
280 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
282281 ; SSE2-NEXT: movdqa %xmm0, %xmm2
283282 ; SSE2-NEXT: pmulhuw %xmm1, %xmm2
284283 ; SSE2-NEXT: pmullw %xmm1, %xmm0
402401 ; X32-SSE-LABEL: var_rotate_v8i16:
403402 ; X32-SSE: # %bb.0:
404403 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
405 ; X32-SSE-NEXT: pxor %xmm2, %xmm2
406 ; X32-SSE-NEXT: movdqa %xmm1, %xmm3
407 ; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
408 ; X32-SSE-NEXT: pslld $23, %xmm3
409 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
410 ; X32-SSE-NEXT: paddd %xmm4, %xmm3
411 ; X32-SSE-NEXT: cvttps2dq %xmm3, %xmm3
412 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
413 ; X32-SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
414 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
415 ; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
404 ; X32-SSE-NEXT: movdqa %xmm1, %xmm2
405 ; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
406 ; X32-SSE-NEXT: pslld $23, %xmm2
407 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
408 ; X32-SSE-NEXT: paddd %xmm3, %xmm2
409 ; X32-SSE-NEXT: cvttps2dq %xmm2, %xmm2
410 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
411 ; X32-SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
412 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
413 ; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
416414 ; X32-SSE-NEXT: pslld $23, %xmm1
417 ; X32-SSE-NEXT: paddd %xmm4, %xmm1
415 ; X32-SSE-NEXT: paddd %xmm3, %xmm1
418416 ; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
419417 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
420418 ; X32-SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
421419 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
422 ; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
420 ; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
423421 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2
424422 ; X32-SSE-NEXT: pmulhuw %xmm1, %xmm2
425423 ; X32-SSE-NEXT: pmullw %xmm1, %xmm0
202202 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
203203 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
204204 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
205 ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
206 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
207 ; AVX1-NEXT: vpslld $23, %xmm5, %xmm5
208 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
209 ; AVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm5
210 ; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5
205 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
206 ; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
207 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
208 ; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
209 ; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
211210 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
212211 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
213 ; AVX1-NEXT: vpaddd %xmm6, %xmm2, %xmm2
212 ; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2
214213 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
215 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
216 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
217 ; AVX1-NEXT: vpmulhuw %xmm2, %xmm5, %xmm7
218 ; AVX1-NEXT: vpmullw %xmm2, %xmm5, %xmm2
219 ; AVX1-NEXT: vpor %xmm7, %xmm2, %xmm2
214 ; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
215 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
216 ; AVX1-NEXT: vpmulhuw %xmm2, %xmm4, %xmm6
217 ; AVX1-NEXT: vpmullw %xmm2, %xmm4, %xmm2
218 ; AVX1-NEXT: vpor %xmm6, %xmm2, %xmm2
220219 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
221 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
220 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
222221 ; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
223 ; AVX1-NEXT: vpaddd %xmm6, %xmm3, %xmm3
222 ; AVX1-NEXT: vpaddd %xmm5, %xmm3, %xmm3
224223 ; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
225224 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
226225 ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
227 ; AVX1-NEXT: vpaddd %xmm6, %xmm1, %xmm1
226 ; AVX1-NEXT: vpaddd %xmm5, %xmm1, %xmm1
228227 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
229228 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
230229 ; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm3
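
The rotate tests layer one more identity on top of the power-of-two trick: multiplying by 2^n splits a 16-bit left-rotate across pmullw (the bits that stay in the lane) and pmulhuw (the bits rotated out the top), recombined with por. A self-contained C model of one lane (rot16_via_mul is an illustrative name):

#include <assert.h>
#include <stdint.h>

/* One 16-bit lane of the pmullw/pmulhuw/por rotate in these checks:
 * the 32-bit product x * 2^n holds the rotated value split across its
 * low and high halves. */
static uint16_t rot16_via_mul(uint16_t x, unsigned n) {
    uint32_t prod = (uint32_t)x << n;        /* x * 2^n, n in [0,15] */
    uint16_t lo = (uint16_t)prod;            /* pmullw: bits kept in lane */
    uint16_t hi = (uint16_t)(prod >> 16);    /* pmulhuw: bits rotated out */
    return (uint16_t)(lo | hi);              /* por */
}

int main(void) {
    for (unsigned n = 0; n < 16; ++n)
        assert(rot16_via_mul(0x8421, n) ==
               (uint16_t)((0x8421u << n) | (0x8421u >> ((16 - n) & 15))));
    return 0;
}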
156156 define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
157157 ; SSE2-LABEL: var_shift_v8i16:
158158 ; SSE2: # %bb.0:
159 ; SSE2-NEXT: pxor %xmm2, %xmm2
160 ; SSE2-NEXT: movdqa %xmm1, %xmm3
161 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
162 ; SSE2-NEXT: pslld $23, %xmm3
163 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
164 ; SSE2-NEXT: paddd %xmm4, %xmm3
165 ; SSE2-NEXT: cvttps2dq %xmm3, %xmm3
166 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
167 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
168 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
169 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
159 ; SSE2-NEXT: movdqa %xmm1, %xmm2
160 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
161 ; SSE2-NEXT: pslld $23, %xmm2
162 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
163 ; SSE2-NEXT: paddd %xmm3, %xmm2
164 ; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
165 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
166 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
167 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
168 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
170169 ; SSE2-NEXT: pslld $23, %xmm1
171 ; SSE2-NEXT: paddd %xmm4, %xmm1
170 ; SSE2-NEXT: paddd %xmm3, %xmm1
172171 ; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
173172 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
174173 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
175174 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
176 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
175 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
177176 ; SSE2-NEXT: pmullw %xmm1, %xmm0
178177 ; SSE2-NEXT: retq
179178 ;
258257 ;
259258 ; X32-SSE-LABEL: var_shift_v8i16:
260259 ; X32-SSE: # %bb.0:
261 ; X32-SSE-NEXT: pxor %xmm2, %xmm2
262 ; X32-SSE-NEXT: movdqa %xmm1, %xmm3
263 ; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
264 ; X32-SSE-NEXT: pslld $23, %xmm3
265 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
266 ; X32-SSE-NEXT: paddd %xmm4, %xmm3
267 ; X32-SSE-NEXT: cvttps2dq %xmm3, %xmm3
268 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
269 ; X32-SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
270 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
271 ; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
260 ; X32-SSE-NEXT: movdqa %xmm1, %xmm2
261 ; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
262 ; X32-SSE-NEXT: pslld $23, %xmm2
263 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
264 ; X32-SSE-NEXT: paddd %xmm3, %xmm2
265 ; X32-SSE-NEXT: cvttps2dq %xmm2, %xmm2
266 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
267 ; X32-SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
268 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
269 ; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
272270 ; X32-SSE-NEXT: pslld $23, %xmm1
273 ; X32-SSE-NEXT: paddd %xmm4, %xmm1
271 ; X32-SSE-NEXT: paddd %xmm3, %xmm1
274272 ; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
275273 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
276274 ; X32-SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
277275 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
278 ; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
276 ; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
279277 ; X32-SSE-NEXT: pmullw %xmm1, %xmm0
280278 ; X32-SSE-NEXT: retl
281279 %shift = shl <8 x i16> %a, %b
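
Putting the pieces together, the var_shift_v8i16 lowering reduces per lane to a multiply by 2^b: widen each i16 amount to an i32 lane (punpcklwd/punpckhwd), build 2^b with the float-exponent trick, narrow back (pshuflw/pshufhw/pshufd plus punpcklqdq), then pmullw. A scalar C model of that reduction, assuming pmullw's low-16-bit product is exactly the lane-wise shl (names here are illustrative, not the DAG's):

#include <assert.h>
#include <stdint.h>
#include <string.h>

static uint16_t pow2_16(uint16_t b) {
    uint32_t bits = ((uint32_t)b << 23) + 0x3F800000u;
    float f;
    memcpy(&f, &bits, sizeof f);
    return (uint16_t)(uint32_t)f;
}

static void var_shift_v8i16_model(uint16_t a[8], const uint16_t b[8]) {
    for (int i = 0; i < 8; ++i)
        a[i] = (uint16_t)(a[i] * pow2_16(b[i])); /* pmullw == shl per lane */
}

int main(void) {
    uint16_t a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    const uint16_t b[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    var_shift_v8i16_model(a, b);
    for (int i = 0; i < 8; ++i)
        assert(a[i] == (uint16_t)((uint16_t)(i + 1) << i));
    return 0;
}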
157157 ; AVX1-LABEL: var_shift_v16i16:
158158 ; AVX1: # %bb.0:
159159 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
160 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
161 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
162 ; AVX1-NEXT: vpslld $23, %xmm4, %xmm4
163 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
164 ; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
165 ; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
160 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
161 ; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
162 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
163 ; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
164 ; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
166165 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
167166 ; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
168 ; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2
167 ; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
169168 ; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
170 ; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
171 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
172 ; AVX1-NEXT: vpmullw %xmm2, %xmm4, %xmm2
173 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
169 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
170 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
171 ; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
172 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
174173 ; AVX1-NEXT: vpslld $23, %xmm3, %xmm3
175 ; AVX1-NEXT: vpaddd %xmm5, %xmm3, %xmm3
174 ; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
176175 ; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
177176 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
178177 ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
179 ; AVX1-NEXT: vpaddd %xmm5, %xmm1, %xmm1
178 ; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1
180179 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
181180 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
182181 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
247246 ; X32-AVX1-LABEL: var_shift_v16i16:
248247 ; X32-AVX1: # %bb.0:
249248 ; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
250 ; X32-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
251 ; X32-AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
252 ; X32-AVX1-NEXT: vpslld $23, %xmm4, %xmm4
253 ; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
254 ; X32-AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
255 ; X32-AVX1-NEXT: vcvttps2dq %xmm4, %xmm4
249 ; X32-AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
250 ; X32-AVX1-NEXT: vpslld $23, %xmm3, %xmm3
251 ; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
252 ; X32-AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
253 ; X32-AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
256254 ; X32-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
257255 ; X32-AVX1-NEXT: vpslld $23, %xmm2, %xmm2
258 ; X32-AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2
256 ; X32-AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
259257 ; X32-AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
260 ; X32-AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
261 ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
262 ; X32-AVX1-NEXT: vpmullw %xmm2, %xmm4, %xmm2
263 ; X32-AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
258 ; X32-AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
259 ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
260 ; X32-AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
261 ; X32-AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
264262 ; X32-AVX1-NEXT: vpslld $23, %xmm3, %xmm3
265 ; X32-AVX1-NEXT: vpaddd %xmm5, %xmm3, %xmm3
263 ; X32-AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
266264 ; X32-AVX1-NEXT: vcvttps2dq %xmm3, %xmm3
267265 ; X32-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
268266 ; X32-AVX1-NEXT: vpslld $23, %xmm1, %xmm1
269 ; X32-AVX1-NEXT: vpaddd %xmm5, %xmm1, %xmm1
267 ; X32-AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1
270268 ; X32-AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
271269 ; X32-AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
272270 ; X32-AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
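
In the SSE41/AVX1 paths the two cvttps2dq halves are rejoined with packusdw (i32 to i16 with unsigned saturation). Since every packed value is 2^n for n <= 15, saturation never fires and the pack behaves as a plain truncating concat; a tiny C check of that claim (packusdw_lane is an illustrative name):

#include <assert.h>
#include <stdint.h>

/* One lane of packusdw: clamp a non-negative i32 value to [0, 0xFFFF]. */
static uint16_t packusdw_lane(uint32_t v) {
    return (uint16_t)(v > 0xFFFFu ? 0xFFFFu : v);
}

int main(void) {
    for (unsigned n = 0; n <= 15; ++n)
        assert(packusdw_lane(1u << n) == (uint16_t)(1u << n));
    return 0;
}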
9292 define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
9393 ; SSE2-LABEL: var_shift_v4i16:
9494 ; SSE2: # %bb.0:
95 ; SSE2-NEXT: pxor %xmm2, %xmm2
96 ; SSE2-NEXT: movdqa %xmm1, %xmm3
97 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
98 ; SSE2-NEXT: pslld $23, %xmm3
99 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
100 ; SSE2-NEXT: paddd %xmm4, %xmm3
101 ; SSE2-NEXT: cvttps2dq %xmm3, %xmm3
102 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
103 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
104 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
105 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
95 ; SSE2-NEXT: movdqa %xmm1, %xmm2
96 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
97 ; SSE2-NEXT: pslld $23, %xmm2
98 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
99 ; SSE2-NEXT: paddd %xmm3, %xmm2
100 ; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
101 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
102 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
103 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
104 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
106105 ; SSE2-NEXT: pslld $23, %xmm1
107 ; SSE2-NEXT: paddd %xmm4, %xmm1
106 ; SSE2-NEXT: paddd %xmm3, %xmm1
108107 ; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
109108 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
110109 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
111110 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
112 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
111 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
113112 ; SSE2-NEXT: pmullw %xmm1, %xmm0
114113 ; SSE2-NEXT: retq
115114 ;
194193 ;
195194 ; X32-SSE-LABEL: var_shift_v4i16:
196195 ; X32-SSE: # %bb.0:
197 ; X32-SSE-NEXT: pxor %xmm2, %xmm2
198 ; X32-SSE-NEXT: movdqa %xmm1, %xmm3
199 ; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
200 ; X32-SSE-NEXT: pslld $23, %xmm3
201 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
202 ; X32-SSE-NEXT: paddd %xmm4, %xmm3
203 ; X32-SSE-NEXT: cvttps2dq %xmm3, %xmm3
204 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
205 ; X32-SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
206 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
207 ; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
196 ; X32-SSE-NEXT: movdqa %xmm1, %xmm2
197 ; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
198 ; X32-SSE-NEXT: pslld $23, %xmm2
199 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
200 ; X32-SSE-NEXT: paddd %xmm3, %xmm2
201 ; X32-SSE-NEXT: cvttps2dq %xmm2, %xmm2
202 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
203 ; X32-SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
204 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
205 ; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
208206 ; X32-SSE-NEXT: pslld $23, %xmm1
209 ; X32-SSE-NEXT: paddd %xmm4, %xmm1
207 ; X32-SSE-NEXT: paddd %xmm3, %xmm1
210208 ; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
211209 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
212210 ; X32-SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
213211 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
214 ; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
212 ; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
215213 ; X32-SSE-NEXT: pmullw %xmm1, %xmm0
216214 ; X32-SSE-NEXT: retl
217215 %shift = shl <4 x i16> %a, %b
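
The register swap running through these hunks (interleaving the amounts with %xmm0 rather than a zeroed register) is safe precisely because only bits 0..8 of each widened lane survive the pslld $23. A tiny C check of that claim:

#include <assert.h>
#include <stdint.h>

int main(void) {
    uint32_t amt = 0x0005;                /* 16-bit shift amount lane */
    uint32_t zero_hi = amt;               /* old: interleave with pxor zero */
    uint32_t junk_hi = 0xDEAD0000u | amt; /* new: interleave with %xmm0 junk */
    assert((zero_hi << 23) == (junk_hi << 23)); /* high half shifted out */
    return 0;
}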
221219 define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
222220 ; SSE2-LABEL: var_shift_v2i16:
223221 ; SSE2: # %bb.0:
224 ; SSE2-NEXT: pxor %xmm2, %xmm2
225 ; SSE2-NEXT: movdqa %xmm1, %xmm3
226 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
227 ; SSE2-NEXT: pslld $23, %xmm3
228 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
229 ; SSE2-NEXT: paddd %xmm4, %xmm3
230 ; SSE2-NEXT: cvttps2dq %xmm3, %xmm3
231 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
232 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
233 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
234 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
222 ; SSE2-NEXT: movdqa %xmm1, %xmm2
223 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
224 ; SSE2-NEXT: pslld $23, %xmm2
225 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
226 ; SSE2-NEXT: paddd %xmm3, %xmm2
227 ; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
228 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
229 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
230 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
231 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
235232 ; SSE2-NEXT: pslld $23, %xmm1
236 ; SSE2-NEXT: paddd %xmm4, %xmm1
233 ; SSE2-NEXT: paddd %xmm3, %xmm1
237234 ; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
238235 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
239236 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
240237 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
241 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
238 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
242239 ; SSE2-NEXT: pmullw %xmm1, %xmm0
243240 ; SSE2-NEXT: retq
244241 ;
323320 ;
324321 ; X32-SSE-LABEL: var_shift_v2i16:
325322 ; X32-SSE: # %bb.0:
326 ; X32-SSE-NEXT: pxor %xmm2, %xmm2
327 ; X32-SSE-NEXT: movdqa %xmm1, %xmm3
328 ; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
329 ; X32-SSE-NEXT: pslld $23, %xmm3
330 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
331 ; X32-SSE-NEXT: paddd %xmm4, %xmm3
332 ; X32-SSE-NEXT: cvttps2dq %xmm3, %xmm3
333 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
334 ; X32-SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
335 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
336 ; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
323 ; X32-SSE-NEXT: movdqa %xmm1, %xmm2
324 ; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
325 ; X32-SSE-NEXT: pslld $23, %xmm2
326 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
327 ; X32-SSE-NEXT: paddd %xmm3, %xmm2
328 ; X32-SSE-NEXT: cvttps2dq %xmm2, %xmm2
329 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
330 ; X32-SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
331 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
332 ; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
337333 ; X32-SSE-NEXT: pslld $23, %xmm1
338 ; X32-SSE-NEXT: paddd %xmm4, %xmm1
334 ; X32-SSE-NEXT: paddd %xmm3, %xmm1
339335 ; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
340336 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
341337 ; X32-SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
342338 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
343 ; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
339 ; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
344340 ; X32-SSE-NEXT: pmullw %xmm1, %xmm0
345341 ; X32-SSE-NEXT: retl
346342 %shift = shl <2 x i16> %a, %b
264264 define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
265265 ; SSE2-LABEL: var_shift_v8i8:
266266 ; SSE2: # %bb.0:
267 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
268267 ; SSE2-NEXT: pxor %xmm2, %xmm2
269268 ; SSE2-NEXT: movdqa %xmm1, %xmm3
270 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
269 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
271270 ; SSE2-NEXT: pslld $23, %xmm3
272271 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
273272 ; SSE2-NEXT: paddd %xmm4, %xmm3
275274 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
276275 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
277276 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
278 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
277 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
279278 ; SSE2-NEXT: pslld $23, %xmm1
280279 ; SSE2-NEXT: paddd %xmm4, %xmm1
281280 ; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
376375 ;
377376 ; X32-SSE-LABEL: var_shift_v8i8:
378377 ; X32-SSE: # %bb.0:
379 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
380378 ; X32-SSE-NEXT: pxor %xmm2, %xmm2
381379 ; X32-SSE-NEXT: movdqa %xmm1, %xmm3
382 ; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
380 ; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
383381 ; X32-SSE-NEXT: pslld $23, %xmm3
384382 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
385383 ; X32-SSE-NEXT: paddd %xmm4, %xmm3
387385 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
388386 ; X32-SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
389387 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
390 ; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
388 ; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
391389 ; X32-SSE-NEXT: pslld $23, %xmm1
392390 ; X32-SSE-NEXT: paddd %xmm4, %xmm1
393391 ; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
839837 ; SSE2: # %bb.0:
840838 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
841839 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
842 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
843840 ; SSE2-NEXT: pxor %xmm2, %xmm2
844841 ; SSE2-NEXT: movdqa %xmm1, %xmm3
845 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
842 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
846843 ; SSE2-NEXT: pslld $23, %xmm3
847844 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
848845 ; SSE2-NEXT: paddd %xmm4, %xmm3
850847 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
851848 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
852849 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
853 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
850 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
854851 ; SSE2-NEXT: pslld $23, %xmm1
855852 ; SSE2-NEXT: paddd %xmm4, %xmm1
856853 ; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
951948 ; X32-SSE: # %bb.0:
952949 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
953950 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
954 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
955951 ; X32-SSE-NEXT: pxor %xmm2, %xmm2
956952 ; X32-SSE-NEXT: movdqa %xmm1, %xmm3
957 ; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
953 ; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
958954 ; X32-SSE-NEXT: pslld $23, %xmm3
959955 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
960956 ; X32-SSE-NEXT: paddd %xmm4, %xmm3
962958 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
963959 ; X32-SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
964960 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
965 ; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
961 ; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
966962 ; X32-SSE-NEXT: pslld $23, %xmm1
967963 ; X32-SSE-NEXT: paddd %xmm4, %xmm1
968964 ; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
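
The v8i8 hunks drop the pand of the shift amounts and unpack bytes against zero (punpcklbw/punpckhbw) instead of words; presumably the byte interleave with a zeroed register already supplies the zero-extension that the mask load provided. A minimal sketch of that equivalence (punpcklbw_zero is an illustrative name):

#include <assert.h>
#include <stdint.h>

/* Interleaving bytes with a zeroed register doubles as i8 -> i16
 * zero-extension: each byte gains a zero byte above it. */
static void punpcklbw_zero(const uint8_t in[8], uint16_t out[8]) {
    for (int i = 0; i < 8; ++i)
        out[i] = in[i];                   /* byte paired with a zero byte */
}

int main(void) {
    const uint8_t amt[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    uint16_t wide[8];
    punpcklbw_zero(amt, wide);
    for (int i = 0; i < 8; ++i)
        assert(wide[i] == (uint16_t)amt[i]); /* zero-extension for free */
    return 0;
}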