llvm.org GIT mirror llvm / ff27ba2
[X86] Use PSADBW for v8i8 addition reductions.

Improves the 8 byte case from PR42674.

Differential Revision: https://reviews.llvm.org/D66069

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@368864 91177308-0d34-0410-b5e6-96231b3b80d8

Craig Topper, 2 months ago
2 changed file(s) with 41 addition(s) and 105 deletion(s).
35439 35439     if (VecVT.getScalarType() != VT)
35440 35440       return SDValue();
35441 35441
      35442 +   SDLoc DL(ExtElt);
      35443 +
      35444 +   if (VecVT == MVT::v8i8) {
      35445 +     // Pad with undef.
      35446 +     Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
      35447 +                       DAG.getUNDEF(VecVT));
      35448 +     Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
      35449 +                       DAG.getConstant(0, DL, MVT::v16i8));
      35450 +     Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
      35451 +     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
      35452 +   }
      35453 +
35442 35454     // Must be a >=128-bit vector with pow2 elements.
35443 35455     if ((VecVT.getSizeInBits() % 128) != 0 ||
35444 35456         !isPowerOf2_32(VecVT.getVectorNumElements()))
35445 35457       return SDValue();
35446       -
35447       -   SDLoc DL(ExtElt);
35448 35458
35449 35459     // vXi8 reduction - sum lo/hi halves then use PSADBW.
35450 35460     if (VT == MVT::i8) {
11591159 define i8 @test_v8i8(<8 x i8> %a0) {
11601160 ; SSE2-LABEL: test_v8i8:
11611161 ; SSE2: # %bb.0:
1162 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1163 ; SSE2-NEXT: paddb %xmm0, %xmm1
1164 ; SSE2-NEXT: movdqa %xmm1, %xmm0
1165 ; SSE2-NEXT: psrld $16, %xmm0
1166 ; SSE2-NEXT: paddb %xmm1, %xmm0
1167 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1168 ; SSE2-NEXT: psrlw $8, %xmm1
1169 ; SSE2-NEXT: paddb %xmm0, %xmm1
1162 ; SSE2-NEXT: pxor %xmm1, %xmm1
1163 ; SSE2-NEXT: psadbw %xmm0, %xmm1
11701164 ; SSE2-NEXT: movd %xmm1, %eax
11711165 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
11721166 ; SSE2-NEXT: retq
11731167 ;
11741168 ; SSE41-LABEL: test_v8i8:
11751169 ; SSE41: # %bb.0:
1176 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1177 ; SSE41-NEXT: paddb %xmm0, %xmm1
1178 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1179 ; SSE41-NEXT: psrld $16, %xmm0
1180 ; SSE41-NEXT: paddb %xmm1, %xmm0
1181 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1182 ; SSE41-NEXT: psrlw $8, %xmm1
1183 ; SSE41-NEXT: paddb %xmm0, %xmm1
1170 ; SSE41-NEXT: pxor %xmm1, %xmm1
1171 ; SSE41-NEXT: psadbw %xmm0, %xmm1
11841172 ; SSE41-NEXT: pextrb $0, %xmm1, %eax
11851173 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
11861174 ; SSE41-NEXT: retq
11871175 ;
11881176 ; AVX-LABEL: test_v8i8:
11891177 ; AVX: # %bb.0:
1190 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1191 ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1192 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
1193 ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1194 ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
1195 ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1178 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
1179 ; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
11961180 ; AVX-NEXT: vpextrb $0, %xmm0, %eax
11971181 ; AVX-NEXT: # kill: def $al killed $al killed $eax
11981182 ; AVX-NEXT: retq
11991183 ;
12001184 ; AVX512-LABEL: test_v8i8:
12011185 ; AVX512: # %bb.0:
1202 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1203 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1204 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
1205 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1206 ; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
1207 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1186 ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
1187 ; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
12081188 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
12091189 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
12101190 ; AVX512-NEXT: retq
12161196 ; SSE2-LABEL: test_v8i8_load:
12171197 ; SSE2: # %bb.0:
12181198 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
1219 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1220 ; SSE2-NEXT: paddb %xmm0, %xmm1
1221 ; SSE2-NEXT: movdqa %xmm1, %xmm0
1222 ; SSE2-NEXT: psrld $16, %xmm0
1223 ; SSE2-NEXT: paddb %xmm1, %xmm0
1224 ; SSE2-NEXT: movdqa %xmm0, %xmm1
1225 ; SSE2-NEXT: psrlw $8, %xmm1
1226 ; SSE2-NEXT: paddb %xmm0, %xmm1
1199 ; SSE2-NEXT: pxor %xmm1, %xmm1
1200 ; SSE2-NEXT: psadbw %xmm0, %xmm1
12271201 ; SSE2-NEXT: movd %xmm1, %eax
12281202 ; SSE2-NEXT: # kill: def $al killed $al killed $eax
12291203 ; SSE2-NEXT: retq
12311205 ; SSE41-LABEL: test_v8i8_load:
12321206 ; SSE41: # %bb.0:
12331207 ; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
1234 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1235 ; SSE41-NEXT: paddb %xmm0, %xmm1
1236 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1237 ; SSE41-NEXT: psrld $16, %xmm0
1238 ; SSE41-NEXT: paddb %xmm1, %xmm0
1239 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1240 ; SSE41-NEXT: psrlw $8, %xmm1
1241 ; SSE41-NEXT: paddb %xmm0, %xmm1
1208 ; SSE41-NEXT: pxor %xmm1, %xmm1
1209 ; SSE41-NEXT: psadbw %xmm0, %xmm1
12421210 ; SSE41-NEXT: pextrb $0, %xmm1, %eax
12431211 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
12441212 ; SSE41-NEXT: retq
12451213 ;
1246 ; AVX1-LABEL: test_v8i8_load:
1247 ; AVX1: # %bb.0:
1248 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1249 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
1250 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1251 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
1252 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1253 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
1254 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1255 ; AVX1-NEXT: vpextrb $0, %xmm0, %eax
1256 ; AVX1-NEXT: # kill: def $al killed $al killed $eax
1257 ; AVX1-NEXT: retq
1258 ;
1259 ; AVX2-LABEL: test_v8i8_load:
1260 ; AVX2: # %bb.0:
1261 ; AVX2-NEXT: movq (%rdi), %rax
1262 ; AVX2-NEXT: vmovq %rax, %xmm0
1263 ; AVX2-NEXT: shrq $32, %rax
1264 ; AVX2-NEXT: vmovd %eax, %xmm1
1265 ; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
1266 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1267 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
1268 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1269 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
1270 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1271 ; AVX2-NEXT: vpextrb $0, %xmm0, %eax
1272 ; AVX2-NEXT: # kill: def $al killed $al killed $eax
1273 ; AVX2-NEXT: retq
1274 ;
1275 ; AVX512BW-LABEL: test_v8i8_load:
1276 ; AVX512BW: # %bb.0:
1277 ; AVX512BW-NEXT: movq (%rdi), %rax
1278 ; AVX512BW-NEXT: vmovq %rax, %xmm0
1279 ; AVX512BW-NEXT: shrq $32, %rax
1280 ; AVX512BW-NEXT: vmovd %eax, %xmm1
1281 ; AVX512BW-NEXT: vpbroadcastd %xmm1, %xmm1
1282 ; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1283 ; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1
1284 ; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1285 ; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1
1286 ; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1287 ; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
1288 ; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
1289 ; AVX512BW-NEXT: retq
1290 ;
1291 ; AVX512VL-LABEL: test_v8i8_load:
1292 ; AVX512VL: # %bb.0:
1293 ; AVX512VL-NEXT: movq (%rdi), %rax
1294 ; AVX512VL-NEXT: vmovq %rax, %xmm0
1295 ; AVX512VL-NEXT: shrq $32, %rax
1296 ; AVX512VL-NEXT: vpbroadcastd %eax, %xmm1
1297 ; AVX512VL-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1298 ; AVX512VL-NEXT: vpsrld $16, %xmm0, %xmm1
1299 ; AVX512VL-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1300 ; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1
1301 ; AVX512VL-NEXT: vpaddb %xmm1, %xmm0, %xmm0
1302 ; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax
1303 ; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
1304 ; AVX512VL-NEXT: retq
1214 ; AVX-LABEL: test_v8i8_load:
1215 ; AVX: # %bb.0:
1216 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1217 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
1218 ; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
1219 ; AVX-NEXT: vpextrb $0, %xmm0, %eax
1220 ; AVX-NEXT: # kill: def $al killed $al killed $eax
1221 ; AVX-NEXT: retq
1222 ;
1223 ; AVX512-LABEL: test_v8i8_load:
1224 ; AVX512: # %bb.0:
1225 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1226 ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
1227 ; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
1228 ; AVX512-NEXT: vpextrb $0, %xmm0, %eax
1229 ; AVX512-NEXT: # kill: def $al killed $al killed $eax
1230 ; AVX512-NEXT: retq
13051231 %a0 = load <8 x i8>, <8 x i8>* %p
13061232 %1 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> %a0)
13071233 ret i8 %1