llvm.org GIT mirror llvm / c4b3565
[x86] Fix a failure to select with AVX-512 when the type legalizer manages to form a VSELECT with a non-i1 element type condition.

Those are technically allowed in SDAG (at least, the generic type legalization logic will form them, and I wouldn't want to try to audit everything to preclude forming them), so we need to be able to lower them. This isn't too hard to implement: we mark VSELECT as custom so we get a chance to lower it in C++, add a fast path for i1 conditions so they are handled directly by the patterns, and add a fallback for when we need to manually force the condition to be an i1, using the vptestm instruction to turn a non-mask into a mask.

This, unsurprisingly, generates awful code, but at least it doesn't crash. The failure was actually impacting open source packages built with LLVM for AVX-512 in the wild, so this quickly lands a patch that stops the immediate bleeding.

I think I've found where to fix the codegen quality issue, but I'm less confident in that change, so it is separated out from this one, which doesn't change the result of any existing test case but keeps the new test from crashing.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@302785 91177308-0d34-0410-b5e6-96231b3b80d8

Chandler Carruth, 3 years ago
2 changed files with 90 additions and 5 deletions.
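As background for the diff below: the problematic input is a 512-bit select whose vector-of-i1 condition the type legalizer splits and promotes into wider integer vectors, so lowering sees a VSELECT with a non-i1 condition. A condensed IR sketch of that scenario, adapted from the test this commit adds (the function and value names here are illustrative, not part of the patch):

define <16 x double> @repro(<16 x float> %x, <16 x float> %y, <16 x double> %a, <16 x double> %b) {
  ; The or of two compares tricks the legalizer into splitting the <16 x i1>
  ; mask and extending the halves to <8 x i64> before lowering runs.
  %gt = fcmp ogt <16 x float> %x, zeroinitializer
  %lt = fcmp olt <16 x float> %y, zeroinitializer
  %m = or <16 x i1> %gt, %lt
  %r = select <16 x i1> %m, <16 x double> %a, <16 x double> %b
  ret <16 x double> %r
}

Compiled with llc for an AVX-512 CPU (e.g. -mcpu=knl or -mcpu=skx), this previously failed to select; with the change, the non-i1 condition is turned back into a mask with vptestm and the blend is emitted, as the CHECK lines in the new test show.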
@@ -1380,7 +1380,7 @@
       setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
       setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
       setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
-      setOperationAction(ISD::VSELECT, VT, Legal);
+      setOperationAction(ISD::VSELECT, VT, Custom);
       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
       setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
       setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
@@ -1444,8 +1444,6 @@
     setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
-    setOperationAction(ISD::VSELECT, MVT::v32i16, Legal);
-    setOperationAction(ISD::VSELECT, MVT::v64i8, Legal);
     setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom);
     setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom);
     setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
@@ -1478,7 +1476,7 @@
 
     for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
       setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
-      setOperationAction(ISD::VSELECT, VT, Legal);
+      setOperationAction(ISD::VSELECT, VT, Custom);
       setOperationAction(ISD::ABS, VT, Legal);
       setOperationAction(ISD::SRL, VT, Custom);
       setOperationAction(ISD::SHL, VT, Custom);
@@ -13816,6 +13814,11 @@
       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
     return SDValue();
 
+  // If this VSELECT has a vector of i1 as a mask, it will be directly matched
+  // with patterns on the mask registers on AVX-512.
+  if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1)
+    return Op;
+
   // Try to lower this to a blend-style vector shuffle. This can handle all
   // constant condition cases.
   if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
@@ -13825,10 +13828,31 @@
   if (!Subtarget.hasSSE41())
     return SDValue();
 
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
+
+  // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
+  // into an i1 condition so that we can use the mask-based 512-bit blend
+  // instructions.
+  if (VT.getSizeInBits() == 512) {
+    SDValue Cond = Op.getOperand(0);
+    // The vNi1 condition case should be handled above as it can be trivially
+    // lowered.
+    assert(Cond.getValueType().getScalarSizeInBits() ==
+               VT.getScalarSizeInBits() &&
+           "Should have a size-matched integer condition!");
+    // Build a mask by testing the condition against itself (tests for non-zero).
+    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+    SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond);
+    // Now return a new VSELECT using the mask.
+    return DAG.getNode(ISD::VSELECT, dl, VT, Mask, Op.getOperand(1),
+                       Op.getOperand(2));
+  }
+
   // Only some types will be legal on some subtargets. If we can emit a legal
   // VSELECT-matching blend, return Op, and but if we need to expand, return
   // a null value.
-  switch (Op.getSimpleValueType().SimpleTy) {
+  switch (VT.SimpleTy) {
   default:
     // Most of the vector types have blends past SSE4.1.
     return Op;
@@ -0,0 +1,61 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mcpu=skx | FileCheck %s --check-prefixes=CHECK,CHECK-SKX
+; RUN: llc < %s -mcpu=knl | FileCheck %s --check-prefixes=CHECK,CHECK-KNL
+
+target triple = "x86_64-unknown-unknown"
+
+define <8 x i64> @test1(<8 x i64> %m, <8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: test1:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpsllq $63, %zmm0, %zmm0
+; CHECK-NEXT: vptestmq %zmm0, %zmm0, %k1
+; CHECK-NEXT: vpblendmq %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: retq
+entry:
+  %m.trunc = trunc <8 x i64> %m to <8 x i1>
+  %ret = select <8 x i1> %m.trunc, <8 x i64> %a, <8 x i64> %b
+  ret <8 x i64> %ret
+}
+
+; This is a very contrived test case to trick the legalizer into splitting the
+; v16i1 masks in the select during type legalization, and in so doing extending
+; them into two v8i64 types. This lets us ensure that the lowering code can handle
+; both formulations of vselect. All of this trickery is because we can't
+; directly form an SDAG input to the lowering.
+define <16 x double> @test2(<16 x float> %x, <16 x float> %y, <16 x double> %a, <16 x double> %b) {
+; CHECK-SKX-LABEL: test2:
+; CHECK-SKX: # BB#0: # %entry
+; CHECK-SKX-NEXT: vxorps %zmm6, %zmm6, %zmm6
+; CHECK-SKX-NEXT: vcmpltps %zmm0, %zmm6, %k0
+; CHECK-SKX-NEXT: vcmpltps %zmm6, %zmm1, %k1
+; CHECK-SKX-NEXT: korw %k1, %k0, %k0
+; CHECK-SKX-NEXT: kshiftrw $8, %k0, %k1
+; CHECK-SKX-NEXT: vpmovm2q %k1, %zmm1
+; CHECK-SKX-NEXT: vpmovm2q %k0, %zmm0
+; CHECK-SKX-NEXT: vptestmq %zmm0, %zmm0, %k1
+; CHECK-SKX-NEXT: vblendmpd %zmm2, %zmm4, %zmm0 {%k1}
+; CHECK-SKX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; CHECK-SKX-NEXT: vblendmpd %zmm3, %zmm5, %zmm1 {%k1}
+; CHECK-SKX-NEXT: retq
+;
+; CHECK-KNL-LABEL: test2:
+; CHECK-KNL: # BB#0: # %entry
+; CHECK-KNL-NEXT: vpxord %zmm6, %zmm6, %zmm6
+; CHECK-KNL-NEXT: vcmpltps %zmm0, %zmm6, %k0
+; CHECK-KNL-NEXT: vcmpltps %zmm6, %zmm1, %k1
+; CHECK-KNL-NEXT: korw %k1, %k0, %k1
+; CHECK-KNL-NEXT: kshiftrw $8, %k1, %k2
+; CHECK-KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
+; CHECK-KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; CHECK-KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; CHECK-KNL-NEXT: vblendmpd %zmm2, %zmm4, %zmm0 {%k1}
+; CHECK-KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
+; CHECK-KNL-NEXT: vblendmpd %zmm3, %zmm5, %zmm1 {%k1}
+; CHECK-KNL-NEXT: retq
+entry:
+  %gt.m = fcmp ogt <16 x float> %x, zeroinitializer
+  %lt.m = fcmp olt <16 x float> %y, zeroinitializer
+  %m.or = or <16 x i1> %gt.m, %lt.m
+  %ret = select <16 x i1> %m.or, <16 x double> %a, <16 x double> %b
+  ret <16 x double> %ret
+}