llvm.org GIT mirror llvm / f3303c9
[AVX512] Add popcount support for v32i16 and v64i8. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@266858 91177308-0d34-0410-b5e6-96231b3b80d8 Craig Topper 3 years ago
2 changed file(s) with 72 addition(s) and 44 deletion(s). Raw diff Collapse all Expand all
15271527 setOperationAction(ISD::SRA, VT, Custom);
15281528 setOperationAction(ISD::MLOAD, VT, Legal);
15291529 setOperationAction(ISD::MSTORE, VT, Legal);
1530 setOperationAction(ISD::CTPOP, VT, Custom);
15301531
15311532 setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
15321533 setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
2057920580 int NumByteElts = VecSize / 8;
2058020581 MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
2058120582 SDValue In = DAG.getBitcast(ByteVecVT, Op);
20582 SmallVector<SDValue, 16> LUTVec;
20583 SmallVector<SDValue, 64> LUTVec;
2058320584 for (int i = 0; i < NumByteElts; ++i)
2058420585 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
2058520586 SDValue InRegLUT = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, LUTVec);
2067520676 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
2067620677 SelectionDAG &DAG) {
2067720678 MVT VT = Op.getSimpleValueType();
20678 // FIXME: Need to add AVX-512 support here!
20679 assert((VT.is256BitVector() || VT.is128BitVector()) &&
20679 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
2068020680 "Unknown CTPOP type to handle");
2068120681 SDLoc DL(Op.getNode());
2068220682 SDValue Op0 = Op.getOperand(0);
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512CD
1 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F
2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
23
34 define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
45 ; ALL-LABEL: testv8i64:
105106 }
106107
107108 define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
108 ; ALL-LABEL: testv32i16:
109 ; ALL: ## BB#0:
110 ; ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
111 ; ALL-NEXT: vpand %ymm2, %ymm0, %ymm3
112 ; ALL-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
113 ; ALL-NEXT: vpshufb %ymm3, %ymm4, %ymm3
114 ; ALL-NEXT: vpsrlw $4, %ymm0, %ymm0
115 ; ALL-NEXT: vpand %ymm2, %ymm0, %ymm0
116 ; ALL-NEXT: vpshufb %ymm0, %ymm4, %ymm0
117 ; ALL-NEXT: vpaddb %ymm3, %ymm0, %ymm0
118 ; ALL-NEXT: vpsllw $8, %ymm0, %ymm3
119 ; ALL-NEXT: vpaddb %ymm0, %ymm3, %ymm0
120 ; ALL-NEXT: vpsrlw $8, %ymm0, %ymm0
121 ; ALL-NEXT: vpand %ymm2, %ymm1, %ymm3
122 ; ALL-NEXT: vpshufb %ymm3, %ymm4, %ymm3
123 ; ALL-NEXT: vpsrlw $4, %ymm1, %ymm1
124 ; ALL-NEXT: vpand %ymm2, %ymm1, %ymm1
125 ; ALL-NEXT: vpshufb %ymm1, %ymm4, %ymm1
126 ; ALL-NEXT: vpaddb %ymm3, %ymm1, %ymm1
127 ; ALL-NEXT: vpsllw $8, %ymm1, %ymm2
128 ; ALL-NEXT: vpaddb %ymm1, %ymm2, %ymm1
129 ; ALL-NEXT: vpsrlw $8, %ymm1, %ymm1
130 ; ALL-NEXT: retq
109 ; AVX512F-LABEL: testv32i16:
110 ; AVX512F: ## BB#0:
111 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
112 ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3
113 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
114 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
115 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
116 ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
117 ; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0
118 ; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0
119 ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm3
120 ; AVX512F-NEXT: vpaddb %ymm0, %ymm3, %ymm0
121 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
122 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
123 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
124 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
125 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
126 ; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1
127 ; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
128 ; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm2
129 ; AVX512F-NEXT: vpaddb %ymm1, %ymm2, %ymm1
130 ; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
131 ; AVX512F-NEXT: retq
132 ;
133 ; AVX512BW-LABEL: testv32i16:
134 ; AVX512BW: ## BB#0:
135 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
136 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
137 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
138 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
139 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
140 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
141 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
142 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
143 ; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1
144 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
145 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
146 ; AVX512BW-NEXT: retq
131147 %out = call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %in)
132148 ret <32 x i16> %out
133149 }
134150
135151 define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
136 ; ALL-LABEL: testv64i8:
137 ; ALL: ## BB#0:
138 ; ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
139 ; ALL-NEXT: vpand %ymm2, %ymm0, %ymm3
140 ; ALL-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
141 ; ALL-NEXT: vpshufb %ymm3, %ymm4, %ymm3
142 ; ALL-NEXT: vpsrlw $4, %ymm0, %ymm0
143 ; ALL-NEXT: vpand %ymm2, %ymm0, %ymm0
144 ; ALL-NEXT: vpshufb %ymm0, %ymm4, %ymm0
145 ; ALL-NEXT: vpaddb %ymm3, %ymm0, %ymm0
146 ; ALL-NEXT: vpand %ymm2, %ymm1, %ymm3
147 ; ALL-NEXT: vpshufb %ymm3, %ymm4, %ymm3
148 ; ALL-NEXT: vpsrlw $4, %ymm1, %ymm1
149 ; ALL-NEXT: vpand %ymm2, %ymm1, %ymm1
150 ; ALL-NEXT: vpshufb %ymm1, %ymm4, %ymm1
151 ; ALL-NEXT: vpaddb %ymm3, %ymm1, %ymm1
152 ; ALL-NEXT: retq
152 ; AVX512F-LABEL: testv64i8:
153 ; AVX512F: ## BB#0:
154 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
155 ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3
156 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
157 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
158 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
159 ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
160 ; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0
161 ; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0
162 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
163 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
164 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
165 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
166 ; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1
167 ; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
168 ; AVX512F-NEXT: retq
169 ;
170 ; AVX512BW-LABEL: testv64i8:
171 ; AVX512BW: ## BB#0:
172 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
173 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
174 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
175 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
176 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
177 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
178 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
179 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
180 ; AVX512BW-NEXT: retq
153181 %out = call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %in)
154182 ret <64 x i8> %out
155183 }