llvm.org GIT mirror llvm / 5728970
[CostModel][X86] Add avx2 two-src shuffle costs
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@310645 91177308-0d34-0410-b5e6-96231b3b80d8
Author: Simon Pilgrim (2 years ago)
3 changed file(s) with 45 addition(s) and 36 deletion(s).
[Raw diff] [Collapse all] [Expand all]
841841 { TTI::SK_PermuteSingleSrc, MVT::v8f32, 1 }, // vpermps
842842 { TTI::SK_PermuteSingleSrc, MVT::v4i64, 1 }, // vpermq
843843 { TTI::SK_PermuteSingleSrc, MVT::v8i32, 1 }, // vpermd
844 { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vperm2i128 + 2 * vpshufb
844 { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vperm2i128 + 2*vpshufb
845845 // + vpblendvb
846 { TTI::SK_PermuteSingleSrc, MVT::v32i8, 4 } // vperm2i128 + 2 * vpshufb
846 { TTI::SK_PermuteSingleSrc, MVT::v32i8, 4 }, // vperm2i128 + 2*vpshufb
847 // + vpblendvb
848
849 { TTI::SK_PermuteTwoSrc, MVT::v4f64, 3 }, // 2*vpermpd + vblendpd
850 { TTI::SK_PermuteTwoSrc, MVT::v8f32, 3 }, // 2*vpermps + vblendps
851 { TTI::SK_PermuteTwoSrc, MVT::v4i64, 3 }, // 2*vpermq + vpblendd
852 { TTI::SK_PermuteTwoSrc, MVT::v8i32, 3 }, // 2*vpermd + vpblendd
853 { TTI::SK_PermuteTwoSrc, MVT::v16i16, 7 }, // 2*vperm2i128 + 4*vpshufb
854 // + vpblendvb
855 { TTI::SK_PermuteTwoSrc, MVT::v32i8, 7 }, // 2*vperm2i128 + 4*vpshufb
847856 // + vpblendvb
848857 };
849858
3333 ; SSSE3: cost of 12 {{.*}} %V512 = shufflevector
3434 ; SSE42: cost of 12 {{.*}} %V512 = shufflevector
3535 ; AVX1: cost of 12 {{.*}} %V512 = shufflevector
36 ; AVX2: cost of 12 {{.*}} %V512 = shufflevector
36 ; AVX2: cost of 6 {{.*}} %V512 = shufflevector
3737 ; AVX512: cost of 1 {{.*}} %V512 = shufflevector
3838 %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32>
3939
4141 ; SSSE3: cost of 56 {{.*}} %V1024 = shufflevector
4242 ; SSE42: cost of 56 {{.*}} %V1024 = shufflevector
4343 ; AVX1: cost of 72 {{.*}} %V1024 = shufflevector
44 ; AVX2: cost of 72 {{.*}} %V1024 = shufflevector
44 ; AVX2: cost of 36 {{.*}} %V1024 = shufflevector
4545 ; AVX512: cost of 2 {{.*}} %V1024 = shufflevector
4646 %V1024 = shufflevector <16 x double> %src1024, <16 x double> undef, <16 x i32>
4747
7171 ; SSSE3: cost of 12 {{.*}} %V512 = shufflevector
7272 ; SSE42: cost of 12 {{.*}} %V512 = shufflevector
7373 ; AVX1: cost of 16 {{.*}} %V512 = shufflevector
74 ; AVX2: cost of 16 {{.*}} %V512 = shufflevector
74 ; AVX2: cost of 6 {{.*}} %V512 = shufflevector
7575 ; AVX512: cost of 1 {{.*}} %V512 = shufflevector
7676 %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32>
7777
101101 ; SSSE3: cost of 24 {{.*}} %V512 = shufflevector
102102 ; SSE42: cost of 24 {{.*}} %V512 = shufflevector
103103 ; AVX1: cost of 28 {{.*}} %V512 = shufflevector
104 ; AVX2: cost of 28 {{.*}} %V512 = shufflevector
104 ; AVX2: cost of 6 {{.*}} %V512 = shufflevector
105105 ; AVX512: cost of 1 {{.*}} %V512 = shufflevector
106106 %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32>
107107
131131 ; SSSE3: cost of 24 {{.*}} %V512 = shufflevector
132132 ; SSE42: cost of 24 {{.*}} %V512 = shufflevector
133133 ; AVX1: cost of 32 {{.*}} %V512 = shufflevector
134 ; AVX2: cost of 32 {{.*}} %V512 = shufflevector
134 ; AVX2: cost of 6 {{.*}} %V512 = shufflevector
135135 ; AVX512: cost of 1 {{.*}} %V512 = shufflevector
136136 %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32>
137137
139139 ; SSSE3: cost of 112 {{.*}} %V1024 = shufflevector
140140 ; SSE42: cost of 112 {{.*}} %V1024 = shufflevector
141141 ; AVX1: cost of 192 {{.*}} %V1024 = shufflevector
142 ; AVX2: cost of 192 {{.*}} %V1024 = shufflevector
142 ; AVX2: cost of 36 {{.*}} %V1024 = shufflevector
143143 ; AVX512: cost of 2 {{.*}} %V1024 = shufflevector
144144 %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> undef, <32 x i32>
145145 ret void
172172 ; SSSE3: cost of 36 {{.*}} %V512 = shufflevector
173173 ; SSE42: cost of 36 {{.*}} %V512 = shufflevector
174174 ; AVX1: cost of 64 {{.*}} %V512 = shufflevector
175 ; AVX2: cost of 64 {{.*}} %V512 = shufflevector
176 ; AVX512F: cost of 64 {{.*}} %V512 = shufflevector
175 ; AVX2: cost of 14 {{.*}} %V512 = shufflevector
176 ; AVX512F: cost of 14 {{.*}} %V512 = shufflevector
177177 ; AVX512BW: cost of 1 {{.*}} %V512 = shufflevector
178178 ; AVX512VBMI: cost of 1 {{.*}} %V512 = shufflevector
179179 %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32>
182182 ; SSSE3: cost of 168 {{.*}} %V1024 = shufflevector
183183 ; SSE42: cost of 168 {{.*}} %V1024 = shufflevector
184184 ; AVX1: cost of 384 {{.*}} %V1024 = shufflevector
185 ; AVX2: cost of 384 {{.*}} %V1024 = shufflevector
186 ; AVX512F: cost of 384 {{.*}} %V1024 = shufflevector
185 ; AVX2: cost of 84 {{.*}} %V1024 = shufflevector
186 ; AVX512F: cost of 84 {{.*}} %V1024 = shufflevector
187187 ; AVX512BW: cost of 2 {{.*}} %V1024 = shufflevector
188188 ; AVX512VBMI: cost of 2 {{.*}} %V1024 = shufflevector
189189 %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32>
214214 ; SSSE3: cost of 36 {{.*}} %V512 = shufflevector
215215 ; SSE42: cost of 36 {{.*}} %V512 = shufflevector
216216 ; AVX1: cost of 128 {{.*}} %V512 = shufflevector
217 ; AVX2: cost of 128 {{.*}} %V512 = shufflevector
218 ; AVX512F: cost of 128 {{.*}} %V512 = shufflevector
217 ; AVX2: cost of 14 {{.*}} %V512 = shufflevector
218 ; AVX512F: cost of 14 {{.*}} %V512 = shufflevector
219219 ; AVX512BW: cost of 8 {{.*}} %V512 = shufflevector
220220 ; AVX512VBMI: cost of 1 {{.*}} %V512 = shufflevector
221221 %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32>
2525 ; SSSE3: cost of 6 {{.*}} %V256 = shufflevector
2626 ; SSE42: cost of 6 {{.*}} %V256 = shufflevector
2727 ; AVX1: cost of 6 {{.*}} %V256 = shufflevector
28 ; AVX2: cost of 6 {{.*}} %V256 = shufflevector
28 ; AVX2: cost of 3 {{.*}} %V256 = shufflevector
2929 ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
3030 %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32>
3131
3333 ; SSSE3: cost of 28 {{.*}} %V512 = shufflevector
3434 ; SSE42: cost of 28 {{.*}} %V512 = shufflevector
3535 ; AVX1: cost of 12 {{.*}} %V512 = shufflevector
36 ; AVX2: cost of 12 {{.*}} %V512 = shufflevector
36 ; AVX2: cost of 18 {{.*}} %V512 = shufflevector
3737 ; AVX512: cost of 1 {{.*}} %V512 = shufflevector
3838 %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32>
3939
4141 ; SSSE3: cost of 120 {{.*}} %V1024 = shufflevector
4242 ; SSE42: cost of 120 {{.*}} %V1024 = shufflevector
4343 ; AVX1: cost of 24 {{.*}} %V1024 = shufflevector
44 ; AVX2: cost of 24 {{.*}} %V1024 = shufflevector
44 ; AVX2: cost of 84 {{.*}} %V1024 = shufflevector
4545 ; AVX512: cost of 6 {{.*}} %V1024 = shufflevector
4646 %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32>
4747
6363 ; SSSE3: cost of 6 {{.*}} %V256 = shufflevector
6464 ; SSE42: cost of 6 {{.*}} %V256 = shufflevector
6565 ; AVX1: cost of 8 {{.*}} %V256 = shufflevector
66 ; AVX2: cost of 8 {{.*}} %V256 = shufflevector
66 ; AVX2: cost of 3 {{.*}} %V256 = shufflevector
6767 ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
6868 %V256 = shufflevector <4 x i64> %src256, <4 x i64> %src256_1, <4 x i32>
6969
7171 ; SSSE3: cost of 28 {{.*}} %V512 = shufflevector
7272 ; SSE42: cost of 28 {{.*}} %V512 = shufflevector
7373 ; AVX1: cost of 16 {{.*}} %V512 = shufflevector
74 ; AVX2: cost of 16 {{.*}} %V512 = shufflevector
74 ; AVX2: cost of 18 {{.*}} %V512 = shufflevector
7575 ; AVX512: cost of 1 {{.*}} %V512 = shufflevector
7676 %V512 = shufflevector <8 x i64> %src512, <8 x i64> %src512_1, <8 x i32>
7777
7979 ; SSSE3: cost of 120 {{.*}} %V1024 = shufflevector
8080 ; SSE42: cost of 120 {{.*}} %V1024 = shufflevector
8181 ; AVX1: cost of 32 {{.*}} %V1024 = shufflevector
82 ; AVX2: cost of 32 {{.*}} %V1024 = shufflevector
82 ; AVX2: cost of 84 {{.*}} %V1024 = shufflevector
8383 ; AVX512: cost of 6 {{.*}} %V1024 = shufflevector
8484 %V1024 = shufflevector <16 x i64> %src1024, <16 x i64> %src1024_1, <16 x i32>
8585
101101 ; SSSE3: cost of 12 {{.*}} %V256 = shufflevector
102102 ; SSE42: cost of 12 {{.*}} %V256 = shufflevector
103103 ; AVX1: cost of 14 {{.*}} %V256 = shufflevector
104 ; AVX2: cost of 14 {{.*}} %V256 = shufflevector
104 ; AVX2: cost of 3 {{.*}} %V256 = shufflevector
105105 ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
106106 %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32>
107107
109109 ; SSSE3: cost of 56 {{.*}} %V512 = shufflevector
110110 ; SSE42: cost of 56 {{.*}} %V512 = shufflevector
111111 ; AVX1: cost of 28 {{.*}} %V512 = shufflevector
112 ; AVX2: cost of 28 {{.*}} %V512 = shufflevector
112 ; AVX2: cost of 18 {{.*}} %V512 = shufflevector
113113 ; AVX512: cost of 1 {{.*}} %V512 = shufflevector
114114 %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32>
115115
117117 ; SSSE3: cost of 240 {{.*}} %V1024 = shufflevector
118118 ; SSE42: cost of 240 {{.*}} %V1024 = shufflevector
119119 ; AVX1: cost of 56 {{.*}} %V1024 = shufflevector
120 ; AVX2: cost of 56 {{.*}} %V1024 = shufflevector
120 ; AVX2: cost of 84 {{.*}} %V1024 = shufflevector
121121 ; AVX512: cost of 6 {{.*}} %V1024 = shufflevector
122122 %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32>
123123
139139 ; SSSE3: cost of 12 {{.*}} %V256 = shufflevector
140140 ; SSE42: cost of 12 {{.*}} %V256 = shufflevector
141141 ; AVX1: cost of 16 {{.*}} %V256 = shufflevector
142 ; AVX2: cost of 16 {{.*}} %V256 = shufflevector
142 ; AVX2: cost of 3 {{.*}} %V256 = shufflevector
143143 ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
144144 %V256 = shufflevector <8 x i32> %src256, <8 x i32> %src256_1, <8 x i32>
145145
147147 ; SSSE3: cost of 56 {{.*}} %V512 = shufflevector
148148 ; SSE42: cost of 56 {{.*}} %V512 = shufflevector
149149 ; AVX1: cost of 32 {{.*}} %V512 = shufflevector
150 ; AVX2: cost of 32 {{.*}} %V512 = shufflevector
150 ; AVX2: cost of 18 {{.*}} %V512 = shufflevector
151151 ; AVX512: cost of 1 {{.*}} %V512 = shufflevector
152152 %V512 = shufflevector <16 x i32> %src512, <16 x i32> %src512_1, <16 x i32>
153153
155155 ; SSSE3: cost of 240 {{.*}} %V1024 = shufflevector
156156 ; SSE42: cost of 240 {{.*}} %V1024 = shufflevector
157157 ; AVX1: cost of 64 {{.*}} %V1024 = shufflevector
158 ; AVX2: cost of 64 {{.*}} %V1024 = shufflevector
158 ; AVX2: cost of 84 {{.*}} %V1024 = shufflevector
159159 ; AVX512: cost of 6 {{.*}} %V1024 = shufflevector
160160 %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> %src1024_1, <32 x i32>
161161
179179 ; SSSE3: cost of 18 {{.*}} %V256 = shufflevector
180180 ; SSE42: cost of 18 {{.*}} %V256 = shufflevector
181181 ; AVX1: cost of 32 {{.*}} %V256 = shufflevector
182 ; AVX2: cost of 32 {{.*}} %V256 = shufflevector
183 ; AVX512F: cost of 32 {{.*}} %V256 = shufflevector
182 ; AVX2: cost of 7 {{.*}} %V256 = shufflevector
183 ; AVX512F: cost of 7 {{.*}} %V256 = shufflevector
184184 ; AVX512BW: cost of 1 {{.*}} %V256 = shufflevector
185185 ; AVX512VBMI: cost of 1 {{.*}} %V256 = shufflevector
186186 %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32>
189189 ; SSSE3: cost of 84 {{.*}} %V512 = shufflevector
190190 ; SSE42: cost of 84 {{.*}} %V512 = shufflevector
191191 ; AVX1: cost of 64 {{.*}} %V512 = shufflevector
192 ; AVX2: cost of 64 {{.*}} %V512 = shufflevector
193 ; AVX512F: cost of 64 {{.*}} %V512 = shufflevector
192 ; AVX2: cost of 42 {{.*}} %V512 = shufflevector
193 ; AVX512F: cost of 42 {{.*}} %V512 = shufflevector
194194 ; AVX512BW: cost of 1 {{.*}} %V512 = shufflevector
195195 ; AVX512VBMI: cost of 1 {{.*}} %V512 = shufflevector
196196 %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32>
199199 ; SSSE3: cost of 360 {{.*}} %V1024 = shufflevector
200200 ; SSE42: cost of 360 {{.*}} %V1024 = shufflevector
201201 ; AVX1: cost of 128 {{.*}} %V1024 = shufflevector
202 ; AVX2: cost of 128 {{.*}} %V1024 = shufflevector
203 ; AVX512F: cost of 128 {{.*}} %V1024 = shufflevector
202 ; AVX2: cost of 196 {{.*}} %V1024 = shufflevector
203 ; AVX512F: cost of 196 {{.*}} %V1024 = shufflevector
204204 ; AVX512BW: cost of 6 {{.*}} %V1024 = shufflevector
205205 ; AVX512VBMI: cost of 6 {{.*}} %V1024 = shufflevector
206206 %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32>
225225 ; SSSE3: cost of 18 {{.*}} %V256 = shufflevector
226226 ; SSE42: cost of 18 {{.*}} %V256 = shufflevector
227227 ; AVX1: cost of 64 {{.*}} %V256 = shufflevector
228 ; AVX2: cost of 64 {{.*}} %V256 = shufflevector
229 ; AVX512F: cost of 64 {{.*}} %V256 = shufflevector
228 ; AVX2: cost of 7 {{.*}} %V256 = shufflevector
229 ; AVX512F: cost of 7 {{.*}} %V256 = shufflevector
230230 ; AVX512BW: cost of 3 {{.*}} %V256 = shufflevector
231231 ; AVX512VBMI: cost of 1 {{.*}} %V256 = shufflevector
232232 %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32>
235235 ; SSSE3: cost of 84 {{.*}} %V512 = shufflevector
236236 ; SSE42: cost of 84 {{.*}} %V512 = shufflevector
237237 ; AVX1: cost of 128 {{.*}} %V512 = shufflevector
238 ; AVX2: cost of 128 {{.*}} %V512 = shufflevector
239 ; AVX512F: cost of 128 {{.*}} %V512 = shufflevector
238 ; AVX2: cost of 42 {{.*}} %V512 = shufflevector
239 ; AVX512F: cost of 42 {{.*}} %V512 = shufflevector
240240 ; AVX512BW: cost of 19 {{.*}} %V512 = shufflevector
241241 ; AVX512VBMI: cost of 1 {{.*}} %V512 = shufflevector
242242 %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32>