llvm.org GIT mirror llvm / b0b94e1
[CostModel][X86] Improve single src shuffle costs. Add missing SK_PermuteSingleSrc costs for AVX2 targets and earlier; also added some of the simpler SK_PermuteTwoSrc costs to support splitting of SK_PermuteSingleSrc shuffles. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@310632 91177308-0d34-0410-b5e6-96231b3b80d8 Simon Pilgrim 2 years ago
2 changed file(s) with 96 addition(s) and 71 deletion(s). Raw diff Collapse all Expand all
837837 { TTI::SK_Alternate, MVT::v16i16, 1 }, // vpblendw
838838 { TTI::SK_Alternate, MVT::v32i8, 1 }, // vpblendvb
839839
840 { TTI::SK_PermuteSingleSrc, MVT::v4f64, 1 }, // vpermpd
841 { TTI::SK_PermuteSingleSrc, MVT::v8f32, 1 }, // vpermps
840842 { TTI::SK_PermuteSingleSrc, MVT::v4i64, 1 }, // vpermq
841843 { TTI::SK_PermuteSingleSrc, MVT::v8i32, 1 }, // vpermd
842844 { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vperm2i128 + 2 * vpshufb
871873 { TTI::SK_Alternate, MVT::v8i32, 1 }, // vblendps
872874 { TTI::SK_Alternate, MVT::v8f32, 1 }, // vblendps
873875 { TTI::SK_Alternate, MVT::v16i16, 3 }, // vpand + vpandn + vpor
874 { TTI::SK_Alternate, MVT::v32i8, 3 } // vpand + vpandn + vpor
876 { TTI::SK_Alternate, MVT::v32i8, 3 }, // vpand + vpandn + vpor
877
878 { TTI::SK_PermuteSingleSrc, MVT::v4f64, 3 }, // 2*vperm2f128 + vshufpd
879 { TTI::SK_PermuteSingleSrc, MVT::v4i64, 3 }, // 2*vperm2f128 + vshufpd
880 { TTI::SK_PermuteSingleSrc, MVT::v8f32, 4 }, // 2*vperm2f128 + 2*vshufps
881 { TTI::SK_PermuteSingleSrc, MVT::v8i32, 4 }, // 2*vperm2f128 + 2*vshufps
882 { TTI::SK_PermuteSingleSrc, MVT::v16i16, 8 }, // vextractf128 + 4*pshufb
883 // + 2*por + vinsertf128
884 { TTI::SK_PermuteSingleSrc, MVT::v32i8, 8 }, // vextractf128 + 4*pshufb
885 // + 2*por + vinsertf128
875886 };
876887
877888 if (ST->hasAVX())
898909 { TTI::SK_Reverse, MVT::v8i16, 1 }, // pshufb
899910 { TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb
900911
901 { TTI::SK_Alternate, MVT::v8i16, 3 }, // pshufb + pshufb + por
902 { TTI::SK_Alternate, MVT::v16i8, 3 }, // pshufb + pshufb + por
912 { TTI::SK_Alternate, MVT::v8i16, 3 }, // 2*pshufb + por
913 { TTI::SK_Alternate, MVT::v16i8, 3 }, // 2*pshufb + por
903914
904915 { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // pshufb
905 { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 } // pshufb
916 { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }, // pshufb
917
918 { TTI::SK_PermuteTwoSrc, MVT::v8i16, 3 }, // 2*pshufb + por
919 { TTI::SK_PermuteTwoSrc, MVT::v16i8, 3 }, // 2*pshufb + por
906920 };
907921
908922 if (ST->hasSSSE3())
913927 { TTI::SK_Broadcast, MVT::v2f64, 1 }, // shufpd
914928 { TTI::SK_Broadcast, MVT::v2i64, 1 }, // pshufd
915929 { TTI::SK_Broadcast, MVT::v4i32, 1 }, // pshufd
916 { TTI::SK_Broadcast, MVT::v8i16, 2 }, // pshuflw + pshufd
930 { TTI::SK_Broadcast, MVT::v8i16, 2 }, // pshuflw + pshufd
917931 { TTI::SK_Broadcast, MVT::v16i8, 3 }, // unpck + pshuflw + pshufd
918932
919933 { TTI::SK_Reverse, MVT::v2f64, 1 }, // shufpd
920934 { TTI::SK_Reverse, MVT::v2i64, 1 }, // pshufd
921935 { TTI::SK_Reverse, MVT::v4i32, 1 }, // pshufd
922 { TTI::SK_Reverse, MVT::v8i16, 3 }, // pshuflw + pshufhw + pshufd
936 { TTI::SK_Reverse, MVT::v8i16, 3 }, // pshuflw + pshufhw + pshufd
923937 { TTI::SK_Reverse, MVT::v16i8, 9 }, // 2*pshuflw + 2*pshufhw
924938 // + 2*pshufd + 2*unpck + packus
925939
929943 { TTI::SK_Alternate, MVT::v8i16, 3 }, // pand + pandn + por
930944 { TTI::SK_Alternate, MVT::v16i8, 3 }, // pand + pandn + por
931945
932 { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // pshufd
933 { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 } // pshufd
946 { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // shufpd
947 { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // pshufd
948 { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 }, // pshufd
949 { TTI::SK_PermuteSingleSrc, MVT::v8i16, 5 }, // 2*pshuflw + 2*pshufhw
950 // + pshufd/unpck
951 { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
952 // + 2*pshufd + 2*unpck + 2*packus
953
954 { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
955 { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
956 { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
934957 };
935958
936959 if (ST->hasSSE2())
938961 return LT.first * Entry->Cost;
939962
940963 static const CostTblEntry SSE1ShuffleTbl[] = {
941 { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
942 { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
943 { TTI::SK_Alternate, MVT::v4f32, 2 } // 2*shufps
964 { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
965 { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
966 { TTI::SK_Alternate, MVT::v4f32, 2 }, // 2*shufps
967 { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
968 { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
944969 };
945970
946971 if (ST->hasSSE1())
1212 ; CHECK-LABEL: 'test_vXf64'
1313 define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> %src512, <16 x double> %src1024) {
1414
15 ; SSE2: cost of 2 {{.*}} %V128 = shufflevector
16 ; SSSE3: cost of 2 {{.*}} %V128 = shufflevector
17 ; SSE42: cost of 2 {{.*}} %V128 = shufflevector
18 ; AVX1: cost of 2 {{.*}} %V128 = shufflevector
19 ; AVX2: cost of 2 {{.*}} %V128 = shufflevector
15 ; SSE2: cost of 1 {{.*}} %V128 = shufflevector
16 ; SSSE3: cost of 1 {{.*}} %V128 = shufflevector
17 ; SSE42: cost of 1 {{.*}} %V128 = shufflevector
18 ; AVX1: cost of 1 {{.*}} %V128 = shufflevector
19 ; AVX2: cost of 1 {{.*}} %V128 = shufflevector
2020 ; AVX512: cost of 1 {{.*}} %V128 = shufflevector
2121 %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32>
2222
23 ; SSE2: cost of 4 {{.*}} %V256 = shufflevector
24 ; SSSE3: cost of 4 {{.*}} %V256 = shufflevector
25 ; SSE42: cost of 4 {{.*}} %V256 = shufflevector
26 ; AVX1: cost of 6 {{.*}} %V256 = shufflevector
27 ; AVX2: cost of 6 {{.*}} %V256 = shufflevector
23 ; SSE2: cost of 2 {{.*}} %V256 = shufflevector
24 ; SSSE3: cost of 2 {{.*}} %V256 = shufflevector
25 ; SSE42: cost of 2 {{.*}} %V256 = shufflevector
26 ; AVX1: cost of 3 {{.*}} %V256 = shufflevector
27 ; AVX2: cost of 1 {{.*}} %V256 = shufflevector
2828 ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
2929 %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32>
3030
31 ; SSE2: cost of 24 {{.*}} %V512 = shufflevector
32 ; SSSE3: cost of 24 {{.*}} %V512 = shufflevector
33 ; SSE42: cost of 24 {{.*}} %V512 = shufflevector
31 ; SSE2: cost of 12 {{.*}} %V512 = shufflevector
32 ; SSSE3: cost of 12 {{.*}} %V512 = shufflevector
33 ; SSE42: cost of 12 {{.*}} %V512 = shufflevector
3434 ; AVX1: cost of 12 {{.*}} %V512 = shufflevector
3535 ; AVX2: cost of 12 {{.*}} %V512 = shufflevector
3636 ; AVX512: cost of 1 {{.*}} %V512 = shufflevector
3737 %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32>
3838
39 ; SSE2: cost of 112 {{.*}} %V1024 = shufflevector
40 ; SSSE3: cost of 112 {{.*}} %V1024 = shufflevector
41 ; SSE42: cost of 112 {{.*}} %V1024 = shufflevector
39 ; SSE2: cost of 56 {{.*}} %V1024 = shufflevector
40 ; SSSE3: cost of 56 {{.*}} %V1024 = shufflevector
41 ; SSE42: cost of 56 {{.*}} %V1024 = shufflevector
4242 ; AVX1: cost of 72 {{.*}} %V1024 = shufflevector
4343 ; AVX2: cost of 72 {{.*}} %V1024 = shufflevector
4444 ; AVX512: cost of 2 {{.*}} %V1024 = shufflevector
5858 ; AVX512: cost of 1 {{.*}} %V128 = shufflevector
5959 %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32>
6060
61 ; SSE2: cost of 8 {{.*}} %V256 = shufflevector
62 ; SSSE3: cost of 8 {{.*}} %V256 = shufflevector
63 ; SSE42: cost of 8 {{.*}} %V256 = shufflevector
64 ; AVX1: cost of 8 {{.*}} %V256 = shufflevector
61 ; SSE2: cost of 2 {{.*}} %V256 = shufflevector
62 ; SSSE3: cost of 2 {{.*}} %V256 = shufflevector
63 ; SSE42: cost of 2 {{.*}} %V256 = shufflevector
64 ; AVX1: cost of 3 {{.*}} %V256 = shufflevector
6565 ; AVX2: cost of 1 {{.*}} %V256 = shufflevector
6666 ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
6767 %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32>
6868
69 ; SSE2: cost of 48 {{.*}} %V512 = shufflevector
70 ; SSSE3: cost of 48 {{.*}} %V512 = shufflevector
71 ; SSE42: cost of 48 {{.*}} %V512 = shufflevector
69 ; SSE2: cost of 12 {{.*}} %V512 = shufflevector
70 ; SSSE3: cost of 12 {{.*}} %V512 = shufflevector
71 ; SSE42: cost of 12 {{.*}} %V512 = shufflevector
7272 ; AVX1: cost of 16 {{.*}} %V512 = shufflevector
7373 ; AVX2: cost of 16 {{.*}} %V512 = shufflevector
7474 ; AVX512: cost of 1 {{.*}} %V512 = shufflevector
8080 ; CHECK-LABEL: 'test_vXf32'
8181 define void @test_vXf32(<4 x float> %src128, <8 x float> %src256, <16 x float> %src512) {
8282
83 ; SSE2: cost of 6 {{.*}} %V128 = shufflevector
84 ; SSSE3: cost of 6 {{.*}} %V128 = shufflevector
85 ; SSE42: cost of 6 {{.*}} %V128 = shufflevector
86 ; AVX1: cost of 6 {{.*}} %V128 = shufflevector
87 ; AVX2: cost of 6 {{.*}} %V128 = shufflevector
83 ; SSE2: cost of 1 {{.*}} %V128 = shufflevector
84 ; SSSE3: cost of 1 {{.*}} %V128 = shufflevector
85 ; SSE42: cost of 1 {{.*}} %V128 = shufflevector
86 ; AVX1: cost of 1 {{.*}} %V128 = shufflevector
87 ; AVX2: cost of 1 {{.*}} %V128 = shufflevector
8888 ; AVX512: cost of 1 {{.*}} %V128 = shufflevector
8989 %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32>
9090
91 ; SSE2: cost of 12 {{.*}} %V256 = shufflevector
92 ; SSSE3: cost of 12 {{.*}} %V256 = shufflevector
93 ; SSE42: cost of 12 {{.*}} %V256 = shufflevector
94 ; AVX1: cost of 14 {{.*}} %V256 = shufflevector
95 ; AVX2: cost of 14 {{.*}} %V256 = shufflevector
91 ; SSE2: cost of 4 {{.*}} %V256 = shufflevector
92 ; SSSE3: cost of 4 {{.*}} %V256 = shufflevector
93 ; SSE42: cost of 4 {{.*}} %V256 = shufflevector
94 ; AVX1: cost of 4 {{.*}} %V256 = shufflevector
95 ; AVX2: cost of 1 {{.*}} %V256 = shufflevector
9696 ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
9797 %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32>
9898
99 ; SSE2: cost of 72 {{.*}} %V512 = shufflevector
100 ; SSSE3: cost of 72 {{.*}} %V512 = shufflevector
101 ; SSE42: cost of 72 {{.*}} %V512 = shufflevector
99 ; SSE2: cost of 24 {{.*}} %V512 = shufflevector
100 ; SSSE3: cost of 24 {{.*}} %V512 = shufflevector
101 ; SSE42: cost of 24 {{.*}} %V512 = shufflevector
102102 ; AVX1: cost of 28 {{.*}} %V512 = shufflevector
103103 ; AVX2: cost of 28 {{.*}} %V512 = shufflevector
104104 ; AVX512: cost of 1 {{.*}} %V512 = shufflevector
118118 ; AVX512: cost of 1 {{.*}} %V128 = shufflevector
119119 %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32>
120120
121 ; SSE2: cost of 16 {{.*}} %V256 = shufflevector
122 ; SSSE3: cost of 16 {{.*}} %V256 = shufflevector
123 ; SSE42: cost of 16 {{.*}} %V256 = shufflevector
124 ; AVX1: cost of 16 {{.*}} %V256 = shufflevector
121 ; SSE2: cost of 4 {{.*}} %V256 = shufflevector
122 ; SSSE3: cost of 4 {{.*}} %V256 = shufflevector
123 ; SSE42: cost of 4 {{.*}} %V256 = shufflevector
124 ; AVX1: cost of 4 {{.*}} %V256 = shufflevector
125125 ; AVX2: cost of 1 {{.*}} %V256 = shufflevector
126126 ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
127127 %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32>
128128
129 ; SSE2: cost of 96 {{.*}} %V512 = shufflevector
130 ; SSSE3: cost of 96 {{.*}} %V512 = shufflevector
131 ; SSE42: cost of 96 {{.*}} %V512 = shufflevector
129 ; SSE2: cost of 24 {{.*}} %V512 = shufflevector
130 ; SSSE3: cost of 24 {{.*}} %V512 = shufflevector
131 ; SSE42: cost of 24 {{.*}} %V512 = shufflevector
132132 ; AVX1: cost of 32 {{.*}} %V512 = shufflevector
133133 ; AVX2: cost of 32 {{.*}} %V512 = shufflevector
134134 ; AVX512: cost of 1 {{.*}} %V512 = shufflevector
135135 %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32>
136136
137 ; SSE2: cost of 448 {{.*}} %V1024 = shufflevector
138 ; SSSE3: cost of 448 {{.*}} %V1024 = shufflevector
139 ; SSE42: cost of 448 {{.*}} %V1024 = shufflevector
137 ; SSE2: cost of 112 {{.*}} %V1024 = shufflevector
138 ; SSSE3: cost of 112 {{.*}} %V1024 = shufflevector
139 ; SSE42: cost of 112 {{.*}} %V1024 = shufflevector
140140 ; AVX1: cost of 192 {{.*}} %V1024 = shufflevector
141141 ; AVX2: cost of 192 {{.*}} %V1024 = shufflevector
142142 ; AVX512: cost of 2 {{.*}} %V1024 = shufflevector
147147 ; CHECK-LABEL: 'test_vXi16'
148148 define void @test_vXi16(<8 x i16> %src128, <16 x i16> %src256, <32 x i16> %src512, <64 x i16> %src1024) {
149149
150 ; SSE2: cost of 16 {{.*}} %V128 = shufflevector
150 ; SSE2: cost of 5 {{.*}} %V128 = shufflevector
151151 ; SSSE3: cost of 1 {{.*}} %V128 = shufflevector
152152 ; SSE42: cost of 1 {{.*}} %V128 = shufflevector
153153 ; AVX1: cost of 1 {{.*}} %V128 = shufflevector
157157 %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32>
158158
159159 ; SSE2: cost of 32 {{.*}} %V256 = shufflevector
160 ; SSSE3: cost of 32 {{.*}} %V256 = shufflevector
161 ; SSE42: cost of 32 {{.*}} %V256 = shufflevector
162 ; AVX1: cost of 32 {{.*}} %V256 = shufflevector
160 ; SSSE3: cost of 6 {{.*}} %V256 = shufflevector
161 ; SSE42: cost of 6 {{.*}} %V256 = shufflevector
162 ; AVX1: cost of 8 {{.*}} %V256 = shufflevector
163163 ; AVX2: cost of 4 {{.*}} %V256 = shufflevector
164164 ; AVX512F: cost of 4 {{.*}} %V256 = shufflevector
165165 ; AVX512BW: cost of 1 {{.*}} %V256 = shufflevector
166166 %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32>
167167
168168 ; SSE2: cost of 192 {{.*}} %V512 = shufflevector
169 ; SSSE3: cost of 192 {{.*}} %V512 = shufflevector
170 ; SSE42: cost of 192 {{.*}} %V512 = shufflevector
169 ; SSSE3: cost of 36 {{.*}} %V512 = shufflevector
170 ; SSE42: cost of 36 {{.*}} %V512 = shufflevector
171171 ; AVX1: cost of 64 {{.*}} %V512 = shufflevector
172172 ; AVX2: cost of 64 {{.*}} %V512 = shufflevector
173173 ; AVX512F: cost of 64 {{.*}} %V512 = shufflevector
175175 %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32>
176176
177177 ; SSE2: cost of 896 {{.*}} %V1024 = shufflevector
178 ; SSSE3: cost of 896 {{.*}} %V1024 = shufflevector
179 ; SSE42: cost of 896 {{.*}} %V1024 = shufflevector
178 ; SSSE3: cost of 168 {{.*}} %V1024 = shufflevector
179 ; SSE42: cost of 168 {{.*}} %V1024 = shufflevector
180180 ; AVX1: cost of 384 {{.*}} %V1024 = shufflevector
181181 ; AVX2: cost of 384 {{.*}} %V1024 = shufflevector
182182 ; AVX512F: cost of 384 {{.*}} %V1024 = shufflevector
187187
188188 ; CHECK-LABEL: 'test_vXi8'
189189 define void @test_vXi8(<16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512) {
190 ; SSE2: cost of 32 {{.*}} %V128 = shufflevector
190 ; SSE2: cost of 10 {{.*}} %V128 = shufflevector
191191 ; SSSE3: cost of 1 {{.*}} %V128 = shufflevector
192192 ; SSE42: cost of 1 {{.*}} %V128 = shufflevector
193193 ; AVX1: cost of 1 {{.*}} %V128 = shufflevector
196196 %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32>
197197
198198 ; SSE2: cost of 64 {{.*}} %V256 = shufflevector
199 ; SSSE3: cost of 64 {{.*}} %V256 = shufflevector
200 ; SSE42: cost of 64 {{.*}} %V256 = shufflevector
201 ; AVX1: cost of 64 {{.*}} %V256 = shufflevector
199 ; SSSE3: cost of 6 {{.*}} %V256 = shufflevector
200 ; SSE42: cost of 6 {{.*}} %V256 = shufflevector
201 ; AVX1: cost of 8 {{.*}} %V256 = shufflevector
202202 ; AVX2: cost of 4 {{.*}} %V256 = shufflevector
203203 ; AVX512F: cost of 4 {{.*}} %V256 = shufflevector
204204 ; AVX512BW: cost of 3 {{.*}} %V256 = shufflevector
205205 %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32>
206206
207207 ; SSE2: cost of 384 {{.*}} %V512 = shufflevector
208 ; SSSE3: cost of 384 {{.*}} %V512 = shufflevector
209 ; SSE42: cost of 384 {{.*}} %V512 = shufflevector
208 ; SSSE3: cost of 36 {{.*}} %V512 = shufflevector
209 ; SSE42: cost of 36 {{.*}} %V512 = shufflevector
210210 ; AVX1: cost of 128 {{.*}} %V512 = shufflevector
211211 ; AVX2: cost of 128 {{.*}} %V512 = shufflevector
212212 ; AVX512F: cost of 128 {{.*}} %V512 = shufflevector