[X86][AVX512] Add 512-bit vector rotate tests

Author: Simon Pilgrim
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@308146 91177308-0d34-0410-b5e6-96231b3b80d8

1 changed file with 850 additions and 0 deletions.
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VLBW
5
6 ;
7 ; Variable Rotates
8 ;
9
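; The variable-rotate tests below build a rotate-left from its shift halves:
; rol(a, b) == (a << b) | (a >> (bits - b)). As the CHECK lines show, the
; i64/i32 element cases lower directly to vpsllv*/vpsrlv* plus a vporq/vpord,
; while the i16/i8 cases need extra legalization work (see the notes before
; those tests).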
10 define <8 x i64> @var_rotate_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
11 ; AVX512-LABEL: var_rotate_v8i64:
12 ; AVX512: # BB#0:
13 ; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [64,64,64,64,64,64,64,64]
14 ; AVX512-NEXT: vpsubq %zmm1, %zmm2, %zmm2
15 ; AVX512-NEXT: vpsllvq %zmm1, %zmm0, %zmm1
16 ; AVX512-NEXT: vpsrlvq %zmm2, %zmm0, %zmm0
17 ; AVX512-NEXT: vporq %zmm0, %zmm1, %zmm0
18 ; AVX512-NEXT: retq
19 %b64 = sub <8 x i64> <i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64>, %b
20 %shl = shl <8 x i64> %a, %b
21 %lshr = lshr <8 x i64> %a, %b64
22 %or = or <8 x i64> %shl, %lshr
23 ret <8 x i64> %or
24 }
25
26 define <16 x i32> @var_rotate_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
27 ; AVX512-LABEL: var_rotate_v16i32:
28 ; AVX512: # BB#0:
29 ; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
30 ; AVX512-NEXT: vpsubd %zmm1, %zmm2, %zmm2
31 ; AVX512-NEXT: vpsllvd %zmm1, %zmm0, %zmm1
32 ; AVX512-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0
33 ; AVX512-NEXT: vpord %zmm0, %zmm1, %zmm0
34 ; AVX512-NEXT: retq
35 %b32 = sub <16 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %b
36 %shl = shl <16 x i32> %a, %b
37 %lshr = lshr <16 x i32> %a, %b32
38 %or = or <16 x i32> %shl, %lshr
39 ret <16 x i32> %or
40 }
41
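; v32i16: with AVX512BW the variable word shifts map straight onto
; vpsllvw/vpsrlvw plus vporq. Plain AVX512F has no 512-bit variable word
; shift, so each 256-bit half is zero-extended to i32 (vpmovzxwd), shifted
; with vpsllvd/vpsrlvd, and truncated back with vpmovdw.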
42 define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
43 ; AVX512F-LABEL: var_rotate_v32i16:
44 ; AVX512F: # BB#0:
45 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
46 ; AVX512F-NEXT: vpsubw %ymm2, %ymm4, %ymm5
47 ; AVX512F-NEXT: vpsubw %ymm3, %ymm4, %ymm4
48 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
49 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
50 ; AVX512F-NEXT: vpsllvd %zmm3, %zmm1, %zmm3
51 ; AVX512F-NEXT: vpmovdw %zmm3, %ymm3
52 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
53 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
54 ; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm2
55 ; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
56 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
57 ; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
58 ; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
59 ; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
60 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
61 ; AVX512F-NEXT: vpsrlvd %zmm3, %zmm0, %zmm0
62 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
63 ; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
64 ; AVX512F-NEXT: retq
65 ;
66 ; AVX512VL-LABEL: var_rotate_v32i16:
67 ; AVX512VL: # BB#0:
68 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
69 ; AVX512VL-NEXT: vpsubw %ymm2, %ymm4, %ymm5
70 ; AVX512VL-NEXT: vpsubw %ymm3, %ymm4, %ymm4
71 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
72 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
73 ; AVX512VL-NEXT: vpsllvd %zmm3, %zmm1, %zmm3
74 ; AVX512VL-NEXT: vpmovdw %zmm3, %ymm3
75 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
76 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
77 ; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm2
78 ; AVX512VL-NEXT: vpmovdw %zmm2, %ymm2
79 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
80 ; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
81 ; AVX512VL-NEXT: vpmovdw %zmm1, %ymm1
82 ; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
83 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
84 ; AVX512VL-NEXT: vpsrlvd %zmm3, %zmm0, %zmm0
85 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
86 ; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
87 ; AVX512VL-NEXT: retq
88 ;
89 ; AVX512BW-LABEL: var_rotate_v32i16:
90 ; AVX512BW: # BB#0:
91 ; AVX512BW-NEXT: vmovdqu16 {{.*#+}} zmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
92 ; AVX512BW-NEXT: vpsubw %zmm1, %zmm2, %zmm2
93 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
94 ; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
95 ; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
96 ; AVX512BW-NEXT: retq
97 ;
98 ; AVX512VLBW-LABEL: var_rotate_v32i16:
99 ; AVX512VLBW: # BB#0:
100 ; AVX512VLBW-NEXT: vmovdqu16 {{.*#+}} zmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
101 ; AVX512VLBW-NEXT: vpsubw %zmm1, %zmm2, %zmm2
102 ; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
103 ; AVX512VLBW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
104 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
105 ; AVX512VLBW-NEXT: retq
106 %b16 = sub <32 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b
107 %shl = shl <32 x i16> %a, %b
108 %lshr = lshr <32 x i16> %a, %b16
109 %or = or <32 x i16> %shl, %lshr
110 ret <32 x i16> %or
111 }
112
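; v64i8: there is no variable byte shift, so both halves of the rotate are
; emulated bit-serially: shift the whole vector by 4/2/1 with vpsllw/vpsrlw,
; mask off bits that crossed byte boundaries with vpand, and select per byte
; with vpblendvb (AVX512F/VL) or with vpmovb2m plus masked moves (AVX512BW).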
113 define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
114 ; AVX512F-LABEL: var_rotate_v64i8:
115 ; AVX512F: # BB#0:
116 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
117 ; AVX512F-NEXT: vpsubb %ymm2, %ymm5, %ymm4
118 ; AVX512F-NEXT: vpsubb %ymm3, %ymm5, %ymm5
119 ; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm6
120 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
121 ; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6
122 ; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3
123 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm1, %ymm6
124 ; AVX512F-NEXT: vpsllw $2, %ymm6, %ymm8
125 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
126 ; AVX512F-NEXT: vpand %ymm9, %ymm8, %ymm8
127 ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
128 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm8, %ymm6, %ymm6
129 ; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm8
130 ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
131 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm8, %ymm6, %ymm3
132 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm6
133 ; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6
134 ; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2
135 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm6, %ymm0, %ymm6
136 ; AVX512F-NEXT: vpsllw $2, %ymm6, %ymm7
137 ; AVX512F-NEXT: vpand %ymm9, %ymm7, %ymm7
138 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
139 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm6
140 ; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm7
141 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
142 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm2
143 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm6
144 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
145 ; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6
146 ; AVX512F-NEXT: vpsllw $5, %ymm5, %ymm5
147 ; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1
148 ; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm6
149 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
150 ; AVX512F-NEXT: vpand %ymm8, %ymm6, %ymm6
151 ; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5
152 ; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1
153 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm6
154 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
155 ; AVX512F-NEXT: vpand %ymm9, %ymm6, %ymm6
156 ; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5
157 ; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1
158 ; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
159 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3
160 ; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3
161 ; AVX512F-NEXT: vpsllw $5, %ymm4, %ymm4
162 ; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0
163 ; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm3
164 ; AVX512F-NEXT: vpand %ymm8, %ymm3, %ymm3
165 ; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm4
166 ; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0
167 ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm3
168 ; AVX512F-NEXT: vpand %ymm9, %ymm3, %ymm3
169 ; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm4
170 ; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0
171 ; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
172 ; AVX512F-NEXT: retq
173 ;
174 ; AVX512VL-LABEL: var_rotate_v64i8:
175 ; AVX512VL: # BB#0:
176 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
177 ; AVX512VL-NEXT: vpsubb %ymm2, %ymm5, %ymm4
178 ; AVX512VL-NEXT: vpsubb %ymm3, %ymm5, %ymm5
179 ; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm6
180 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
181 ; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm6
182 ; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3
183 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm1, %ymm6
184 ; AVX512VL-NEXT: vpsllw $2, %ymm6, %ymm8
185 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
186 ; AVX512VL-NEXT: vpand %ymm9, %ymm8, %ymm8
187 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
188 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm8, %ymm6, %ymm6
189 ; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm8
190 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
191 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm8, %ymm6, %ymm3
192 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm6
193 ; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm6
194 ; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2
195 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm6, %ymm0, %ymm6
196 ; AVX512VL-NEXT: vpsllw $2, %ymm6, %ymm7
197 ; AVX512VL-NEXT: vpand %ymm9, %ymm7, %ymm7
198 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
199 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm6
200 ; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm7
201 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
202 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm2
203 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm6
204 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
205 ; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm6
206 ; AVX512VL-NEXT: vpsllw $5, %ymm5, %ymm5
207 ; AVX512VL-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1
208 ; AVX512VL-NEXT: vpsrlw $2, %ymm1, %ymm6
209 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
210 ; AVX512VL-NEXT: vpand %ymm8, %ymm6, %ymm6
211 ; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
212 ; AVX512VL-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1
213 ; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm6
214 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
215 ; AVX512VL-NEXT: vpand %ymm9, %ymm6, %ymm6
216 ; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
217 ; AVX512VL-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm1
218 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm5
219 ; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5
220 ; AVX512VL-NEXT: vpsllw $5, %ymm4, %ymm4
221 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm5, %ymm0, %ymm0
222 ; AVX512VL-NEXT: vpsrlw $2, %ymm0, %ymm5
223 ; AVX512VL-NEXT: vpand %ymm8, %ymm5, %ymm5
224 ; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4
225 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm5, %ymm0, %ymm0
226 ; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm5
227 ; AVX512VL-NEXT: vpand %ymm9, %ymm5, %ymm5
228 ; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4
229 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm5, %ymm0, %ymm0
230 ; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
231 ; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
232 ; AVX512VL-NEXT: retq
233 ;
234 ; AVX512BW-LABEL: var_rotate_v64i8:
235 ; AVX512BW: # BB#0:
236 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
237 ; AVX512BW-NEXT: vpsubb %zmm1, %zmm2, %zmm2
238 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm3
239 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
240 ; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
241 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
242 ; AVX512BW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
243 ; AVX512BW-NEXT: vpsllw $2, %zmm3, %zmm4
244 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
245 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
246 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
247 ; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
248 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
249 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
250 ; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
251 ; AVX512BW-NEXT: vpsllw $5, %zmm2, %zmm1
252 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm2
253 ; AVX512BW-NEXT: vpmovb2m %zmm2, %k1
254 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k2
255 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
256 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
257 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
258 ; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm1
259 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
260 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
261 ; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm1
262 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
263 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2
264 ; AVX512BW-NEXT: vpmovb2m %zmm2, %k1
265 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
266 ; AVX512BW-NEXT: vporq %zmm0, %zmm3, %zmm0
267 ; AVX512BW-NEXT: retq
268 ;
269 ; AVX512VLBW-LABEL: var_rotate_v64i8:
270 ; AVX512VLBW: # BB#0:
271 ; AVX512VLBW-NEXT: vmovdqu8 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
272 ; AVX512VLBW-NEXT: vpsubb %zmm1, %zmm2, %zmm2
273 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm3
274 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
275 ; AVX512VLBW-NEXT: vpsllw $5, %zmm1, %zmm1
276 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
277 ; AVX512VLBW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
278 ; AVX512VLBW-NEXT: vpsllw $2, %zmm3, %zmm4
279 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
280 ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
281 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
282 ; AVX512VLBW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
283 ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
284 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
285 ; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
286 ; AVX512VLBW-NEXT: vpsllw $5, %zmm2, %zmm1
287 ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm2
288 ; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1
289 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k2
290 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm1
291 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
292 ; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
293 ; AVX512VLBW-NEXT: vpsrlw $2, %zmm0, %zmm1
294 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
295 ; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
296 ; AVX512VLBW-NEXT: vpsrlw $1, %zmm0, %zmm1
297 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
298 ; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2
299 ; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1
300 ; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
301 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0
302 ; AVX512VLBW-NEXT: retq
303 %b8 = sub <64 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
304 %shl = shl <64 x i8> %a, %b
305 %lshr = lshr <64 x i8> %a, %b8
306 %or = or <64 x i8> %shl, %lshr
307 ret <64 x i8> %or
308 }
309
310 ;
311 ; Constant Rotates
312 ;
313
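; The constant-rotate tests use distinct per-element amounts loaded from the
; constant pool. i64/i32 still lower to vpsllv*/vpsrlv* plus an OR; for i16
; the AVX512F path turns the left shift into a vpmullw by powers of two and
; widens the right shift to i32, while AVX512BW keeps vpsllvw/vpsrlvw.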
314 define <8 x i64> @constant_rotate_v8i64(<8 x i64> %a) nounwind {
315 ; AVX512-LABEL: constant_rotate_v8i64:
316 ; AVX512: # BB#0:
317 ; AVX512-NEXT: vpsllvq {{.*}}(%rip), %zmm0, %zmm1
318 ; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %zmm0, %zmm0
319 ; AVX512-NEXT: vporq %zmm0, %zmm1, %zmm0
320 ; AVX512-NEXT: retq
321 %shl = shl <8 x i64> %a,
322 %lshr = lshr <8 x i64> %a,
323 %or = or <8 x i64> %shl, %lshr
324 ret <8 x i64> %or
325 }
326
327 define <16 x i32> @constant_rotate_v16i32(<16 x i32> %a) nounwind {
328 ; AVX512-LABEL: constant_rotate_v16i32:
329 ; AVX512: # BB#0:
330 ; AVX512-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm1
331 ; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
332 ; AVX512-NEXT: vpord %zmm0, %zmm1, %zmm0
333 ; AVX512-NEXT: retq
334 %shl = shl <16 x i32> %a,
335 %lshr = lshr <16 x i32> %a,
336 %or = or <16 x i32> %shl, %lshr
337 ret <16 x i32> %or
338 }
339
340 define <32 x i16> @constant_rotate_v32i16(<32 x i16> %a) nounwind {
341 ; AVX512F-LABEL: constant_rotate_v32i16:
342 ; AVX512F: # BB#0:
343 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
344 ; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm3
345 ; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm2
346 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
347 ; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm4 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1]
348 ; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
349 ; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
350 ; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
351 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
352 ; AVX512F-NEXT: vpsrlvd %zmm4, %zmm0, %zmm0
353 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
354 ; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
355 ; AVX512F-NEXT: retq
356 ;
357 ; AVX512VL-LABEL: constant_rotate_v32i16:
358 ; AVX512VL: # BB#0:
359 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
360 ; AVX512VL-NEXT: vpmullw %ymm2, %ymm1, %ymm3
361 ; AVX512VL-NEXT: vpmullw %ymm2, %ymm0, %ymm2
362 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
363 ; AVX512VL-NEXT: vmovdqa32 {{.*#+}} zmm4 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1]
364 ; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
365 ; AVX512VL-NEXT: vpmovdw %zmm1, %ymm1
366 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
367 ; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm0, %zmm0
368 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
369 ; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
370 ; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
371 ; AVX512VL-NEXT: retq
372 ;
373 ; AVX512BW-LABEL: constant_rotate_v32i16:
374 ; AVX512BW: # BB#0:
375 ; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1
376 ; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
377 ; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
378 ; AVX512BW-NEXT: retq
379 ;
380 ; AVX512VLBW-LABEL: constant_rotate_v32i16:
381 ; AVX512VLBW: # BB#0:
382 ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1
383 ; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
384 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
385 ; AVX512VLBW-NEXT: retq
386 %shl = shl <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
387 %lshr = lshr <32 x i16> %a, <i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1>
388 %or = or <32 x i16> %shl, %lshr
389 ret <32 x i16> %or
390 }
391
392 define <64 x i8> @constant_rotate_v64i8(<64 x i8> %a) nounwind {
393 ; AVX512F-LABEL: constant_rotate_v64i8:
394 ; AVX512F: # BB#0:
395 ; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2
396 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
397 ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
398 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
399 ; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
400 ; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm5
401 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
402 ; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
403 ; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm7
404 ; AVX512F-NEXT: vpblendvb %ymm7, %ymm5, %ymm2, %ymm2
405 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm5
406 ; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm8
407 ; AVX512F-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
408 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm5
409 ; AVX512F-NEXT: vpand %ymm3, %ymm5, %ymm3
410 ; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm3
411 ; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm4
412 ; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
413 ; AVX512F-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3
414 ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm4
415 ; AVX512F-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
416 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm4
417 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
418 ; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
419 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536]
420 ; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm1, %ymm1
421 ; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm4
422 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
423 ; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4
424 ; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm8
425 ; AVX512F-NEXT: vpblendvb %ymm8, %ymm4, %ymm1, %ymm1
426 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm4
427 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
428 ; AVX512F-NEXT: vpand %ymm9, %ymm4, %ymm4
429 ; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm10
430 ; AVX512F-NEXT: vpblendvb %ymm10, %ymm4, %ymm1, %ymm1
431 ; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1
432 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2
433 ; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
434 ; AVX512F-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0
435 ; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm2
436 ; AVX512F-NEXT: vpand %ymm7, %ymm2, %ymm2
437 ; AVX512F-NEXT: vpblendvb %ymm8, %ymm2, %ymm0, %ymm0
438 ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm2
439 ; AVX512F-NEXT: vpand %ymm9, %ymm2, %ymm2
440 ; AVX512F-NEXT: vpblendvb %ymm10, %ymm2, %ymm0, %ymm0
441 ; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0
442 ; AVX512F-NEXT: retq
443 ;
444 ; AVX512VL-LABEL: constant_rotate_v64i8:
445 ; AVX512VL: # BB#0:
446 ; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2
447 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
448 ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
449 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
450 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
451 ; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm5
452 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
453 ; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
454 ; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm7
455 ; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm2, %ymm2
456 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm5
457 ; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm8
458 ; AVX512VL-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
459 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm5
460 ; AVX512VL-NEXT: vpand %ymm3, %ymm5, %ymm3
461 ; AVX512VL-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm3
462 ; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm4
463 ; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
464 ; AVX512VL-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3
465 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm4
466 ; AVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
467 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm4
468 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
469 ; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
470 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536]
471 ; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm1, %ymm1
472 ; AVX512VL-NEXT: vpsrlw $2, %ymm1, %ymm4
473 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
474 ; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4
475 ; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm8
476 ; AVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm1, %ymm1
477 ; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm4
478 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
479 ; AVX512VL-NEXT: vpand %ymm9, %ymm4, %ymm4
480 ; AVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm10
481 ; AVX512VL-NEXT: vpblendvb %ymm10, %ymm4, %ymm1, %ymm1
482 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4
483 ; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
484 ; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm0
485 ; AVX512VL-NEXT: vpsrlw $2, %ymm0, %ymm4
486 ; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4
487 ; AVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm0, %ymm0
488 ; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm4
489 ; AVX512VL-NEXT: vpand %ymm9, %ymm4, %ymm4
490 ; AVX512VL-NEXT: vpblendvb %ymm10, %ymm4, %ymm0, %ymm0
491 ; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0
492 ; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1
493 ; AVX512VL-NEXT: retq
494 ;
495 ; AVX512BW-LABEL: constant_rotate_v64i8:
496 ; AVX512BW: # BB#0:
497 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
498 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
499 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
500 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
501 ; AVX512BW-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1}
502 ; AVX512BW-NEXT: vpsllw $2, %zmm2, %zmm3
503 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
504 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
505 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
506 ; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
507 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
508 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
509 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1}
510 ; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536]
511 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
512 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm3
513 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
514 ; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
515 ; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm3
516 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
517 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
518 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
519 ; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
520 ; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm3
521 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
522 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
523 ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
524 ; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
525 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
526 ; AVX512BW-NEXT: retq
527 ;
528 ; AVX512VLBW-LABEL: constant_rotate_v64i8:
529 ; AVX512VLBW: # BB#0:
530 ; AVX512VLBW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
531 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
532 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm2
533 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
534 ; AVX512VLBW-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1}
535 ; AVX512VLBW-NEXT: vpsllw $2, %zmm2, %zmm3
536 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
537 ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
538 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
539 ; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
540 ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
541 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
542 ; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1}
543 ; AVX512VLBW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536]
544 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
545 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm3
546 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
547 ; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
548 ; AVX512VLBW-NEXT: vpsrlw $2, %zmm0, %zmm3
549 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
550 ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
551 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
552 ; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
553 ; AVX512VLBW-NEXT: vpsrlw $1, %zmm0, %zmm3
554 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
555 ; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
556 ; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
557 ; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
558 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0
559 ; AVX512VLBW-NEXT: retq
560 %shl = shl <64 x i8> %a,
561 %lshr = lshr <64 x i8> %a,
562 %or = or <64 x i8> %shl, %lshr
563 ret <64 x i8> %or
564 }
565
566 ;
567 ; Uniform Constant Rotates
568 ;
569
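; Uniform (splat) constant amounts become plain immediate shifts plus an OR,
; e.g. vpsllq $14 / vpsrlq $50 for the v8i64 case.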
570 define <8 x i64> @splatconstant_rotate_v8i64(<8 x i64> %a) nounwind {
571 ; AVX512-LABEL: splatconstant_rotate_v8i64:
572 ; AVX512: # BB#0:
573 ; AVX512-NEXT: vpsllq $14, %zmm0, %zmm1
574 ; AVX512-NEXT: vpsrlq $50, %zmm0, %zmm0
575 ; AVX512-NEXT: vporq %zmm0, %zmm1, %zmm0
576 ; AVX512-NEXT: retq
577 %shl = shl <8 x i64> %a, <i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14>
578 %lshr = lshr <8 x i64> %a, <i64 50, i64 50, i64 50, i64 50, i64 50, i64 50, i64 50, i64 50>
579 %or = or <8 x i64> %shl, %lshr
580 ret <8 x i64> %or
581 }
582
583 define <16 x i32> @splatconstant_rotate_v16i32(<16 x i32> %a) nounwind {
584 ; AVX512-LABEL: splatconstant_rotate_v16i32:
585 ; AVX512: # BB#0:
586 ; AVX512-NEXT: vpslld $4, %zmm0, %zmm1
587 ; AVX512-NEXT: vpsrld $28, %zmm0, %zmm0
588 ; AVX512-NEXT: vpord %zmm0, %zmm1, %zmm0
589 ; AVX512-NEXT: retq
590 %shl = shl <16 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
591 %lshr = lshr <16 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
592 %or = or <16 x i32> %shl, %lshr
593 ret <16 x i32> %or
594 }
595
596 define <32 x i16> @splatconstant_rotate_v32i16(<32 x i16> %a) nounwind {
597 ; AVX512F-LABEL: splatconstant_rotate_v32i16:
598 ; AVX512F: # BB#0:
599 ; AVX512F-NEXT: vpsllw $7, %ymm1, %ymm2
600 ; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm3
601 ; AVX512F-NEXT: vpsrlw $9, %ymm1, %ymm1
602 ; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1
603 ; AVX512F-NEXT: vpsrlw $9, %ymm0, %ymm0
604 ; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0
605 ; AVX512F-NEXT: retq
606 ;
607 ; AVX512VL-LABEL: splatconstant_rotate_v32i16:
608 ; AVX512VL: # BB#0:
609 ; AVX512VL-NEXT: vpsllw $7, %ymm1, %ymm2
610 ; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm3
611 ; AVX512VL-NEXT: vpsrlw $9, %ymm1, %ymm1
612 ; AVX512VL-NEXT: vpsrlw $9, %ymm0, %ymm0
613 ; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0
614 ; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1
615 ; AVX512VL-NEXT: retq
616 ;
617 ; AVX512BW-LABEL: splatconstant_rotate_v32i16:
618 ; AVX512BW: # BB#0:
619 ; AVX512BW-NEXT: vpsllw $7, %zmm0, %zmm1
620 ; AVX512BW-NEXT: vpsrlw $9, %zmm0, %zmm0
621 ; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
622 ; AVX512BW-NEXT: retq
623 ;
624 ; AVX512VLBW-LABEL: splatconstant_rotate_v32i16:
625 ; AVX512VLBW: # BB#0:
626 ; AVX512VLBW-NEXT: vpsllw $7, %zmm0, %zmm1
627 ; AVX512VLBW-NEXT: vpsrlw $9, %zmm0, %zmm0
628 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
629 ; AVX512VLBW-NEXT: retq
630 %shl = shl <32 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
631 %lshr = lshr <32 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
632 %or = or <32 x i16> %shl, %lshr
633 ret <32 x i16> %or
634 }
635
636 define <64 x i8> @splatconstant_rotate_v64i8(<64 x i8> %a) nounwind {
637 ; AVX512F-LABEL: splatconstant_rotate_v64i8:
638 ; AVX512F: # BB#0:
639 ; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2
640 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
641 ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
642 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4
643 ; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm3
644 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
645 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
646 ; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1
647 ; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1
648 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
649 ; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0
650 ; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0
651 ; AVX512F-NEXT: retq
652 ;
653 ; AVX512VL-LABEL: splatconstant_rotate_v64i8:
654 ; AVX512VL: # BB#0:
655 ; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2
656 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
657 ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
658 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
659 ; AVX512VL-NEXT: vpand %ymm3, %ymm4, %ymm3
660 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm1
661 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
662 ; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1
663 ; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1
664 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
665 ; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0
666 ; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0
667 ; AVX512VL-NEXT: retq
668 ;
669 ; AVX512BW-LABEL: splatconstant_rotate_v64i8:
670 ; AVX512BW: # BB#0:
671 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1
672 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
673 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
674 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
675 ; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
676 ; AVX512BW-NEXT: retq
677 ;
678 ; AVX512VLBW-LABEL: splatconstant_rotate_v64i8:
679 ; AVX512VLBW: # BB#0:
680 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1
681 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
682 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0
683 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
684 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
685 ; AVX512VLBW-NEXT: retq
686 %shl = shl <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
687 %lshr = lshr <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
688 %or = or <64 x i8> %shl, %lshr
689 ret <64 x i8> %or
690 }
691
692 ;
693 ; Masked Uniform Constant Rotates
694 ;
695
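; These repeat the splat-constant rotates but mask each half before the OR.
; Note the v8i64 case: the shift-left half folds away entirely, leaving just
; vpsrlq $49 and a vpandq in the expected output.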
696 define <8 x i64> @splatconstant_rotate_mask_v8i64(<8 x i64> %a) nounwind {
697 ; AVX512-LABEL: splatconstant_rotate_mask_v8i64:
698 ; AVX512: # BB#0:
699 ; AVX512-NEXT: vpsrlq $49, %zmm0, %zmm0
700 ; AVX512-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
701 ; AVX512-NEXT: retq
702 %shl = shl <8 x i64> %a, <i64 15, i64 15, i64 15, i64 15, i64 15, i64 15, i64 15, i64 15>
703 %lshr = lshr <8 x i64> %a, <i64 49, i64 49, i64 49, i64 49, i64 49, i64 49, i64 49, i64 49>
704 %rmask = and <8 x i64> %lshr,
705 %lmask = and <8 x i64> %shl,
706 %or = or <8 x i64> %lmask, %rmask
707 ret <8 x i64> %or
708 }
709
710 define <16 x i32> @splatconstant_rotate_mask_v16i32(<16 x i32> %a) nounwind {
711 ; AVX512-LABEL: splatconstant_rotate_mask_v16i32:
712 ; AVX512: # BB#0:
713 ; AVX512-NEXT: vpslld $4, %zmm0, %zmm1
714 ; AVX512-NEXT: vpsrld $28, %zmm0, %zmm0
715 ; AVX512-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm0
716 ; AVX512-NEXT: vpandd {{.*}}(%rip), %zmm1, %zmm1
717 ; AVX512-NEXT: vporq %zmm0, %zmm1, %zmm0
718 ; AVX512-NEXT: retq
719 %shl = shl <16 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
720 %lshr = lshr <16 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
721 %rmask = and <16 x i32> %lshr,
722 %lmask = and <16 x i32> %shl,
723 %or = or <16 x i32> %lmask, %rmask
724 ret <16 x i32> %or
725 }
726
727 define <32 x i16> @splatconstant_rotate_mask_v32i16(<32 x i16> %a) nounwind {
728 ; AVX512F-LABEL: splatconstant_rotate_mask_v32i16:
729 ; AVX512F: # BB#0:
730 ; AVX512F-NEXT: vpsllw $5, %ymm0, %ymm2
731 ; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm3
732 ; AVX512F-NEXT: vpsrlw $11, %ymm0, %ymm0
733 ; AVX512F-NEXT: vpsrlw $11, %ymm1, %ymm1
734 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55]
735 ; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1
736 ; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0
737 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33]
738 ; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
739 ; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
740 ; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
741 ; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
742 ; AVX512F-NEXT: retq
743 ;
744 ; AVX512VL-LABEL: splatconstant_rotate_mask_v32i16:
745 ; AVX512VL: # BB#0:
746 ; AVX512VL-NEXT: vpsllw $5, %ymm0, %ymm2
747 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm3
748 ; AVX512VL-NEXT: vpsrlw $11, %ymm0, %ymm0
749 ; AVX512VL-NEXT: vpsrlw $11, %ymm1, %ymm1
750 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55]
751 ; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1
752 ; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0
753 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33]
754 ; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3
755 ; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
756 ; AVX512VL-NEXT: vpand %ymm4, %ymm2, %ymm2
757 ; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
758 ; AVX512VL-NEXT: retq
759 ;
760 ; AVX512BW-LABEL: splatconstant_rotate_mask_v32i16:
761 ; AVX512BW: # BB#0:
762 ; AVX512BW-NEXT: vpsllw $5, %zmm0, %zmm1
763 ; AVX512BW-NEXT: vpsrlw $11, %zmm0, %zmm0
764 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
765 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
766 ; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
767 ; AVX512BW-NEXT: retq
768 ;
769 ; AVX512VLBW-LABEL: splatconstant_rotate_mask_v32i16:
770 ; AVX512VLBW: # BB#0:
771 ; AVX512VLBW-NEXT: vpsllw $5, %zmm0, %zmm1
772 ; AVX512VLBW-NEXT: vpsrlw $11, %zmm0, %zmm0
773 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
774 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
775 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
776 ; AVX512VLBW-NEXT: retq
777 %shl = shl <32 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
778 %lshr = lshr <32 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
779 %rmask = and <32 x i16> %lshr, <i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55>
780 %lmask = and <32 x i16> %shl, <i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33>
781 %or = or <32 x i16> %lmask, %rmask
782 ret <32 x i16> %or
783 }
784
785 define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind {
786 ; AVX512F-LABEL: splatconstant_rotate_mask_v64i8:
787 ; AVX512F: # BB#0:
788 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
789 ; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm3
790 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
791 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
792 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55]
793 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
794 ; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1
795 ; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0
796 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33]
797 ; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
798 ; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
799 ; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
800 ; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
801 ; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
802 ; AVX512F-NEXT: retq
803 ;
804 ; AVX512VL-LABEL: splatconstant_rotate_mask_v64i8:
805 ; AVX512VL: # BB#0:
806 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
807 ; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm3
808 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
809 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm1
810 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55]
811 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
812 ; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1
813 ; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0
814 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33,33]
815 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
816 ; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3
817 ; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
818 ; AVX512VL-NEXT: vpand %ymm4, %ymm2, %ymm2
819 ; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
820 ; AVX512VL-NEXT: retq
821 ;
822 ; AVX512BW-LABEL: splatconstant_rotate_mask_v64i8:
823 ; AVX512BW: # BB#0:
824 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1
825 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
826 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
827 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
828 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
829 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
830 ; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
831 ; AVX512BW-NEXT: retq
832 ;
833 ; AVX512VLBW-LABEL: splatconstant_rotate_mask_v64i8:
834 ; AVX512VLBW: # BB#0:
835 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1
836 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
837 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0
838 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
839 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
840 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
841 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
842 ; AVX512VLBW-NEXT: retq
843 %shl = shl <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
844 %lshr = lshr <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
845 %rmask = and <64 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55>
846 %lmask = and <64 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>
847 %or = or <64 x i8> %lmask, %rmask
848 ret <64 x i8> %or
849 }