llvm.org GIT mirror llvm / 50f3e3a
[X86][AVX512] Add support for masked shuffle comments

This patch adds support for including the avx512 mask register information in the mask/maskz versions of shuffle instruction comments. This initial version just adds support for MOVDDUP/MOVSHDUP/MOVSLDUP to reduce the mass of test regenerations; other shuffle instructions can be added in due course.

Differential Revision: http://reviews.llvm.org/D21953

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@274459 91177308-0d34-0410-b5e6-96231b3b80d8

Simon Pilgrim
8 changed files with 113 additions and 62 deletions.
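To illustrate the change (the lines below are taken verbatim from the regenerated tests in this commit), a merge-masked VMOVDDUP that previously printed its shuffle comment with no mask information, e.g.

vmovddup {{.*#+}} zmm0 = zmm1[0,0,2,2,4,4,6,6]

now includes the write-mask on the destination,

vmovddup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6]

and the zero-masking (maskz) form additionally carries the {z} marker:

vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]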
4444 CASE_AVX_INS_COMMON(Inst, , r##src) \
4545 CASE_AVX_INS_COMMON(Inst, Y, r##src) \
4646 CASE_SSE_INS_COMMON(Inst, r##src)
47
48 #define CASE_MASK_MOVDUP(Inst, src) \
49 CASE_MASK_INS_COMMON(Inst, Z, r##src) \
50 CASE_MASK_INS_COMMON(Inst, Z256, r##src) \
51 CASE_MASK_INS_COMMON(Inst, Z128, r##src)
52
53 #define CASE_MASKZ_MOVDUP(Inst, src) \
54 CASE_MASKZ_INS_COMMON(Inst, Z, r##src) \
55 CASE_MASKZ_INS_COMMON(Inst, Z256, r##src) \
56 CASE_MASKZ_INS_COMMON(Inst, Z128, r##src)
4757
4858 #define CASE_PMOVZX(Inst, src) \
4959 CASE_AVX512_INS_COMMON(Inst, Z, r##src) \
128138 }
129139 }
130140
141 /// Wraps the destination register name with AVX512 mask/maskz filtering.
142 static std::string getMaskName(const MCInst *MI, const char *DestName,
143 const char *(*getRegName)(unsigned)) {
144 std::string OpMaskName(DestName);
145
146 bool MaskWithZero = false;
147 const char *MaskRegName = nullptr;
148
149 switch (MI->getOpcode()) {
150 default:
151 return OpMaskName;
152 CASE_MASKZ_MOVDUP(MOVDDUP, m)
153 CASE_MASKZ_MOVDUP(MOVDDUP, r)
154 CASE_MASKZ_MOVDUP(MOVSHDUP, m)
155 CASE_MASKZ_MOVDUP(MOVSHDUP, r)
156 CASE_MASKZ_MOVDUP(MOVSLDUP, m)
157 CASE_MASKZ_MOVDUP(MOVSLDUP, r)
158 MaskWithZero = true;
159 MaskRegName = getRegName(MI->getOperand(1).getReg());
160 break;
161 CASE_MASK_MOVDUP(MOVDDUP, m)
162 CASE_MASK_MOVDUP(MOVDDUP, r)
163 CASE_MASK_MOVDUP(MOVSHDUP, m)
164 CASE_MASK_MOVDUP(MOVSHDUP, r)
165 CASE_MASK_MOVDUP(MOVSLDUP, m)
166 CASE_MASK_MOVDUP(MOVSLDUP, r)
167 MaskRegName = getRegName(MI->getOperand(2).getReg());
168 break;
169 }
170
171 // MASK: zmmX {%kY}
172 OpMaskName += " {%";
173 OpMaskName += MaskRegName;
174 OpMaskName += "}";
175
176 // MASKZ: zmmX {%kY} {z}
177 if (MaskWithZero)
178 OpMaskName += " {z}";
179
180 return OpMaskName;
181 }
182
131183 //===----------------------------------------------------------------------===//
132184 // Top Level Entrypoint
133185 //===----------------------------------------------------------------------===//
752804 if (ShuffleMask.empty())
753805 return false;
754806
755 // TODO: Add support for specifying an AVX512 style mask register in the comment.
756807 if (!DestName) DestName = Src1Name;
757 OS << (DestName ? DestName : "mem") << " = ";
808 OS << (DestName ? getMaskName(MI, DestName, getRegName) : "mem") << " = ";
758809
759810 // If the two sources are the same, canonicalize the input elements to be
760811 // from the first src so that we get larger element spans.
2222 ; X32: # BB#0:
2323 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al
2424 ; X32-NEXT: kmovw %eax, %k1
25 ; X32-NEXT: vmovddup {{.*#+}} zmm0 = zmm1[0,0,2,2,4,4,6,6]
25 ; X32-NEXT: vmovddup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6]
2626 ; X32-NEXT: retl
2727 ;
2828 ; X64-LABEL: test_mm512_mask_movddup_pd:
2929 ; X64: # BB#0:
3030 ; X64-NEXT: kmovw %edi, %k1
31 ; X64-NEXT: vmovddup {{.*#+}} zmm0 = zmm1[0,0,2,2,4,4,6,6]
31 ; X64-NEXT: vmovddup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6]
3232 ; X64-NEXT: retq
3333 %arg1 = bitcast i8 %a1 to <8 x i1>
3434 %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
4141 ; X32: # BB#0:
4242 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al
4343 ; X32-NEXT: kmovw %eax, %k1
44 ; X32-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
44 ; X32-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
4545 ; X32-NEXT: retl
4646 ;
4747 ; X64-LABEL: test_mm512_maskz_movddup_pd:
4848 ; X64: # BB#0:
4949 ; X64-NEXT: kmovw %edi, %k1
50 ; X64-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
50 ; X64-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
5151 ; X64-NEXT: retq
5252 %arg0 = bitcast i8 %a0 to <8 x i1>
5353 %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
7474 ; X32: # BB#0:
7575 ; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
7676 ; X32-NEXT: kmovw %eax, %k1
77 ; X32-NEXT: vmovshdup {{.*#+}} zmm0 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
77 ; X32-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
7878 ; X32-NEXT: retl
7979 ;
8080 ; X64-LABEL: test_mm512_mask_movehdup_ps:
8181 ; X64: # BB#0:
8282 ; X64-NEXT: kmovw %edi, %k1
83 ; X64-NEXT: vmovshdup {{.*#+}} zmm0 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
83 ; X64-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
8484 ; X64-NEXT: retq
8585 %arg1 = bitcast i16 %a1 to <16 x i1>
8686 %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
9393 ; X32: # BB#0:
9494 ; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
9595 ; X32-NEXT: kmovw %eax, %k1
96 ; X32-NEXT: vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
96 ; X32-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
9797 ; X32-NEXT: retl
9898 ;
9999 ; X64-LABEL: test_mm512_maskz_movehdup_ps:
100100 ; X64: # BB#0:
101101 ; X64-NEXT: kmovw %edi, %k1
102 ; X64-NEXT: vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
102 ; X64-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
103103 ; X64-NEXT: retq
104104 %arg0 = bitcast i16 %a0 to <16 x i1>
105105 %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
126126 ; X32: # BB#0:
127127 ; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
128128 ; X32-NEXT: kmovw %eax, %k1
129 ; X32-NEXT: vmovsldup {{.*#+}} zmm0 = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
129 ; X32-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
130130 ; X32-NEXT: retl
131131 ;
132132 ; X64-LABEL: test_mm512_mask_moveldup_ps:
133133 ; X64: # BB#0:
134134 ; X64-NEXT: kmovw %edi, %k1
135 ; X64-NEXT: vmovsldup {{.*#+}} zmm0 = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
135 ; X64-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
136136 ; X64-NEXT: retq
137137 %arg1 = bitcast i16 %a1 to <16 x i1>
138138 %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
145145 ; X32: # BB#0:
146146 ; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
147147 ; X32-NEXT: kmovw %eax, %k1
148 ; X32-NEXT: vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
148 ; X32-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
149149 ; X32-NEXT: retl
150150 ;
151151 ; X64-LABEL: test_mm512_maskz_moveldup_ps:
152152 ; X64: # BB#0:
153153 ; X64-NEXT: kmovw %edi, %k1
154 ; X64-NEXT: vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
154 ; X64-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
155155 ; X64-NEXT: retq
156156 %arg0 = bitcast i16 %a0 to <16 x i1>
157157 %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
77 ; CHECK: ## BB#0:
88 ; CHECK-NEXT: vmovsldup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
99 ; CHECK-NEXT: kmovw %edi, %k1
10 ; CHECK-NEXT: vmovsldup {{.*#+}} zmm1 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
11 ; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
10 ; CHECK-NEXT: vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
11 ; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
1212 ; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1
1313 ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
1414 ; CHECK-NEXT: retq
2727 ; CHECK: ## BB#0:
2828 ; CHECK-NEXT: vmovshdup {{.*#+}} zmm2 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
2929 ; CHECK-NEXT: kmovw %edi, %k1
30 ; CHECK-NEXT: vmovshdup {{.*#+}} zmm1 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
31 ; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
30 ; CHECK-NEXT: vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
31 ; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
3232 ; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1
3333 ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
3434 ; CHECK-NEXT: retq
4747 ; CHECK: ## BB#0:
4848 ; CHECK-NEXT: vmovddup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6]
4949 ; CHECK-NEXT: kmovw %edi, %k1
50 ; CHECK-NEXT: vmovddup {{.*#+}} zmm1 = zmm0[0,0,2,2,4,4,6,6]
51 ; CHECK-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
50 ; CHECK-NEXT: vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
51 ; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
5252 ; CHECK-NEXT: vaddpd %zmm2, %zmm1, %zmm1
5353 ; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
5454 ; CHECK-NEXT: retq
2828 ; X32-NEXT: movb %al, {{[0-9]+}}(%esp)
2929 ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
3030 ; X32-NEXT: kmovw %eax, %k1
31 ; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm1[0,0]
31 ; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
3232 ; X32-NEXT: popl %eax
3333 ; X32-NEXT: retl
3434 ;
3838 ; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
3939 ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
4040 ; X64-NEXT: kmovw %eax, %k1
41 ; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm1[0,0]
41 ; X64-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
4242 ; X64-NEXT: retq
4343 %trn1 = trunc i8 %a1 to i2
4444 %arg1 = bitcast i2 %trn1 to <2 x i1>
5858 ; X32-NEXT: movb %al, {{[0-9]+}}(%esp)
5959 ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6060 ; X32-NEXT: kmovw %eax, %k1
61 ; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
61 ; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
6262 ; X32-NEXT: popl %eax
6363 ; X32-NEXT: retl
6464 ;
6868 ; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
6969 ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
7070 ; X64-NEXT: kmovw %eax, %k1
71 ; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
71 ; X64-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
7272 ; X64-NEXT: retq
7373 %trn1 = trunc i8 %a0 to i2
7474 %arg0 = bitcast i2 %trn1 to <2 x i1>
102102 ; X32-NEXT: movb %al, (%esp)
103103 ; X32-NEXT: movzbl (%esp), %eax
104104 ; X32-NEXT: kmovw %eax, %k1
105 ; X32-NEXT: vmovddup {{.*#+}} ymm0 = ymm1[0,0,2,2]
105 ; X32-NEXT: vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2]
106106 ; X32-NEXT: popl %eax
107107 ; X32-NEXT: retl
108108 ;
112112 ; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
113113 ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
114114 ; X64-NEXT: kmovw %eax, %k1
115 ; X64-NEXT: vmovddup {{.*#+}} ymm0 = ymm1[0,0,2,2]
115 ; X64-NEXT: vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2]
116116 ; X64-NEXT: retq
117117 %trn1 = trunc i8 %a1 to i4
118118 %arg1 = bitcast i4 %trn1 to <4 x i1>
132132 ; X32-NEXT: movb %al, (%esp)
133133 ; X32-NEXT: movzbl (%esp), %eax
134134 ; X32-NEXT: kmovw %eax, %k1
135 ; X32-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
135 ; X32-NEXT: vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
136136 ; X32-NEXT: popl %eax
137137 ; X32-NEXT: retl
138138 ;
142142 ; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
143143 ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
144144 ; X64-NEXT: kmovw %eax, %k1
145 ; X64-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
145 ; X64-NEXT: vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
146146 ; X64-NEXT: retq
147147 %trn1 = trunc i8 %a0 to i4
148148 %arg0 = bitcast i4 %trn1 to <4 x i1>
176176 ; X32-NEXT: movb %al, (%esp)
177177 ; X32-NEXT: movzbl (%esp), %eax
178178 ; X32-NEXT: kmovw %eax, %k1
179 ; X32-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
179 ; X32-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3]
180180 ; X32-NEXT: popl %eax
181181 ; X32-NEXT: retl
182182 ;
186186 ; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
187187 ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
188188 ; X64-NEXT: kmovw %eax, %k1
189 ; X64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
189 ; X64-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3]
190190 ; X64-NEXT: retq
191191 %trn1 = trunc i8 %a1 to i4
192192 %arg1 = bitcast i4 %trn1 to <4 x i1>
206206 ; X32-NEXT: movb %al, (%esp)
207207 ; X32-NEXT: movzbl (%esp), %eax
208208 ; X32-NEXT: kmovw %eax, %k1
209 ; X32-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
209 ; X32-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
210210 ; X32-NEXT: popl %eax
211211 ; X32-NEXT: retl
212212 ;
216216 ; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
217217 ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
218218 ; X64-NEXT: kmovw %eax, %k1
219 ; X64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
219 ; X64-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
220220 ; X64-NEXT: retq
221221 %trn0 = trunc i8 %a0 to i4
222222 %arg0 = bitcast i4 %trn0 to <4 x i1>
244244 ; X32: # BB#0:
245245 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al
246246 ; X32-NEXT: kmovw %eax, %k1
247 ; X32-NEXT: vmovshdup {{.*#+}} ymm0 = ymm1[1,1,3,3,5,5,7,7]
247 ; X32-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} = ymm1[1,1,3,3,5,5,7,7]
248248 ; X32-NEXT: retl
249249 ;
250250 ; X64-LABEL: test_mm256_mask_movehdup_ps:
251251 ; X64: # BB#0:
252252 ; X64-NEXT: kmovw %edi, %k1
253 ; X64-NEXT: vmovshdup {{.*#+}} ymm0 = ymm1[1,1,3,3,5,5,7,7]
253 ; X64-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} = ymm1[1,1,3,3,5,5,7,7]
254254 ; X64-NEXT: retq
255255 %arg1 = bitcast i8 %a1 to <8 x i1>
256256 %res0 = shufflevector <8 x float> %a2, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
263263 ; X32: # BB#0:
264264 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al
265265 ; X32-NEXT: kmovw %eax, %k1
266 ; X32-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
266 ; X32-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
267267 ; X32-NEXT: retl
268268 ;
269269 ; X64-LABEL: test_mm256_maskz_movehdup_ps:
270270 ; X64: # BB#0:
271271 ; X64-NEXT: kmovw %edi, %k1
272 ; X64-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
272 ; X64-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
273273 ; X64-NEXT: retq
274274 %arg0 = bitcast i8 %a0 to <8 x i1>
275275 %res0 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
302302 ; X32-NEXT: movb %al, (%esp)
303303 ; X32-NEXT: movzbl (%esp), %eax
304304 ; X32-NEXT: kmovw %eax, %k1
305 ; X32-NEXT: vmovsldup {{.*#+}} xmm0 = xmm1[0,0,2,2]
305 ; X32-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2]
306306 ; X32-NEXT: popl %eax
307307 ; X32-NEXT: retl
308308 ;
312312 ; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
313313 ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
314314 ; X64-NEXT: kmovw %eax, %k1
315 ; X64-NEXT: vmovsldup {{.*#+}} xmm0 = xmm1[0,0,2,2]
315 ; X64-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2]
316316 ; X64-NEXT: retq
317317 %trn1 = trunc i8 %a1 to i4
318318 %arg1 = bitcast i4 %trn1 to <4 x i1>
332332 ; X32-NEXT: movb %al, (%esp)
333333 ; X32-NEXT: movzbl (%esp), %eax
334334 ; X32-NEXT: kmovw %eax, %k1
335 ; X32-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
335 ; X32-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
336336 ; X32-NEXT: popl %eax
337337 ; X32-NEXT: retl
338338 ;
342342 ; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
343343 ; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
344344 ; X64-NEXT: kmovw %eax, %k1
345 ; X64-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
345 ; X64-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
346346 ; X64-NEXT: retq
347347 %trn0 = trunc i8 %a0 to i4
348348 %arg0 = bitcast i4 %trn0 to <4 x i1>
370370 ; X32: # BB#0:
371371 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al
372372 ; X32-NEXT: kmovw %eax, %k1
373 ; X32-NEXT: vmovsldup {{.*#+}} ymm0 = ymm1[0,0,2,2,4,4,6,6]
373 ; X32-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2,4,4,6,6]
374374 ; X32-NEXT: retl
375375 ;
376376 ; X64-LABEL: test_mm256_mask_moveldup_ps:
377377 ; X64: # BB#0:
378378 ; X64-NEXT: kmovw %edi, %k1
379 ; X64-NEXT: vmovsldup {{.*#+}} ymm0 = ymm1[0,0,2,2,4,4,6,6]
379 ; X64-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2,4,4,6,6]
380380 ; X64-NEXT: retq
381381 %arg1 = bitcast i8 %a1 to <8 x i1>
382382 %res0 = shufflevector <8 x float> %a2, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
389389 ; X32: # BB#0:
390390 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al
391391 ; X32-NEXT: kmovw %eax, %k1
392 ; X32-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
392 ; X32-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
393393 ; X32-NEXT: retl
394394 ;
395395 ; X64-LABEL: test_mm256_maskz_moveldup_ps:
396396 ; X64: # BB#0:
397397 ; X64-NEXT: kmovw %edi, %k1
398 ; X64-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
398 ; X64-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
399399 ; X64-NEXT: retq
400400 %arg0 = bitcast i8 %a0 to <8 x i1>
401401 %res0 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
99 ; CHECK-NEXT: ## xmm2 = xmm0[0,0,2,2]
1010 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
1111 ; CHECK-NEXT: vmovsldup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x12,0xc8]
12 ; CHECK-NEXT: ## xmm1 = xmm0[0,0,2,2]
12 ; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0,0,2,2]
1313 ; CHECK-NEXT: vmovsldup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x12,0xc0]
14 ; CHECK-NEXT: ## xmm0 = xmm0[0,0,2,2]
14 ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0,0,2,2]
1515 ; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x74,0x08,0x58,0xca]
1616 ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x58,0xc1]
1717 ; CHECK-NEXT: retq ## encoding: [0xc3]
3232 ; CHECK-NEXT: ## ymm2 = ymm0[0,0,2,2,4,4,6,6]
3333 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
3434 ; CHECK-NEXT: vmovsldup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x12,0xc8]
35 ; CHECK-NEXT: ## ymm1 = ymm0[0,0,2,2,4,4,6,6]
35 ; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6]
3636 ; CHECK-NEXT: vmovsldup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x12,0xc0]
37 ; CHECK-NEXT: ## ymm0 = ymm0[0,0,2,2,4,4,6,6]
37 ; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
3838 ; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x74,0x28,0x58,0xca]
3939 ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x58,0xc1]
4040 ; CHECK-NEXT: retq ## encoding: [0xc3]
5555 ; CHECK-NEXT: ## xmm2 = xmm0[1,1,3,3]
5656 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
5757 ; CHECK-NEXT: vmovshdup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x16,0xc8]
58 ; CHECK-NEXT: ## xmm1 = xmm0[1,1,3,3]
58 ; CHECK-NEXT: ## xmm1 {%k1} = xmm0[1,1,3,3]
5959 ; CHECK-NEXT: vmovshdup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x16,0xc0]
60 ; CHECK-NEXT: ## xmm0 = xmm0[1,1,3,3]
60 ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[1,1,3,3]
6161 ; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0x74,0x08,0x58,0xca]
6262 ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x58,0xc1]
6363 ; CHECK-NEXT: retq ## encoding: [0xc3]
7878 ; CHECK-NEXT: ## ymm2 = ymm0[1,1,3,3,5,5,7,7]
7979 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
8080 ; CHECK-NEXT: vmovshdup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x16,0xc8]
81 ; CHECK-NEXT: ## ymm1 = ymm0[1,1,3,3,5,5,7,7]
81 ; CHECK-NEXT: ## ymm1 {%k1} = ymm0[1,1,3,3,5,5,7,7]
8282 ; CHECK-NEXT: vmovshdup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x16,0xc0]
83 ; CHECK-NEXT: ## ymm0 = ymm0[1,1,3,3,5,5,7,7]
83 ; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
8484 ; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x74,0x28,0x58,0xca]
8585 ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x58,0xc1]
8686 ; CHECK-NEXT: retq ## encoding: [0xc3]
100100 ; CHECK-NEXT: ## xmm2 = xmm0[0,0]
101101 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
102102 ; CHECK-NEXT: vmovddup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x12,0xc8]
103 ; CHECK-NEXT: ## xmm1 = xmm0[0,0]
103 ; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0,0]
104104 ; CHECK-NEXT: vmovddup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x12,0xc0]
105 ; CHECK-NEXT: ## xmm0 = xmm0[0,0]
105 ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0,0]
106106 ; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## encoding: [0x62,0xf1,0xf5,0x08,0x58,0xca]
107107 ; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x58,0xc1]
108108 ; CHECK-NEXT: retq ## encoding: [0xc3]
123123 ; CHECK-NEXT: ## ymm2 = ymm0[0,0,2,2]
124124 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
125125 ; CHECK-NEXT: vmovddup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x12,0xc8]
126 ; CHECK-NEXT: ## ymm1 = ymm0[0,0,2,2]
126 ; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,0,2,2]
127127 ; CHECK-NEXT: vmovddup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0xa9,0x12,0xc0]
128 ; CHECK-NEXT: ## ymm0 = ymm0[0,0,2,2]
128 ; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,0,2,2]
129129 ; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0xf5,0x28,0x58,0xca]
130130 ; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x58,0xc1]
131131 ; CHECK-NEXT: retq ## encoding: [0xc3]
5252 ; CHECK-LABEL: combine_vpermt2var_8f64_movddup_mask:
5353 ; CHECK: # BB#0:
5454 ; CHECK-NEXT: kmovw %edi, %k1
55 ; CHECK-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
55 ; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
5656 ; CHECK-NEXT: retq
5757 %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 0, i64 0, i64 2, i64 2, i64 4, i64 4, i64 6, i64 6>, <8 x double> %x0, <8 x double> %x1, i8 %m)
5858 ret <8 x double> %res0
168168 ; CHECK-LABEL: combine_vpermt2var_16f32_vmovshdup_mask:
169169 ; CHECK: # BB#0:
170170 ; CHECK-NEXT: kmovw %edi, %k1
171 ; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
171 ; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
172172 ; CHECK-NEXT: retq
173173 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>, <16 x float> %x0, <16 x float> %x1, i16 %m)
174174 ret <16 x float> %res0
195195 ; CHECK-LABEL: combine_vpermt2var_16f32_vmovsldup_mask:
196196 ; CHECK: # BB#0:
197197 ; CHECK-NEXT: kmovw %edi, %k1
198 ; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
198 ; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
199199 ; CHECK-NEXT: retq
200200 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 %m)
201201 ret <16 x float> %res0
204204 ; CHECK-LABEL: combine_vpermt2var_16f32_vmovsldup_mask_load:
205205 ; CHECK: # BB#0:
206206 ; CHECK-NEXT: kmovw %esi, %k1
207 ; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
207 ; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
208208 ; CHECK-NEXT: retq
209209 %x0 = load <16 x float>, <16 x float> *%p0
210210 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 %m)
3131 SCRUB_TRAILING_WHITESPACE_RE = re.compile(r'[ \t]+$', flags=re.M)
3232 SCRUB_X86_SHUFFLES_RE = (
3333 re.compile(
34 r'^(\s*\w+) [^#\n]+#+ ((?:[xyz]mm\d+|mem) = .*)$',
34 r'^(\s*\w+) [^#\n]+#+ ((?:[xyz]mm\d+|mem)( \{%k\d+\}( \{z\})?)? = .*)$',
3535 flags=re.M))
3636 SCRUB_X86_SP_RE = re.compile(r'\d+\(%(esp|rsp)\)')
3737 SCRUB_X86_RIP_RE = re.compile(r'[.\w]+\(%rip\)')
4646 SCRUB_TRAILING_WHITESPACE_RE = re.compile(r'[ \t]+$', flags=re.M)
4747 SCRUB_X86_SHUFFLES_RE = (
4848 re.compile(
49 r'^(\s*\w+) [^#\n]+#+ ((?:[xyz]mm\d+|mem) = .*)$',
49 r'^(\s*\w+) [^#\n]+#+ ((?:[xyz]mm\d+|mem)( \{%k\d+\}( \{z\})?)? = .*)$',
5050 flags=re.M))
5151 SCRUB_X86_SP_RE = re.compile(r'\d+\(%(esp|rsp)\)')
5252 SCRUB_X86_RIP_RE = re.compile(r'[.\w]+\(%rip\)')
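The same scrub-regex widening appears twice, presumably once in utils/update_llc_test_checks.py and once in utils/update_test_checks.py (the file names are an assumption, since the hunks are not labelled here). The second capture group, which is the part the scripts keep when they scrub a shuffle instruction down to its comment, now accepts an optional ' {%kN}' and, after it, an optional ' {z}' following the [xyz]mm or mem destination, so regenerated CHECK lines such as

vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]

are still recognized and preserved.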