llvm.org GIT mirror llvm / ebf3d94
[X86] Teach EVEX->VEX pass to turn VRNDSCALE into VROUND when bits 7:4 of the immediate are 0 and the regular EVEX->VEX checks pass. Bits 7:4 control the scale part of the operation. If the scale is 0, the behavior is equivalent to VROUND. Fixes PR36246. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@324985 91177308-0d34-0410-b5e6-96231b3b80d8 Craig Topper 2 years ago
12 changed file(s) with 114 addition(s) and 82 deletion(s). Raw diff Collapse all Expand all
163163 }
164164
165165 // Do any custom cleanup needed to finalize the conversion.
166 static void performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) {
166 static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) {
167167 (void)NewOpc;
168168 unsigned Opc = MI.getOpcode();
169169 switch (Opc) {
196196 Imm.setImm(0x20 | ((ImmVal & 2) << 3) | (ImmVal & 1));
197197 break;
198198 }
199 }
199 case X86::VRNDSCALEPDZ128rri:
200 case X86::VRNDSCALEPDZ128rmi:
201 case X86::VRNDSCALEPSZ128rri:
202 case X86::VRNDSCALEPSZ128rmi:
203 case X86::VRNDSCALEPDZ256rri:
204 case X86::VRNDSCALEPDZ256rmi:
205 case X86::VRNDSCALEPSZ256rri:
206 case X86::VRNDSCALEPSZ256rmi:
207 case X86::VRNDSCALESDr:
208 case X86::VRNDSCALESDm:
209 case X86::VRNDSCALESSr:
210 case X86::VRNDSCALESSm:
211 case X86::VRNDSCALESDr_Int:
212 case X86::VRNDSCALESDm_Int:
213 case X86::VRNDSCALESSr_Int:
214 case X86::VRNDSCALESSm_Int:
215 const MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands()-1);
216 int64_t ImmVal = Imm.getImm();
217 // Ensure that only bits 3:0 of the immediate are used.
218 if ((ImmVal & 0xf) != ImmVal)
219 return false;
220 break;
221 }
222
223 return true;
200224 }
201225
202226
259283 if (usesExtendedRegister(MI))
260284 return false;
261285
262 performCustomAdjustments(MI, NewOpc);
286 if (!performCustomAdjustments(MI, NewOpc))
287 return false;
263288
264289 MI.setDesc(TII->get(NewOpc));
265290 MI.setAsmPrinterFlag(AC_EVEX_2_VEX);
598598 ;
599599 ; AVX512VL-LABEL: test_x86_avx_round_pd_256:
600600 ; AVX512VL: # %bb.0:
601 ; AVX512VL-NEXT: vrndscalepd $7, %ymm0, %ymm0 # encoding: [0x62,0xf3,0xfd,0x28,0x09,0xc0,0x07]
601 ; AVX512VL-NEXT: vroundpd $7, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x09,0xc0,0x07]
602602 ; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
603603 %res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 7) ; <<4 x double>> [#uses=1]
604604 ret <4 x double> %res
614614 ;
615615 ; AVX512VL-LABEL: test_x86_avx_round_ps_256:
616616 ; AVX512VL: # %bb.0:
617 ; AVX512VL-NEXT: vrndscaleps $7, %ymm0, %ymm0 # encoding: [0x62,0xf3,0x7d,0x28,0x08,0xc0,0x07]
617 ; AVX512VL-NEXT: vroundps $7, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x08,0xc0,0x07]
618618 ; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
619619 %res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 7) ; <<8 x float>> [#uses=1]
620620 ret <8 x float> %res
41174117 ;
41184118 ; SKX-LABEL: test_roundpd:
41194119 ; SKX: # %bb.0:
4120 ; SKX-NEXT: vrndscalepd $7, %ymm0, %ymm0 # sched: [8:0.67]
4121 ; SKX-NEXT: vrndscalepd $7, (%rdi), %ymm1 # sched: [15:0.67]
4120 ; SKX-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [8:0.67]
4121 ; SKX-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [15:0.67]
41224122 ; SKX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
41234123 ; SKX-NEXT: retq # sched: [7:1.00]
41244124 ;
41814181 ;
41824182 ; SKX-LABEL: test_roundps:
41834183 ; SKX: # %bb.0:
4184 ; SKX-NEXT: vrndscaleps $7, %ymm0, %ymm0 # sched: [8:0.67]
4185 ; SKX-NEXT: vrndscaleps $7, (%rdi), %ymm1 # sched: [15:0.67]
4184 ; SKX-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [8:0.67]
4185 ; SKX-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [15:0.67]
41864186 ; SKX-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
41874187 ; SKX-NEXT: retq # sched: [7:1.00]
41884188 ;
2525 define <2 x double> @test_rndscale_sd(<2 x double> %a, <2 x double> %b) {
2626 ; CHECK-LABEL: test_rndscale_sd:
2727 ; CHECK: ## %bb.0:
28 ; CHECK-NEXT: vrndscalesd $11, %xmm1, %xmm0, %xmm0
28 ; CHECK-NEXT: vroundsd $11, %xmm1, %xmm0, %xmm0
2929 ; CHECK-NEXT: retq
3030 %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> undef, i8 -1, i32 11, i32 4)
3131 ret <2 x double>%res
6969 define <4 x float> @test_rndscale_ss(<4 x float> %a, <4 x float> %b) {
7070 ; CHECK-LABEL: test_rndscale_ss:
7171 ; CHECK: ## %bb.0:
72 ; CHECK-NEXT: vrndscaless $11, %xmm1, %xmm0, %xmm0
72 ; CHECK-NEXT: vroundss $11, %xmm1, %xmm0, %xmm0
7373 ; CHECK-NEXT: retq
7474 %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %a, <4 x float> %b, <4 x float> undef, i8 -1, i32 11, i32 4)
7575 ret <4 x float>%res
7878 define <4 x float> @test_rndscale_ss_load(<4 x float> %a, <4 x float>* %bptr) {
7979 ; CHECK-LABEL: test_rndscale_ss_load:
8080 ; CHECK: ## %bb.0:
81 ; CHECK-NEXT: vrndscaless $11, (%rdi), %xmm0, %xmm0
81 ; CHECK-NEXT: vroundss $11, (%rdi), %xmm0, %xmm0
8282 ; CHECK-NEXT: retq
8383 %b = load <4 x float>, <4 x float>* %bptr
8484 %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %a, <4 x float> %b, <4 x float> undef, i8 -1, i32 11, i32 4)
5454 define float @test_trunc(float %a) {
5555 ; AVX512-LABEL: test_trunc:
5656 ; AVX512: ## %bb.0:
57 ; AVX512-NEXT: vrndscaless $11, %xmm0, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x0a,0xc0,0x0b]
57 ; AVX512-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x0a,0xc0,0x0b]
5858 ; AVX512-NEXT: retq ## encoding: [0xc3]
5959 ;
6060 ; AVX-LABEL: test_trunc:
8282 define float @test_rint(float %a) {
8383 ; AVX512-LABEL: test_rint:
8484 ; AVX512: ## %bb.0:
85 ; AVX512-NEXT: vrndscaless $4, %xmm0, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x0a,0xc0,0x04]
85 ; AVX512-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x0a,0xc0,0x04]
8686 ; AVX512-NEXT: retq ## encoding: [0xc3]
8787 ;
8888 ; AVX-LABEL: test_rint:
27002700 ; CHECK: ## %bb.0:
27012701 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
27022702 ; CHECK-NEXT: vrndscaleps $88, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x08,0xc8,0x58]
2703 ; CHECK-NEXT: vrndscaleps $4, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x08,0xc0,0x04]
2703 ; CHECK-NEXT: vroundps $4, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x08,0xc0,0x04]
27042704 ; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0]
27052705 ; CHECK-NEXT: retq ## encoding: [0xc3]
27062706 %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float> %x0, i32 88, <4 x float> %x2, i8 %x3)
1515 ;
1616 ; CHECK-AVX512-LABEL: test1:
1717 ; CHECK-AVX512: ## %bb.0:
18 ; CHECK-AVX512-NEXT: vrndscaless $9, %xmm0, %xmm0, %xmm0
18 ; CHECK-AVX512-NEXT: vroundss $9, %xmm0, %xmm0, %xmm0
1919 ; CHECK-AVX512-NEXT: retq
2020 %call = tail call float @floorf(float %x) nounwind readnone
2121 ret float %call
3636 ;
3737 ; CHECK-AVX512-LABEL: test2:
3838 ; CHECK-AVX512: ## %bb.0:
39 ; CHECK-AVX512-NEXT: vrndscalesd $9, %xmm0, %xmm0, %xmm0
39 ; CHECK-AVX512-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm0
4040 ; CHECK-AVX512-NEXT: retq
4141 %call = tail call double @floor(double %x) nounwind readnone
4242 ret double %call
5757 ;
5858 ; CHECK-AVX512-LABEL: test3:
5959 ; CHECK-AVX512: ## %bb.0:
60 ; CHECK-AVX512-NEXT: vrndscaless $12, %xmm0, %xmm0, %xmm0
60 ; CHECK-AVX512-NEXT: vroundss $12, %xmm0, %xmm0, %xmm0
6161 ; CHECK-AVX512-NEXT: retq
6262 %call = tail call float @nearbyintf(float %x) nounwind readnone
6363 ret float %call
7878 ;
7979 ; CHECK-AVX512-LABEL: test4:
8080 ; CHECK-AVX512: ## %bb.0:
81 ; CHECK-AVX512-NEXT: vrndscalesd $12, %xmm0, %xmm0, %xmm0
81 ; CHECK-AVX512-NEXT: vroundsd $12, %xmm0, %xmm0, %xmm0
8282 ; CHECK-AVX512-NEXT: retq
8383 %call = tail call double @nearbyint(double %x) nounwind readnone
8484 ret double %call
9999 ;
100100 ; CHECK-AVX512-LABEL: test5:
101101 ; CHECK-AVX512: ## %bb.0:
102 ; CHECK-AVX512-NEXT: vrndscaless $10, %xmm0, %xmm0, %xmm0
102 ; CHECK-AVX512-NEXT: vroundss $10, %xmm0, %xmm0, %xmm0
103103 ; CHECK-AVX512-NEXT: retq
104104 %call = tail call float @ceilf(float %x) nounwind readnone
105105 ret float %call
120120 ;
121121 ; CHECK-AVX512-LABEL: test6:
122122 ; CHECK-AVX512: ## %bb.0:
123 ; CHECK-AVX512-NEXT: vrndscalesd $10, %xmm0, %xmm0, %xmm0
123 ; CHECK-AVX512-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm0
124124 ; CHECK-AVX512-NEXT: retq
125125 %call = tail call double @ceil(double %x) nounwind readnone
126126 ret double %call
141141 ;
142142 ; CHECK-AVX512-LABEL: test7:
143143 ; CHECK-AVX512: ## %bb.0:
144 ; CHECK-AVX512-NEXT: vrndscaless $4, %xmm0, %xmm0, %xmm0
144 ; CHECK-AVX512-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0
145145 ; CHECK-AVX512-NEXT: retq
146146 %call = tail call float @rintf(float %x) nounwind readnone
147147 ret float %call
162162 ;
163163 ; CHECK-AVX512-LABEL: test8:
164164 ; CHECK-AVX512: ## %bb.0:
165 ; CHECK-AVX512-NEXT: vrndscalesd $4, %xmm0, %xmm0, %xmm0
165 ; CHECK-AVX512-NEXT: vroundsd $4, %xmm0, %xmm0, %xmm0
166166 ; CHECK-AVX512-NEXT: retq
167167 %call = tail call double @rint(double %x) nounwind readnone
168168 ret double %call
183183 ;
184184 ; CHECK-AVX512-LABEL: test9:
185185 ; CHECK-AVX512: ## %bb.0:
186 ; CHECK-AVX512-NEXT: vrndscaless $11, %xmm0, %xmm0, %xmm0
186 ; CHECK-AVX512-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
187187 ; CHECK-AVX512-NEXT: retq
188188 %call = tail call float @truncf(float %x) nounwind readnone
189189 ret float %call
204204 ;
205205 ; CHECK-AVX512-LABEL: test10:
206206 ; CHECK-AVX512: ## %bb.0:
207 ; CHECK-AVX512-NEXT: vrndscalesd $11, %xmm0, %xmm0, %xmm0
207 ; CHECK-AVX512-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm0
208208 ; CHECK-AVX512-NEXT: retq
209209 %call = tail call double @trunc(double %x) nounwind readnone
210210 ret double %call
457457 ;
458458 ; SKX-LABEL: test_x86_sse41_round_pd:
459459 ; SKX: ## %bb.0:
460 ; SKX-NEXT: vrndscalepd $7, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0xfd,0x08,0x09,0xc0,0x07]
460 ; SKX-NEXT: vroundpd $7, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x09,0xc0,0x07]
461461 ; SKX-NEXT: retl ## encoding: [0xc3]
462462 %res = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 7) ; <<2 x double>> [#uses=1]
463463 ret <2 x double> %res
478478 ;
479479 ; SKX-LABEL: test_x86_sse41_round_ps:
480480 ; SKX: ## %bb.0:
481 ; SKX-NEXT: vrndscaleps $7, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x08,0xc0,0x07]
481 ; SKX-NEXT: vroundps $7, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x08,0xc0,0x07]
482482 ; SKX-NEXT: retl ## encoding: [0xc3]
483483 %res = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 7) ; <<4 x float>> [#uses=1]
484484 ret <4 x float> %res
499499 ;
500500 ; SKX-LABEL: test_x86_sse41_round_sd:
501501 ; SKX: ## %bb.0:
502 ; SKX-NEXT: vrndscalesd $7, %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0xfd,0x08,0x0b,0xc1,0x07]
502 ; SKX-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x0b,0xc1,0x07]
503503 ; SKX-NEXT: retl ## encoding: [0xc3]
504504 %res = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1]
505505 ret <2 x double> %res
523523 ; SKX-LABEL: test_x86_sse41_round_sd_load:
524524 ; SKX: ## %bb.0:
525525 ; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
526 ; SKX-NEXT: vrndscalesd $7, (%eax), %xmm0, %xmm0 ## encoding: [0x62,0xf3,0xfd,0x08,0x0b,0x00,0x07]
526 ; SKX-NEXT: vroundsd $7, (%eax), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x0b,0x00,0x07]
527527 ; SKX-NEXT: retl ## encoding: [0xc3]
528528 %a1b = load <2 x double>, <2 x double>* %a1
529529 %res = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1b, i32 7) ; <<2 x double>> [#uses=1]
544544 ;
545545 ; SKX-LABEL: test_x86_sse41_round_ss:
546546 ; SKX: ## %bb.0:
547 ; SKX-NEXT: vrndscaless $7, %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x0a,0xc1,0x07]
547 ; SKX-NEXT: vroundss $7, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x0a,0xc1,0x07]
548548 ; SKX-NEXT: retl ## encoding: [0xc3]
549549 %res = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
550550 ret <4 x float> %res
30233023 ;
30243024 ; SKX-LABEL: test_roundpd:
30253025 ; SKX: # %bb.0:
3026 ; SKX-NEXT: vrndscalepd $7, %xmm0, %xmm0 # sched: [8:0.67]
3027 ; SKX-NEXT: vrndscalepd $7, (%rdi), %xmm1 # sched: [14:0.67]
3026 ; SKX-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [8:0.67]
3027 ; SKX-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [14:0.67]
30283028 ; SKX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
30293029 ; SKX-NEXT: retq # sched: [7:1.00]
30303030 ;
30953095 ;
30963096 ; SKX-LABEL: test_roundps:
30973097 ; SKX: # %bb.0:
3098 ; SKX-NEXT: vrndscaleps $7, %xmm0, %xmm0 # sched: [8:0.67]
3099 ; SKX-NEXT: vrndscaleps $7, (%rdi), %xmm1 # sched: [14:0.67]
3098 ; SKX-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [8:0.67]
3099 ; SKX-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [14:0.67]
31003100 ; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
31013101 ; SKX-NEXT: retq # sched: [7:1.00]
31023102 ;
31683168 ;
31693169 ; SKX-LABEL: test_roundsd:
31703170 ; SKX: # %bb.0:
3171 ; SKX-NEXT: vrndscalesd $7, %xmm1, %xmm0, %xmm1 # sched: [8:0.67]
3172 ; SKX-NEXT: vrndscalesd $7, (%rdi), %xmm0, %xmm0 # sched: [14:0.67]
3171 ; SKX-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [8:0.67]
3172 ; SKX-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [14:0.67]
31733173 ; SKX-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.33]
31743174 ; SKX-NEXT: retq # sched: [7:1.00]
31753175 ;
32413241 ;
32423242 ; SKX-LABEL: test_roundss:
32433243 ; SKX: # %bb.0:
3244 ; SKX-NEXT: vrndscaless $7, %xmm1, %xmm0, %xmm1 # sched: [8:0.67]
3245 ; SKX-NEXT: vrndscaless $7, (%rdi), %xmm0, %xmm0 # sched: [14:0.67]
3244 ; SKX-NEXT: vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [8:0.67]
3245 ; SKX-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [14:0.67]
32463246 ; SKX-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [4:0.33]
32473247 ; SKX-NEXT: retq # sched: [7:1.00]
32483248 ;
1515 ;
1616 ; AVX512-LABEL: floor_v2f64:
1717 ; AVX512: ## %bb.0:
18 ; AVX512-NEXT: vrndscalepd $9, %xmm0, %xmm0
18 ; AVX512-NEXT: vroundpd $9, %xmm0, %xmm0
1919 ; AVX512-NEXT: retq
2020 %t = call <2 x double> @llvm.floor.v2f64(<2 x double> %p)
2121 ret <2 x double> %t
3535 ;
3636 ; AVX512-LABEL: floor_v4f32:
3737 ; AVX512: ## %bb.0:
38 ; AVX512-NEXT: vrndscaleps $9, %xmm0, %xmm0
38 ; AVX512-NEXT: vroundps $9, %xmm0, %xmm0
3939 ; AVX512-NEXT: retq
4040 %t = call <4 x float> @llvm.floor.v4f32(<4 x float> %p)
4141 ret <4 x float> %t
5656 ;
5757 ; AVX512-LABEL: floor_v4f64:
5858 ; AVX512: ## %bb.0:
59 ; AVX512-NEXT: vrndscalepd $9, %ymm0, %ymm0
59 ; AVX512-NEXT: vroundpd $9, %ymm0, %ymm0
6060 ; AVX512-NEXT: retq
6161 %t = call <4 x double> @llvm.floor.v4f64(<4 x double> %p)
6262 ret <4 x double> %t
7777 ;
7878 ; AVX512-LABEL: floor_v8f32:
7979 ; AVX512: ## %bb.0:
80 ; AVX512-NEXT: vrndscaleps $9, %ymm0, %ymm0
80 ; AVX512-NEXT: vroundps $9, %ymm0, %ymm0
8181 ; AVX512-NEXT: retq
8282 %t = call <8 x float> @llvm.floor.v8f32(<8 x float> %p)
8383 ret <8 x float> %t
145145 ;
146146 ; AVX512-LABEL: ceil_v2f64:
147147 ; AVX512: ## %bb.0:
148 ; AVX512-NEXT: vrndscalepd $10, %xmm0, %xmm0
148 ; AVX512-NEXT: vroundpd $10, %xmm0, %xmm0
149149 ; AVX512-NEXT: retq
150150 %t = call <2 x double> @llvm.ceil.v2f64(<2 x double> %p)
151151 ret <2 x double> %t
165165 ;
166166 ; AVX512-LABEL: ceil_v4f32:
167167 ; AVX512: ## %bb.0:
168 ; AVX512-NEXT: vrndscaleps $10, %xmm0, %xmm0
168 ; AVX512-NEXT: vroundps $10, %xmm0, %xmm0
169169 ; AVX512-NEXT: retq
170170 %t = call <4 x float> @llvm.ceil.v4f32(<4 x float> %p)
171171 ret <4 x float> %t
186186 ;
187187 ; AVX512-LABEL: ceil_v4f64:
188188 ; AVX512: ## %bb.0:
189 ; AVX512-NEXT: vrndscalepd $10, %ymm0, %ymm0
189 ; AVX512-NEXT: vroundpd $10, %ymm0, %ymm0
190190 ; AVX512-NEXT: retq
191191 %t = call <4 x double> @llvm.ceil.v4f64(<4 x double> %p)
192192 ret <4 x double> %t
207207 ;
208208 ; AVX512-LABEL: ceil_v8f32:
209209 ; AVX512: ## %bb.0:
210 ; AVX512-NEXT: vrndscaleps $10, %ymm0, %ymm0
210 ; AVX512-NEXT: vroundps $10, %ymm0, %ymm0
211211 ; AVX512-NEXT: retq
212212 %t = call <8 x float> @llvm.ceil.v8f32(<8 x float> %p)
213213 ret <8 x float> %t
275275 ;
276276 ; AVX512-LABEL: trunc_v2f64:
277277 ; AVX512: ## %bb.0:
278 ; AVX512-NEXT: vrndscalepd $11, %xmm0, %xmm0
278 ; AVX512-NEXT: vroundpd $11, %xmm0, %xmm0
279279 ; AVX512-NEXT: retq
280280 %t = call <2 x double> @llvm.trunc.v2f64(<2 x double> %p)
281281 ret <2 x double> %t
295295 ;
296296 ; AVX512-LABEL: trunc_v4f32:
297297 ; AVX512: ## %bb.0:
298 ; AVX512-NEXT: vrndscaleps $11, %xmm0, %xmm0
298 ; AVX512-NEXT: vroundps $11, %xmm0, %xmm0
299299 ; AVX512-NEXT: retq
300300 %t = call <4 x float> @llvm.trunc.v4f32(<4 x float> %p)
301301 ret <4 x float> %t
316316 ;
317317 ; AVX512-LABEL: trunc_v4f64:
318318 ; AVX512: ## %bb.0:
319 ; AVX512-NEXT: vrndscalepd $11, %ymm0, %ymm0
319 ; AVX512-NEXT: vroundpd $11, %ymm0, %ymm0
320320 ; AVX512-NEXT: retq
321321 %t = call <4 x double> @llvm.trunc.v4f64(<4 x double> %p)
322322 ret <4 x double> %t
337337 ;
338338 ; AVX512-LABEL: trunc_v8f32:
339339 ; AVX512: ## %bb.0:
340 ; AVX512-NEXT: vrndscaleps $11, %ymm0, %ymm0
340 ; AVX512-NEXT: vroundps $11, %ymm0, %ymm0
341341 ; AVX512-NEXT: retq
342342 %t = call <8 x float> @llvm.trunc.v8f32(<8 x float> %p)
343343 ret <8 x float> %t
405405 ;
406406 ; AVX512-LABEL: rint_v2f64:
407407 ; AVX512: ## %bb.0:
408 ; AVX512-NEXT: vrndscalepd $4, %xmm0, %xmm0
408 ; AVX512-NEXT: vroundpd $4, %xmm0, %xmm0
409409 ; AVX512-NEXT: retq
410410 %t = call <2 x double> @llvm.rint.v2f64(<2 x double> %p)
411411 ret <2 x double> %t
425425 ;
426426 ; AVX512-LABEL: rint_v4f32:
427427 ; AVX512: ## %bb.0:
428 ; AVX512-NEXT: vrndscaleps $4, %xmm0, %xmm0
428 ; AVX512-NEXT: vroundps $4, %xmm0, %xmm0
429429 ; AVX512-NEXT: retq
430430 %t = call <4 x float> @llvm.rint.v4f32(<4 x float> %p)
431431 ret <4 x float> %t
446446 ;
447447 ; AVX512-LABEL: rint_v4f64:
448448 ; AVX512: ## %bb.0:
449 ; AVX512-NEXT: vrndscalepd $4, %ymm0, %ymm0
449 ; AVX512-NEXT: vroundpd $4, %ymm0, %ymm0
450450 ; AVX512-NEXT: retq
451451 %t = call <4 x double> @llvm.rint.v4f64(<4 x double> %p)
452452 ret <4 x double> %t
467467 ;
468468 ; AVX512-LABEL: rint_v8f32:
469469 ; AVX512: ## %bb.0:
470 ; AVX512-NEXT: vrndscaleps $4, %ymm0, %ymm0
470 ; AVX512-NEXT: vroundps $4, %ymm0, %ymm0
471471 ; AVX512-NEXT: retq
472472 %t = call <8 x float> @llvm.rint.v8f32(<8 x float> %p)
473473 ret <8 x float> %t
535535 ;
536536 ; AVX512-LABEL: nearbyint_v2f64:
537537 ; AVX512: ## %bb.0:
538 ; AVX512-NEXT: vrndscalepd $12, %xmm0, %xmm0
538 ; AVX512-NEXT: vroundpd $12, %xmm0, %xmm0
539539 ; AVX512-NEXT: retq
540540 %t = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %p)
541541 ret <2 x double> %t
555555 ;
556556 ; AVX512-LABEL: nearbyint_v4f32:
557557 ; AVX512: ## %bb.0:
558 ; AVX512-NEXT: vrndscaleps $12, %xmm0, %xmm0
558 ; AVX512-NEXT: vroundps $12, %xmm0, %xmm0
559559 ; AVX512-NEXT: retq
560560 %t = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %p)
561561 ret <4 x float> %t
576576 ;
577577 ; AVX512-LABEL: nearbyint_v4f64:
578578 ; AVX512: ## %bb.0:
579 ; AVX512-NEXT: vrndscalepd $12, %ymm0, %ymm0
579 ; AVX512-NEXT: vroundpd $12, %ymm0, %ymm0
580580 ; AVX512-NEXT: retq
581581 %t = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %p)
582582 ret <4 x double> %t
597597 ;
598598 ; AVX512-LABEL: nearbyint_v8f32:
599599 ; AVX512: ## %bb.0:
600 ; AVX512-NEXT: vrndscaleps $12, %ymm0, %ymm0
600 ; AVX512-NEXT: vroundps $12, %ymm0, %ymm0
601601 ; AVX512-NEXT: retq
602602 %t = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> %p)
603603 ret <8 x float> %t
175175 ; X64-NEXT: roundss $4, (%rdi), %xmm0
176176 ; X64-NEXT: retq
177177 ;
178 ; X32_AVX1-LABEL: test3:
179 ; X32_AVX1: ## %bb.0:
180 ; X32_AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
181 ; X32_AVX1-NEXT: vroundss $4, (%eax), %xmm0, %xmm0
182 ; X32_AVX1-NEXT: retl
183 ;
184 ; X64_AVX1-LABEL: test3:
185 ; X64_AVX1: ## %bb.0:
186 ; X64_AVX1-NEXT: vroundss $4, (%rdi), %xmm0, %xmm0
187 ; X64_AVX1-NEXT: retq
188 ;
189 ; X32_AVX512-LABEL: test3:
190 ; X32_AVX512: ## %bb.0:
191 ; X32_AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
192 ; X32_AVX512-NEXT: vrndscaless $4, (%eax), %xmm0, %xmm0
193 ; X32_AVX512-NEXT: retl
194 ;
195 ; X64_AVX512-LABEL: test3:
196 ; X64_AVX512: ## %bb.0:
197 ; X64_AVX512-NEXT: vrndscaless $4, (%rdi), %xmm0, %xmm0
198 ; X64_AVX512-NEXT: retq
178 ; X32_AVX-LABEL: test3:
179 ; X32_AVX: ## %bb.0:
180 ; X32_AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
181 ; X32_AVX-NEXT: vroundss $4, (%eax), %xmm0, %xmm0
182 ; X32_AVX-NEXT: retl
183 ;
184 ; X64_AVX-LABEL: test3:
185 ; X64_AVX: ## %bb.0:
186 ; X64_AVX-NEXT: vroundss $4, (%rdi), %xmm0, %xmm0
187 ; X64_AVX-NEXT: retq
199188 %a = load float , float *%b
200189 %B = insertelement <4 x float> undef, float %a, i32 0
201190 %X = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %A, <4 x float> %B, i32 4)
253242 ; X32_AVX512-NEXT: vmovaps %xmm0, (%esp) ## 16-byte Spill
254243 ; X32_AVX512-NEXT: calll _f
255244 ; X32_AVX512-NEXT: vmovaps (%esp), %xmm1 ## 16-byte Reload
256 ; X32_AVX512-NEXT: vrndscaless $4, %xmm1, %xmm0, %xmm0
245 ; X32_AVX512-NEXT: vroundss $4, %xmm1, %xmm0, %xmm0
257246 ; X32_AVX512-NEXT: addl $28, %esp
258247 ; X32_AVX512-NEXT: retl
259248 ;
264253 ; X64_AVX512-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill
265254 ; X64_AVX512-NEXT: callq _f
266255 ; X64_AVX512-NEXT: vmovaps (%rsp), %xmm1 ## 16-byte Reload
267 ; X64_AVX512-NEXT: vrndscaless $4, %xmm1, %xmm0, %xmm0
256 ; X64_AVX512-NEXT: vroundss $4, %xmm1, %xmm0, %xmm0
268257 ; X64_AVX512-NEXT: addq $24, %rsp
269258 ; X64_AVX512-NEXT: retq
270259 %a = load float , float *%b
6666 "VPMULLQ",
6767 "VPSRAQ",
6868 "VDBPSADBW",
69 "VRNDSCALE",
7069 "VSCALEFPS"
7170 };
7271 // Instruction's name starts with one of the entries in the exception list
162161 {"VSHUFI32X4Z256rri", "VPERM2I128rr", false},
163162 {"VSHUFI64X2Z256rmi", "VPERM2I128rm", false},
164163 {"VSHUFI64X2Z256rri", "VPERM2I128rr", false},
164
165 // These can be replaced if we verify the scale part of the immediate is
166 // zero.
167 {"VRNDSCALEPDZ128rri", "VROUNDPDr", true},
168 {"VRNDSCALEPDZ128rmi", "VROUNDPDm", true},
169 {"VRNDSCALEPSZ128rri", "VROUNDPSr", true},
170 {"VRNDSCALEPSZ128rmi", "VROUNDPSm", true},
171 {"VRNDSCALEPDZ256rri", "VROUNDYPDr", false},
172 {"VRNDSCALEPDZ256rmi", "VROUNDYPDm", false},
173 {"VRNDSCALEPSZ256rri", "VROUNDYPSr", false},
174 {"VRNDSCALEPSZ256rmi", "VROUNDYPSm", false},
175 {"VRNDSCALESDr", "VROUNDSDr", true},
176 {"VRNDSCALESDm", "VROUNDSDm", true},
177 {"VRNDSCALESSr", "VROUNDSSr", true},
178 {"VRNDSCALESSm", "VROUNDSSm", true},
179 {"VRNDSCALESDr_Int", "VROUNDSDr_Int", true},
180 {"VRNDSCALESDm_Int", "VROUNDSDm_Int", true},
181 {"VRNDSCALESSr_Int", "VROUNDSSr_Int", true},
182 {"VRNDSCALESSm_Int", "VROUNDSSm_Int", true},
165183 };
166184
167185 // Print the manually added entries