llvm.org GIT mirror: llvm / commit 27ec2b8
Merging r293259:
------------------------------------------------------------------------
r293259 | compnerd | 2017-01-26 19:41:53 -0800 (Thu, 26 Jan 2017) | 11 lines

ARM: fix vectorized division on WoA

The Windows on ARM target uses custom lowering for ordinary (scalar) division
because the backend needs to insert division-by-zero checks, but that lowering
only handles non-vectorized division. ARM also has custom lowering for
vectorized division, which prefers NEON instructions and avoids moving each
element into registers and calling a division routine per element. Fall back
to that NEON lowering when we encounter a vectorized division.

Resolves PR31778!
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_40@293306 91177308-0d34-0410-b5e6-96231b3b80d8

Hans Wennborg, 2 years ago
2 changed files with 48 additions and 39 deletions.
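To make the scope of the change concrete, the sketch below shows the kind of IR affected (the function name @repro is illustrative and not part of the patch): on a Windows-on-ARM triple such as thumbv7-windows-itanium, a vectorized sdiv or udiv previously went into the scalar Windows division lowering, which cannot handle vectors; with this change, the operation-lowering switch in the ARM backend (first hunk below) falls back to the NEON vector-division lowering instead.

; Illustrative only: a vector division of the shape covered by PR31778.
; With -mtriple thumbv7-windows-itanium -mattr=+neon this now lowers via
; the NEON vector-division path rather than the scalar Windows path.
define <4 x i16> @repro(<4 x i16> %a, <4 x i16> %b) {
  %q = sdiv <4 x i16> %a, %b
  ret <4 x i16> %q
}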
@@ -7570,11 +7570,11 @@
   case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
   case ISD::MUL:          return LowerMUL(Op, DAG);
   case ISD::SDIV:
-    if (Subtarget->isTargetWindows())
+    if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
       return LowerDIV_Windows(Op, DAG, /* Signed */ true);
     return LowerSDIV(Op, DAG);
   case ISD::UDIV:
-    if (Subtarget->isTargetWindows())
+    if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
       return LowerDIV_Windows(Op, DAG, /* Signed */ false);
     return LowerUDIV(Op, DAG);
   case ISD::ADDC:
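The second hunk updates the accompanying NEON division test: the RUN lines are rewritten so the test also runs for the thumbv7-windows-itanium triple, exercising the Windows-on-ARM fallback above, and the FileCheck patterns are moved out of the function bodies into CHECK-LABEL blocks following each function.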
-; RUN: llc -mtriple=arm-eabi -mattr=+neon -pre-RA-sched=source -disable-post-ra %s -o - \
-; RUN:   | FileCheck %s
+; RUN: llc -mtriple arm-eabi -mattr=+neon -disable-post-ra -pre-RA-sched source %s -o - | FileCheck %s
+; RUN: llc -mtriple thumbv7-windows-itanium -mattr=+neon -disable-post-ra -pre-RA-sched source %s -o - | FileCheck %s
 
 define <8 x i8> @sdivi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
-;CHECK: vrecpe.f32
-;CHECK: vmovn.i32
-;CHECK: vrecpe.f32
-;CHECK: vmovn.i32
-;CHECK: vmovn.i16
-  %tmp1 = load <8 x i8>, <8 x i8>* %A
-  %tmp2 = load <8 x i8>, <8 x i8>* %B
-  %tmp3 = sdiv <8 x i8> %tmp1, %tmp2
-  ret <8 x i8> %tmp3
+  %tmp1 = load <8 x i8>, <8 x i8>* %A
+  %tmp2 = load <8 x i8>, <8 x i8>* %B
+  %tmp3 = sdiv <8 x i8> %tmp1, %tmp2
+  ret <8 x i8> %tmp3
 }
 
+; CHECK-LABEL: sdivi8:
+; CHECK: vrecpe.f32
+; CHECK: vmovn.i32
+; CHECK: vrecpe.f32
+; CHECK: vmovn.i32
+; CHECK: vmovn.i16
+
 define <8 x i8> @udivi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
-;CHECK: vrecpe.f32
-;CHECK: vrecps.f32
-;CHECK: vmovn.i32
-;CHECK: vrecpe.f32
-;CHECK: vrecps.f32
-;CHECK: vmovn.i32
-;CHECK: vqmovun.s16
-  %tmp1 = load <8 x i8>, <8 x i8>* %A
-  %tmp2 = load <8 x i8>, <8 x i8>* %B
-  %tmp3 = udiv <8 x i8> %tmp1, %tmp2
-  ret <8 x i8> %tmp3
+  %tmp1 = load <8 x i8>, <8 x i8>* %A
+  %tmp2 = load <8 x i8>, <8 x i8>* %B
+  %tmp3 = udiv <8 x i8> %tmp1, %tmp2
+  ret <8 x i8> %tmp3
 }
 
+; CHECK-LABEL: udivi8:
+; CHECK: vrecpe.f32
+; CHECK: vrecps.f32
+; CHECK: vmovn.i32
+; CHECK: vrecpe.f32
+; CHECK: vrecps.f32
+; CHECK: vmovn.i32
+; CHECK: vqmovun.s16
+
 define <4 x i16> @sdivi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
-;CHECK: vrecpe.f32
-;CHECK: vrecps.f32
-;CHECK: vmovn.i32
-  %tmp1 = load <4 x i16>, <4 x i16>* %A
-  %tmp2 = load <4 x i16>, <4 x i16>* %B
-  %tmp3 = sdiv <4 x i16> %tmp1, %tmp2
-  ret <4 x i16> %tmp3
+  %tmp1 = load <4 x i16>, <4 x i16>* %A
+  %tmp2 = load <4 x i16>, <4 x i16>* %B
+  %tmp3 = sdiv <4 x i16> %tmp1, %tmp2
+  ret <4 x i16> %tmp3
 }
 
+; CHECK-LABEL: sdivi16:
+; CHECK: vrecpe.f32
+; CHECK: vrecps.f32
+; CHECK: vmovn.i32
+
 define <4 x i16> @udivi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
-;CHECK: vrecpe.f32
-;CHECK: vrecps.f32
-;CHECK: vrecps.f32
-;CHECK: vmovn.i32
-  %tmp1 = load <4 x i16>, <4 x i16>* %A
-  %tmp2 = load <4 x i16>, <4 x i16>* %B
-  %tmp3 = udiv <4 x i16> %tmp1, %tmp2
-  ret <4 x i16> %tmp3
+  %tmp1 = load <4 x i16>, <4 x i16>* %A
+  %tmp2 = load <4 x i16>, <4 x i16>* %B
+  %tmp3 = udiv <4 x i16> %tmp1, %tmp2
+  ret <4 x i16> %tmp3
 }
+
+; CHECK-LABEL: udivi16:
+; CHECK: vrecpe.f32
+; CHECK: vrecps.f32
+; CHECK: vrecps.f32
+; CHECK: vmovn.i32
+