[X86] Tune bypassing of slow division for Intel CPUs

64-bit integer division on Intel CPUs is extremely slow, much slower than 32-bit division. On the other hand, 8-bit and 16-bit divisions aren't any faster than 32-bit division. The only important exception is Atom, where DIV8 is fastest. Because of that, the patch

1) Enables bypassing of 64-bit division for Atom, Silvermont and all big cores.

2) Modifies the 64-bit bypass to use 32-bit division instead of the 16-bit one. This doesn't make the shorter division any slower, but it increases the chances of taking it. Moreover, it is much more likely that a value can be proven at compile time to fit in 32 bits, in which case no run-time check is needed at all (e.g. zext i32 to i64).

Differential Revision: https://reviews.llvm.org/D28196

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@291800 91177308-0d34-0410-b5e6-96231b3b80d8

Author: Nikolai Bozhenov
5 changed files with 89 additions and 20 deletions.
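For context, the control flow that the tuned 64-bit bypass creates for a 64-bit division can be pictured with the C sketch below. It is illustrative only: the function name and structure are not LLVM code (the real transformation is done on LLVM IR), but it mirrors the check the updated tests look for (orq of the operands, the 0xFFFFFFFF00000000 mask, and divl on the fast path).

    #include <stdint.h>

    /* Rough C analogue (not LLVM code) of the 64-bit -> 32-bit divide bypass
     * for "a / b": if the high 32 bits of both operands are zero, both values
     * are non-negative and fit in 32 bits, so an unsigned 32-bit divide (divl)
     * produces the same quotient as a full 64-bit signed divide (idivq). */
    int64_t sdiv64_with_bypass(int64_t a, int64_t b) {
      const uint64_t HighBits = 0xFFFFFFFF00000000ULL; /* mask checked in the tests */
      if ((((uint64_t)a | (uint64_t)b) & HighBits) == 0)
        return (int64_t)((uint32_t)a / (uint32_t)b);   /* fast path: divl */
      return a / b;                                     /* slow path: idivq */
    }
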
 def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb",
                               "HasSlowDivide32", "true",
                               "Use 8-bit divide for positive values less than 256">;
-def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divw",
+def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divl",
                               "HasSlowDivide64", "true",
-                              "Use 16-bit divide for positive values less than 65536">;
+                              "Use 32-bit divide for positive values less than 2^32">;
 def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions",
                               "PadShortFunctions", "true",
                               "Pad short functions">;

   FeatureCMPXCHG16B,
   FeaturePOPCNT,
   FeatureAES,
+  FeatureSlowDivide64,
   FeaturePCLMUL,
   FeatureXSAVE,
   FeatureXSAVEOPT,

   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());

-  // Bypass expensive divides on Atom when compiling with O2.
+  // Bypass expensive divides and use cheaper ones.
   if (TM.getOptLevel() >= CodeGenOpt::Default) {
     if (Subtarget.hasSlowDivide32())
       addBypassSlowDiv(32, 8);
     if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
-      addBypassSlowDiv(64, 16);
+      addBypassSlowDiv(64, 32);
   }

   if (Subtarget.isTargetKnownWindowsMSVC() ||

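The switch from addBypassSlowDiv(64, 16) to addBypassSlowDiv(64, 32) also pays off at compile time, as the commit message notes: when both operands are zero-extended from 32 bits, the fast-path condition is provably true and the run-time width test can be dropped entirely. A hypothetical example (the function and its names are not part of the patch):

    #include <stdint.h>

    /* Hypothetical example: both operands are zero-extended 32-bit values
     * ("zext i32 to i64" in IR), so their high 32 bits are known to be zero at
     * compile time. With a 32-bit bypass the division can therefore be lowered
     * straight to divl with no branch; a 16-bit bypass would only allow that
     * when the values were known to fit in 16 bits, which is far less common. */
    uint64_t average_size(uint32_t total_bytes, uint32_t count) {
      return (uint64_t)total_bytes / (uint64_t)count;
    }
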
   /// 32-bit divisions and should be used when possible.
   bool HasSlowDivide32;

-  /// True if 16-bit divides are significantly faster than
+  /// True if 32-bit divides are significantly faster than
   /// 64-bit divisions and should be used when possible.
   bool HasSlowDivide64;

+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mcpu=atom -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+; RUN: llc < %s -mcpu=sandybridge -mtriple=x86_64-unknown-linux-gnu | FileCheck %s -check-prefix=SNB

 ; Additional tests for 64-bit divide bypass

...
 ; CHECK-LABEL: Test_get_quotient:
 ; CHECK: # BB#0:
 ; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
 ; CHECK-NEXT: orq %rsi, %rax
-; CHECK-NEXT: testq $-65536, %rax # imm = 0xFFFF0000
+; CHECK-NEXT: testq %rcx, %rax
 ; CHECK-NEXT: je .LBB0_1
 ; CHECK-NEXT: # BB#2:
 ; CHECK-NEXT: movq %rdi, %rax
...
 ; CHECK-NEXT: .LBB0_1:
 ; CHECK-NEXT: xorl %edx, %edx
 ; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: divw %si
-; CHECK-NEXT: movzwl %ax, %eax
+; CHECK-NEXT: divl %esi
+; CHECK-NEXT: # kill: %EAX %EAX %RAX
 ; CHECK-NEXT: retq
+;
+; SNB-LABEL: Test_get_quotient:
+; SNB: # BB#0:
+; SNB-NEXT: movq %rdi, %rax
+; SNB-NEXT: orq %rsi, %rax
+; SNB-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
+; SNB-NEXT: testq %rcx, %rax
+; SNB-NEXT: je .LBB0_1
+; SNB-NEXT: # BB#2:
+; SNB-NEXT: movq %rdi, %rax
+; SNB-NEXT: cqto
+; SNB-NEXT: idivq %rsi
+; SNB-NEXT: retq
+; SNB-NEXT: .LBB0_1:
+; SNB-NEXT: xorl %edx, %edx
+; SNB-NEXT: movl %edi, %eax
+; SNB-NEXT: divl %esi
+; SNB-NEXT: # kill: %EAX %EAX %RAX
+; SNB-NEXT: retq
   %result = sdiv i64 %a, %b
   ret i64 %result
 }
...
 ; CHECK-LABEL: Test_get_remainder:
 ; CHECK: # BB#0:
 ; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
 ; CHECK-NEXT: orq %rsi, %rax
-; CHECK-NEXT: testq $-65536, %rax # imm = 0xFFFF0000
+; CHECK-NEXT: testq %rcx, %rax
 ; CHECK-NEXT: je .LBB1_1
 ; CHECK-NEXT: # BB#2:
 ; CHECK-NEXT: movq %rdi, %rax
...
 ; CHECK-NEXT: .LBB1_1:
 ; CHECK-NEXT: xorl %edx, %edx
 ; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: divw %si
-; CHECK-NEXT: movzwl %dx, %eax
+; CHECK-NEXT: divl %esi
+; CHECK-NEXT: # kill: %EDX %EDX %RDX
+; CHECK-NEXT: movq %rdx, %rax
 ; CHECK-NEXT: retq
+;
+; SNB-LABEL: Test_get_remainder:
+; SNB: # BB#0:
+; SNB-NEXT: movq %rdi, %rax
+; SNB-NEXT: orq %rsi, %rax
+; SNB-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
+; SNB-NEXT: testq %rcx, %rax
+; SNB-NEXT: je .LBB1_1
+; SNB-NEXT: # BB#2:
+; SNB-NEXT: movq %rdi, %rax
+; SNB-NEXT: cqto
+; SNB-NEXT: idivq %rsi
+; SNB-NEXT: movq %rdx, %rax
+; SNB-NEXT: retq
+; SNB-NEXT: .LBB1_1:
+; SNB-NEXT: xorl %edx, %edx
+; SNB-NEXT: movl %edi, %eax
+; SNB-NEXT: divl %esi
+; SNB-NEXT: # kill: %EDX %EDX %RDX
+; SNB-NEXT: movq %rdx, %rax
+; SNB-NEXT: retq
   %result = srem i64 %a, %b
   ret i64 %result
 }
...
 ; CHECK-LABEL: Test_get_quotient_and_remainder:
 ; CHECK: # BB#0:
 ; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
 ; CHECK-NEXT: orq %rsi, %rax
-; CHECK-NEXT: testq $-65536, %rax # imm = 0xFFFF0000
+; CHECK-NEXT: testq %rcx, %rax
 ; CHECK-NEXT: je .LBB2_1
 ; CHECK-NEXT: # BB#2:
 ; CHECK-NEXT: movq %rdi, %rax
...
 ; CHECK-NEXT: .LBB2_1:
 ; CHECK-NEXT: xorl %edx, %edx
 ; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: divw %si
-; CHECK-NEXT: movzwl %ax, %eax
-; CHECK-NEXT: movzwl %dx, %edx
+; CHECK-NEXT: divl %esi
+; CHECK-NEXT: # kill: %EAX %EAX %RAX
+; CHECK-NEXT: # kill: %EDX %EDX %RDX
 ; CHECK-NEXT: addq %rdx, %rax
 ; CHECK-NEXT: retq
+;
+; SNB-LABEL: Test_get_quotient_and_remainder:
+; SNB: # BB#0:
+; SNB-NEXT: movq %rdi, %rax
+; SNB-NEXT: orq %rsi, %rax
+; SNB-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
+; SNB-NEXT: testq %rcx, %rax
+; SNB-NEXT: je .LBB2_1
+; SNB-NEXT: # BB#2:
+; SNB-NEXT: movq %rdi, %rax
+; SNB-NEXT: cqto
+; SNB-NEXT: idivq %rsi
+; SNB-NEXT: addq %rdx, %rax
+; SNB-NEXT: retq
+; SNB-NEXT: .LBB2_1:
+; SNB-NEXT: xorl %edx, %edx
+; SNB-NEXT: movl %edi, %eax
+; SNB-NEXT: divl %esi
+; SNB-NEXT: # kill: %EDX %EDX %RDX
+; SNB-NEXT: # kill: %EAX %EAX %RAX
+; SNB-NEXT: addq %rdx, %rax
+; SNB-NEXT: retq
   %resultdiv = sdiv i64 %a, %b
   %resultrem = srem i64 %a, %b
   %result = add i64 %resultdiv, %resultrem

 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+idivl-to-divb < %s | FileCheck -check-prefix=DIV32 %s
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+idivq-to-divw < %s | FileCheck -check-prefix=DIV64 %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+idivq-to-divl < %s | FileCheck -check-prefix=DIV64 %s

 define i32 @div32(i32 %a, i32 %b) {
 entry:
...
 define i64 @div64(i64 %a, i64 %b) {
 entry:
 ; DIV32-LABEL: div64:
-; DIV32-NOT: divw
+; DIV32-NOT: divl
 ; DIV64-LABEL: div64:
-; DIV64: orq %{{.*}}, [[REG:%[a-z]+]]
-; DIV64: testq $-65536, [[REG]]
-; DIV64: divw
+; DIV64-DAG: movabsq $-4294967296, [[REGMSK:%[a-z]+]]
+; DIV64-DAG: orq %{{.*}}, [[REG:%[a-z]+]]
+; DIV64: testq [[REGMSK]], [[REG]]
+; DIV64: divl
   %div = sdiv i64 %a, %b
   ret i64 %div
 }