llvm.org GIT mirror llvm / 6393b3a
Remove spurious mask operations from AArch64 add->compares on 16 and 8 bit values This patch checks for DAG patterns that are an add or a sub followed by a compare on 16 and 8 bit inputs. Since AArch64 does not support those types natively they are legalized into 32 bit values, which means that mask operations are inserted into the DAG to emulate overflow behaviour. In many cases those masks do not change the result of the processing and just introduce a dependent operation, often in the middle of a hot loop. This patch detects the relevant DAG patterns and then tests to see if the transforms are equivalent with and without the mask, removing the mask if possible. The exact mechanism of this patch was discussed in http://lists.cs.uiuc.edu/pipermail/llvmdev/2014-July/074444.html There is a reasonably good chance there are missed opportunities due to similar (but not identical) DAG patterns that could be funneled into this test; adding them should be simple if we see test cases. Tests included. rdar://13754426 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@216776 91177308-0d34-0410-b5e6-96231b3b80d8 Louis Gerbarg 6 years ago
2 changed file(s) with 532 addition(s) and 0 deletion(s). Raw diff Collapse all Expand all
79027902 return SDValue();
79037903 }
79047904
7905 // Checks to see if the value is the prescribed width and returns information
7906 // about its extension mode.
7907 static
7908 bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
7909 ExtType = ISD::NON_EXTLOAD;
7910 switch(V.getNode()->getOpcode()) {
7911 default:
7912 return false;
7913 case ISD::LOAD: {
7914 LoadSDNode *LoadNode = cast(V.getNode());
7915 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
7916 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
7917 ExtType = LoadNode->getExtensionType();
7918 return true;
7919 }
7920 return false;
7921 }
7922 case ISD::AssertSext: {
7923 VTSDNode *TypeNode = cast(V.getNode()->getOperand(1));
7924 if ((TypeNode->getVT() == MVT::i8 && width == 8)
7925 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
7926 ExtType = ISD::SEXTLOAD;
7927 return true;
7928 }
7929 return false;
7930 }
7931 case ISD::AssertZext: {
7932 VTSDNode *TypeNode = cast(V.getNode()->getOperand(1));
7933 if ((TypeNode->getVT() == MVT::i8 && width == 8)
7934 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
7935 ExtType = ISD::ZEXTLOAD;
7936 return true;
7937 }
7938 return false;
7939 }
7940 case ISD::Constant:
7941 case ISD::TargetConstant: {
7942 if(abs(cast(V.getNode())->getSExtValue()) < 1<<(width-1))
7943 return true;
7944 return false;
7945 }
7946 }
7947
7948 return true;
7949 }
7950
7951 // This function does a whole lot of voodoo to determine if the tests are
7952 // equivalent without and with a mask. Essentially what happens is that given a
7953 // DAG resembling:
7954 //
7955 // +-------------+ +-------------+ +-------------+ +-------------+
7956 // | Input | | AddConstant | | CompConstant| | CC |
7957 // +-------------+ +-------------+ +-------------+ +-------------+
7958 // | | | |
7959 // V V | +----------+
7960 // +-------------+ +----+ | |
7961 // | ADD | |0xff| | |
7962 // +-------------+ +----+ | |
7963 // | | | |
7964 // V V | |
7965 // +-------------+ | |
7966 // | AND | | |
7967 // +-------------+ | |
7968 // | | |
7969 // +-----+ | |
7970 // | | |
7971 // V V V
7972 // +-------------+
7973 // | CMP |
7974 // +-------------+
7975 //
7976 // The AND node may be safely removed for some combinations of inputs. In
7977 // particular we need to take into account the extension type of the Input,
7978 // the exact values of AddConstant, CompConstant, and CC, along with the nominal
7979 // width of the input (this can work for any width inputs, the above graph is
7980 // specific to 8 bits.
7981 //
7982 // The specific equations were worked out by generating output tables for each
7983 // AArch64CC value in terms of and AddConstant (w1), CompConstant(w2). The
7984 // problem was simplified by working with 4 bit inputs, which means we only
7985 // needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
7986 // extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
7987 // patterns present in both extensions (0,7). For every distinct set of
7988 // AddConstant and CompConstants bit patterns we can consider the masked and
7989 // unmasked versions to be equivalent if the result of this function is true for
7990 // all 16 distinct bit patterns of for the current extension type of Input (w0).
7991 //
7992 // sub w8, w0, w1
7993 // and w10, w8, #0x0f
7994 // cmp w8, w2
7995 // cset w9, AArch64CC
7996 // cmp w10, w2
7997 // cset w11, AArch64CC
7998 // cmp w9, w11
7999 // cset w0, eq
8000 // ret
8001 //
8002 // Since the above function shows when the outputs are equivalent it defines
8003 // when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
8004 // would be expensive to run during compiles. The equations below were written
8005 // in a test harness that confirmed they gave equivalent outputs to the above
8006 // for all inputs function, so they can be used determine if the removal is
8007 // legal instead.
8008 //
8009 // isEquivalentMaskless() is the code for testing if the AND can be removed
8010 // factored out of the DAG recognition as the DAG can take several forms.
8011
8012 static
8013 bool isEquivalentMaskless(unsigned CC, unsigned width,
8014 ISD::LoadExtType ExtType, signed AddConstant,
8015 signed CompConstant) {
8016 // By being careful about our equations and only writing the in term
8017 // symbolic values and well known constants (0, 1, -1, MaxUInt) we can
8018 // make them generally applicable to all bit widths.
8019 signed MaxUInt = (1 << width);
8020
8021 // For the purposes of these comparisons sign extending the type is
8022 // equivalent to zero extending the add and displacing it by half the integer
8023 // width. Provided we are careful and make sure our equations are valid over
8024 // the whole range we can just adjust the input and avoid writing equations
8025 // for sign extended inputs.
8026 if (ExtType == ISD::SEXTLOAD)
8027 AddConstant -= (1 << (width-1));
8028
8029 switch(CC) {
8030 case AArch64CC::LE:
8031 case AArch64CC::GT: {
8032 if ((AddConstant == 0) ||
8033 (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
8034 (AddConstant >= 0 && CompConstant < 0) ||
8035 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
8036 return true;
8037 } break;
8038 case AArch64CC::LT:
8039 case AArch64CC::GE: {
8040 if ((AddConstant == 0) ||
8041 (AddConstant >= 0 && CompConstant <= 0) ||
8042 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
8043 return true;
8044 } break;
8045 case AArch64CC::HI:
8046 case AArch64CC::LS: {
8047 if ((AddConstant >= 0 && CompConstant < 0) ||
8048 (AddConstant <= 0 && CompConstant >= -1 &&
8049 CompConstant < AddConstant + MaxUInt))
8050 return true;
8051 } break;
8052 case AArch64CC::PL:
8053 case AArch64CC::MI: {
8054 if ((AddConstant == 0) ||
8055 (AddConstant > 0 && CompConstant <= 0) ||
8056 (AddConstant < 0 && CompConstant <= AddConstant))
8057 return true;
8058 } break;
8059 case AArch64CC::LO:
8060 case AArch64CC::HS: {
8061 if ((AddConstant >= 0 && CompConstant <= 0) ||
8062 (AddConstant <= 0 && CompConstant >= 0 &&
8063 CompConstant <= AddConstant + MaxUInt))
8064 return true;
8065 } break;
8066 case AArch64CC::EQ:
8067 case AArch64CC::NE: {
8068 if ((AddConstant > 0 && CompConstant < 0) ||
8069 (AddConstant < 0 && CompConstant >= 0 &&
8070 CompConstant < AddConstant + MaxUInt) ||
8071 (AddConstant >= 0 && CompConstant >= 0 &&
8072 CompConstant >= AddConstant) ||
8073 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
8074
8075 return true;
8076 } break;
8077 case AArch64CC::VS:
8078 case AArch64CC::VC:
8079 case AArch64CC::AL:
8080 case AArch64CC::NV:
8081 return true;
8082 case AArch64CC::Invalid:
8083 break;
8084 }
8085
8086 return false;
8087 }
8088
8089 static
8090 SDValue performCONDCombine(SDNode *N,
8091 TargetLowering::DAGCombinerInfo &DCI,
8092 SelectionDAG &DAG, unsigned CCIndex,
8093 unsigned CmpIndex) {
8094 unsigned CC = cast(N->getOperand(CCIndex))->getSExtValue();
8095 SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
8096 unsigned CondOpcode = SubsNode->getOpcode();
8097
8098 if (CondOpcode != AArch64ISD::SUBS)
8099 return SDValue();
8100
8101 // There is a SUBS feeding this condition. Is it fed by a mask we can
8102 // use?
8103
8104 SDNode *AndNode = SubsNode->getOperand(0).getNode();
8105 unsigned MaskBits = 0;
8106
8107 if (AndNode->getOpcode() != ISD::AND)
8108 return SDValue();
8109
8110 if (ConstantSDNode *CN = dyn_cast(AndNode->getOperand(1))) {
8111 uint32_t CNV = CN->getZExtValue();
8112 if (CNV == 255)
8113 MaskBits = 8;
8114 else if (CNV == 65535)
8115 MaskBits = 16;
8116 }
8117
8118 if (!MaskBits)
8119 return SDValue();
8120
8121 SDValue AddValue = AndNode->getOperand(0);
8122
8123 if (AddValue.getOpcode() != ISD::ADD)
8124 return SDValue();
8125
8126 // The basic dag structure is correct, grab the inputs and validate them.
8127
8128 SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
8129 SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
8130 SDValue SubsInputValue = SubsNode->getOperand(1);
8131
8132 // The mask is present and the provenance of all the values is a smaller type,
8133 // lets see if the mask is superfluous.
8134
8135 if (!isa(AddInputValue2.getNode()) ||
8136 !isa(SubsInputValue.getNode()))
8137 return SDValue();
8138
8139 ISD::LoadExtType ExtType;
8140
8141 if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
8142 !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
8143 !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
8144 return SDValue();
8145
8146 if(!isEquivalentMaskless(CC, MaskBits, ExtType,
8147 cast(AddInputValue2.getNode())->getSExtValue(),
8148 cast(SubsInputValue.getNode())->getSExtValue()))
8149 return SDValue();
8150
8151 // The AND is not necessary, remove it.
8152
8153 SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
8154 SubsNode->getValueType(1));
8155 SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
8156
8157 SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
8158 DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
8159
8160 return SDValue(N, 0);
8161 }
8162
79058163 // Optimize compare with zero and branch.
79068164 static SDValue performBRCONDCombine(SDNode *N,
79078165 TargetLowering::DAGCombinerInfo &DCI,
79088166 SelectionDAG &DAG) {
8167 SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3);
8168 if (NV.getNode())
8169 N = NV.getNode();
79098170 SDValue Chain = N->getOperand(0);
79108171 SDValue Dest = N->getOperand(1);
79118172 SDValue CCVal = N->getOperand(2);
80628323 return performSTORECombine(N, DCI, DAG, Subtarget);
80638324 case AArch64ISD::BRCOND:
80648325 return performBRCONDCombine(N, DCI, DAG);
8326 case AArch64ISD::CSEL:
8327 return performCONDCombine(N, DCI, DAG, 2, 3);
80658328 case AArch64ISD::DUP:
80668329 return performPostLD1Combine(N, DCI, false);
80678330 case ISD::INSERT_VECTOR_ELT:
0 ; RUN: llc -O0 -fast-isel=false -mtriple=arm64-apple-darwin < %s | FileCheck %s
1
2 @board = common global [400 x i8] zeroinitializer, align 1
3 @next_string = common global i32 0, align 4
4 @string_number = common global [400 x i32] zeroinitializer, align 4
5
; Function Attrs: nounwind ssp
; Loads a byte from @board, tests (byte - 1) ult 2, and on success stores
; @next_string into the matching @string_number slot. No `and` instruction is
; expected before the return.
define void @new_position(i32 %pos) {
entry:
  %idx = sext i32 %pos to i64
  %board.slot = getelementptr inbounds [400 x i8]* @board, i64 0, i64 %idx
  %cell = load i8* %board.slot, align 1
  %cell.off = add i8 %cell, -1
  %in.range = icmp ult i8 %cell.off, 2
  br i1 %in.range, label %if.then, label %if.end

if.then:                                          ; preds = %entry
  %next = load i32* @next_string, align 4
  %string.slot = getelementptr inbounds [400 x i32]* @string_number, i64 0, i64 %idx
  store i32 %next, i32* %string.slot, align 4
  br label %if.end

if.end:                                           ; preds = %if.then, %entry
  ret void
; CHECK-LABEL: new_position
; CHECK-NOT: and
; CHECK: ret
}
28
; Zero-extended i8 input: (x + 74) ult -20. An `and` is expected in the output.
define zeroext i1 @test8_0(i8 zeroext %x) align 2 {
entry:
  %sum = add i8 %x, 74
  %cmp = icmp ult i8 %sum, -20
  br i1 %cmp, label %ret_true, label %ret_false
ret_false:
  ret i1 false
ret_true:
  ret i1 true
; CHECK-LABEL: test8_0
; CHECK: and
; CHECK: ret
}
42
; Zero-extended i8 input: (x + 246) uge 90. No `and` is expected in the output.
define zeroext i1 @test8_1(i8 zeroext %x) align 2 {
entry:
  %sum = add i8 %x, 246
  %cmp = icmp uge i8 %sum, 90
  br i1 %cmp, label %ret_true, label %ret_false
ret_false:
  ret i1 false
ret_true:
  ret i1 true
; CHECK-LABEL: test8_1
; CHECK-NOT: and
; CHECK: ret
}
56
; Zero-extended i8 input: (x + 227) ne 179. No `and` is expected in the output.
define zeroext i1 @test8_2(i8 zeroext %x) align 2 {
entry:
  %sum = add i8 %x, 227
  %cmp = icmp ne i8 %sum, 179
  br i1 %cmp, label %ret_true, label %ret_false
ret_false:
  ret i1 false
ret_true:
  ret i1 true
; CHECK-LABEL: test8_2
; CHECK-NOT: and
; CHECK: ret
}
70
; Zero-extended i8 input: (x + 201) eq 154. No `and` is expected in the output.
define zeroext i1 @test8_3(i8 zeroext %x) align 2 {
entry:
  %sum = add i8 %x, 201
  %cmp = icmp eq i8 %sum, 154
  br i1 %cmp, label %ret_true, label %ret_false
ret_false:
  ret i1 false
ret_true:
  ret i1 true
; CHECK-LABEL: test8_3
; CHECK-NOT: and
; CHECK: ret
}
84
; Zero-extended i8 input: (x + -79) ne -40. No `and` is expected in the output.
define zeroext i1 @test8_4(i8 zeroext %x) align 2 {
entry:
  %sum = add i8 %x, -79
  %cmp = icmp ne i8 %sum, -40
  br i1 %cmp, label %ret_true, label %ret_false
ret_false:
  ret i1 false
ret_true:
  ret i1 true
; CHECK-LABEL: test8_4
; CHECK-NOT: and
; CHECK: ret
}
98
; Zero-extended i8 input: (x + 133) uge -105. An `and` is expected in the
; output.
define zeroext i1 @test8_5(i8 zeroext %x) align 2 {
entry:
  %sum = add i8 %x, 133
  %cmp = icmp uge i8 %sum, -105
  br i1 %cmp, label %ret_true, label %ret_false
ret_false:
  ret i1 false
ret_true:
  ret i1 true
; CHECK-LABEL: test8_5
; CHECK: and
; CHECK: ret
}
112
; Zero-extended i8 input: (x + -58) uge 155. An `and` is expected in the output.
define zeroext i1 @test8_6(i8 zeroext %x) align 2 {
entry:
  %sum = add i8 %x, -58
  %cmp = icmp uge i8 %sum, 155
  br i1 %cmp, label %ret_true, label %ret_false
ret_false:
  ret i1 false
ret_true:
  ret i1 true
; CHECK-LABEL: test8_6
; CHECK: and
; CHECK: ret
}
126
; Zero-extended i8 input: (x + 225) ult 124. No `and` is expected in the output.
define zeroext i1 @test8_7(i8 zeroext %x) align 2 {
entry:
  %sum = add i8 %x, 225
  %cmp = icmp ult i8 %sum, 124
  br i1 %cmp, label %ret_true, label %ret_false
ret_false:
  ret i1 false
ret_true:
  ret i1 true
; CHECK-LABEL: test8_7
; CHECK-NOT: and
; CHECK: ret
}
140
141
142
; Zero-extended i8 input: (x + 190) uge 1. No `and` is expected in the output.
define zeroext i1 @test8_8(i8 zeroext %x) align 2 {
entry:
  %sum = add i8 %x, 190
  %cmp = icmp uge i8 %sum, 1
  br i1 %cmp, label %ret_true, label %ret_false
ret_false:
  ret i1 false
ret_true:
  ret i1 true
; CHECK-LABEL: test8_8
; CHECK-NOT: and
; CHECK: ret
}
156
; Zero-extended i16 input: (x + -46989) ne -41903. No `and` is expected in the
; output.
define zeroext i1 @test16_0(i16 zeroext %x) align 2 {
entry:
  %sum = add i16 %x, -46989
  %cmp = icmp ne i16 %sum, -41903
  br i1 %cmp, label %ret_true, label %ret_false
ret_false:
  ret i1 false
ret_true:
  ret i1 true
; CHECK-LABEL: test16_0
; CHECK-NOT: and
; CHECK: ret
}
170
; Zero-extended i16 input: (x + 16882) ule -24837. An `and` is expected in the
; output.
define zeroext i1 @test16_2(i16 zeroext %x) align 2 {
entry:
  %sum = add i16 %x, 16882
  %cmp = icmp ule i16 %sum, -24837
  br i1 %cmp, label %ret_true, label %ret_false
ret_false:
  ret i1 false
ret_true:
  ret i1 true
; CHECK-LABEL: test16_2
; CHECK: and
; CHECK: ret
}
184
; Zero-extended i16 input: (x + 29283) ne 16947. No `and` is expected in the
; output.
define zeroext i1 @test16_3(i16 zeroext %x) align 2 {
entry:
  %sum = add i16 %x, 29283
  %cmp = icmp ne i16 %sum, 16947
  br i1 %cmp, label %ret_true, label %ret_false
ret_false:
  ret i1 false
ret_true:
  ret i1 true
; CHECK-LABEL: test16_3
; CHECK-NOT: and
; CHECK: ret
}
198
; Zero-extended i16 input: (x + -35551) uge 15677. An `and` is expected in the
; output.
define zeroext i1 @test16_4(i16 zeroext %x) align 2 {
entry:
  %sum = add i16 %x, -35551
  %cmp = icmp uge i16 %sum, 15677
  br i1 %cmp, label %ret_true, label %ret_false
ret_false:
  ret i1 false
ret_true:
  ret i1 true
; CHECK-LABEL: test16_4
; CHECK: and
; CHECK: ret
}
212
; Zero-extended i16 input: (x + -25214) ne -1932. No `and` is expected in the
; output.
define zeroext i1 @test16_5(i16 zeroext %x) align 2 {
entry:
  %sum = add i16 %x, -25214
  %cmp = icmp ne i16 %sum, -1932
  br i1 %cmp, label %ret_true, label %ret_false
ret_false:
  ret i1 false
ret_true:
  ret i1 true
; CHECK-LABEL: test16_5
; CHECK-NOT: and
; CHECK: ret
}
226
; Zero-extended i16 input: (x + -32194) uge -41215. No `and` is expected in the
; output.
define zeroext i1 @test16_6(i16 zeroext %x) align 2 {
entry:
  %sum = add i16 %x, -32194
  %cmp = icmp uge i16 %sum, -41215
  br i1 %cmp, label %ret_true, label %ret_false
ret_false:
  ret i1 false
ret_true:
  ret i1 true
; CHECK-LABEL: test16_6
; CHECK-NOT: and
; CHECK: ret
}
240
; Zero-extended i16 input: (x + 9272) uge -42916. An `and` is expected in the
; output.
define zeroext i1 @test16_7(i16 zeroext %x) align 2 {
entry:
  %sum = add i16 %x, 9272
  %cmp = icmp uge i16 %sum, -42916
  br i1 %cmp, label %ret_true, label %ret_false
ret_false:
  ret i1 false
ret_true:
  ret i1 true
; CHECK-LABEL: test16_7
; CHECK: and
; CHECK: ret
}
254
; Zero-extended i16 input: (x + -63749) ne 6706. No `and` is expected in the
; output.
define zeroext i1 @test16_8(i16 zeroext %x) align 2 {
entry:
  %sum = add i16 %x, -63749
  %cmp = icmp ne i16 %sum, 6706
  br i1 %cmp, label %ret_true, label %ret_false
ret_false:
  ret i1 false
ret_true:
  ret i1 true
; CHECK-LABEL: test16_8
; CHECK-NOT: and
; CHECK: ret
}
268