llvm.org Git mirror: llvm @ commit 1e84bc6
[AggressiveInstCombine] convert a chain of 'and-shift' bits into masked compare

This is a follow-up to D45986. As suggested there, we should match the
"all-bits-set" pattern in addition to "any-bits-set". This was a little more
complicated than I initially expected because the "and 1" instruction can be
anywhere in the chain. Hopefully, the code comments make that logic
understandable, but if you see a way to simplify or improve it, that would be
most appreciated.

This transforms patterns that emerge from bitfield tests as seen in PR37098:
https://bugs.llvm.org/show_bug.cgi?id=37098

I think it would also help reduce the large tests from D46336 and D46595, but
we need something to reassociate those cases to the forms we're expecting here
first.

Differential Revision: https://reviews.llvm.org/D46649

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@331937 91177308-0d34-0410-b5e6-96231b3b80d8

Sanjay Patel, 1 year, 4 months ago
3 changed files with 243 additions and 62 deletions.
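Before reading the diff, it may help to see the scalar semantics of the two
folds. This is a minimal standalone C++ sketch, not LLVM code; the function
names are invented for illustration. D45986 added the 'or' (any-bits-set)
form; this patch adds the 'and' (all-bits-set) form.

    #include <cstdint>

    // any-bits-set: each shift in the 'or' chain contributes one mask bit,
    // so bits 0, 3, 5, and 8 give the mask 0x129.
    uint32_t anyBitsSetBefore(uint32_t x) {
      return (x | (x >> 3) | (x >> 5) | (x >> 8)) & 1;
    }
    uint32_t anyBitsSetAfter(uint32_t x) { return (x & 0x129u) != 0; }

    // all-bits-set: the "and 1" can sit anywhere in the 'and' chain; bits 1
    // and 4 give the mask 0x12, and the compare is against the whole mask.
    uint32_t allBitsSetBefore(uint32_t x) { return ((x >> 1) & 1) & (x >> 4); }
    uint32_t allBitsSetAfter(uint32_t x) { return (x & 0x12u) == 0x12u; }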
---- lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp ----

 };
 } // namespace
 
-/// This is a recursive helper for 'and X, 1' that walks through a chain of 'or'
-/// instructions looking for shift ops of a common source value (first member of
-/// the pair). The second member of the pair is a mask constant for all of the
-/// bits that are being compared. So this:
-/// or (or (or X, (X >> 3)), (X >> 5)), (X >> 8)
-/// returns {X, 0x129} and those are the operands of an 'and' that is compared
-/// to zero.
-static bool matchMaskedCmpOp(Value *V, std::pair<Value *, APInt> &Result) {
-  // Recurse through a chain of 'or' operands.
+/// This is used by foldAnyOrAllBitsSet() to capture a source value (Root) and
+/// the bit indexes (Mask) needed by a masked compare. If we're matching a chain
+/// of 'and' ops, then we also need to capture the fact that we saw an
+/// "and X, 1", so that's an extra return value for that case.
+struct MaskOps {
+  Value *Root;
+  APInt Mask;
+  bool MatchAndChain;
+  bool FoundAnd1;
+
+  MaskOps(unsigned BitWidth, bool MatchAnds) :
+      Root(nullptr), Mask(APInt::getNullValue(BitWidth)),
+      MatchAndChain(MatchAnds), FoundAnd1(false) {}
+};
+
+/// This is a recursive helper for foldAnyOrAllBitsSet() that walks through a
+/// chain of 'and' or 'or' instructions looking for shift ops of a common source
+/// value. Examples:
+///   or (or (or X, (X >> 3)), (X >> 5)), (X >> 8)
+/// returns { X, 0x129 }
+///   and (and (X >> 1), 1), (X >> 4)
+/// returns { X, 0x12 }
+static bool matchAndOrChain(Value *V, MaskOps &MOps) {
   Value *Op0, *Op1;
-  if (match(V, m_Or(m_Value(Op0), m_Value(Op1))))
-    return matchMaskedCmpOp(Op0, Result) && matchMaskedCmpOp(Op1, Result);
+  if (MOps.MatchAndChain) {
+    // Recurse through a chain of 'and' operands. This requires an extra check
+    // vs. the 'or' matcher: we must find an "and X, 1" instruction somewhere
+    // in the chain to know that all of the high bits are cleared.
+    if (match(V, m_And(m_Value(Op0), m_One()))) {
+      MOps.FoundAnd1 = true;
+      return matchAndOrChain(Op0, MOps);
+    }
+    if (match(V, m_And(m_Value(Op0), m_Value(Op1))))
+      return matchAndOrChain(Op0, MOps) && matchAndOrChain(Op1, MOps);
+  } else {
+    // Recurse through a chain of 'or' operands.
+    if (match(V, m_Or(m_Value(Op0), m_Value(Op1))))
+      return matchAndOrChain(Op0, MOps) && matchAndOrChain(Op1, MOps);
+  }
 
   // We need a shift-right or a bare value representing a compare of bit 0 of
   // the original source operand.
   Value *Candidate;
   uint64_t BitIndex = 0;
   if (!match(V, m_LShr(m_Value(Candidate), m_ConstantInt(BitIndex))))
     Candidate = V;
 
   // Initialize result source operand.
-  if (!Result.first)
-    Result.first = Candidate;
+  if (!MOps.Root)
+    MOps.Root = Candidate;
 
   // Fill in the mask bit derived from the shift constant.
-  Result.second.setBit(BitIndex);
-  return Result.first == Candidate;
-}
-
-/// Match an 'and' of a chain of or-shifted bits from a common source value into
-/// a masked compare:
-/// and (or (lshr X, C), ...), 1 --> (X & C') != 0
-static bool foldToMaskedCmp(Instruction &I) {
-  // TODO: This is only looking for 'any-bits-set' and 'all-bits-clear'.
-  // We should also match 'all-bits-set' and 'any-bits-clear' by looking for
-  // a chain of 'and'.
-  if (!match(&I, m_And(m_OneUse(m_Or(m_Value(), m_Value())), m_One())))
+  MOps.Mask.setBit(BitIndex);
+  return MOps.Root == Candidate;
+}
+
+/// Match patterns that correspond to "any-bits-set" and "all-bits-set".
+/// These will include a chain of 'or' or 'and'-shifted bits from a
+/// common source value:
+/// and (or (lshr X, C), ...), 1 --> (X & CMask) != 0
+/// and (and (lshr X, C), ...), 1 --> (X & CMask) == CMask
+/// Note: "any-bits-clear" and "all-bits-clear" are variations of these patterns
+/// that differ only with a final 'not' of the result. We expect that final
+/// 'not' to be folded with the compare that we create here (invert predicate).
+static bool foldAnyOrAllBitsSet(Instruction &I) {
+  // The 'any-bits-set' ('or' chain) pattern is simpler to match because the
+  // final "and X, 1" instruction must be the final op in the sequence.
+  bool MatchAllBitsSet;
+  if (match(&I, m_c_And(m_OneUse(m_And(m_Value(), m_Value())), m_Value())))
+    MatchAllBitsSet = true;
+  else if (match(&I, m_And(m_OneUse(m_Or(m_Value(), m_Value())), m_One())))
+    MatchAllBitsSet = false;
+  else
     return false;
 
-  std::pair<Value *, APInt>
-      MaskOps(nullptr, APInt::getNullValue(I.getType()->getScalarSizeInBits()));
-  if (!matchMaskedCmpOp(cast<BinaryOperator>(&I)->getOperand(0), MaskOps))
-    return false;
-
+  MaskOps MOps(I.getType()->getScalarSizeInBits(), MatchAllBitsSet);
+  if (MatchAllBitsSet) {
+    if (!matchAndOrChain(cast<BinaryOperator>(&I), MOps) || !MOps.FoundAnd1)
+      return false;
+  } else {
+    if (!matchAndOrChain(cast<BinaryOperator>(&I)->getOperand(0), MOps))
+      return false;
+  }
+
+  // The pattern was found. Create a masked compare that replaces all of the
+  // shift and logic ops.
   IRBuilder<> Builder(&I);
-  Value *Mask = Builder.CreateAnd(MaskOps.first, MaskOps.second);
-  Value *CmpZero = Builder.CreateIsNotNull(Mask);
-  Value *Zext = Builder.CreateZExt(CmpZero, I.getType());
+  Constant *Mask = ConstantInt::get(I.getType(), MOps.Mask);
+  Value *And = Builder.CreateAnd(MOps.Root, Mask);
+  Value *Cmp = MatchAllBitsSet ? Builder.CreateICmpEQ(And, Mask) :
+                                 Builder.CreateIsNotNull(And);
+  Value *Zext = Builder.CreateZExt(Cmp, I.getType());
   I.replaceAllUsesWith(Zext);
   return true;
 }
...
     if (!DT.isReachableFromEntry(&BB))
       continue;
     // Do not delete instructions under here and invalidate the iterator.
-    for (Instruction &I : BB)
-      MadeChange |= foldToMaskedCmp(I);
+    // Walk the block backwards for efficiency. We're matching a chain of
+    // use->defs, so we're more likely to succeed by starting from the bottom.
+    // Also, we want to avoid matching partial patterns.
+    // TODO: It would be more efficient if we removed dead instructions
+    // iteratively in this loop rather than waiting until the end.
+    for (Instruction &I : make_range(BB.rbegin(), BB.rend()))
+      MadeChange |= foldAnyOrAllBitsSet(I);
   }
 
   // We're done with transforms, so remove dead instructions.
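The recursion in matchAndOrChain() is compact, so here is a hypothetical,
self-contained C++ model of the 'and'-chain case to show how Root, Mask, and
FoundAnd1 interact. The Expr and MaskState types are invented for this sketch;
this is not the LLVM API.

    #include <cstdint>

    // Toy expression node: a leaf "Root >> Shift", an "and X, 1" wrapper,
    // or a binary node combining two sub-chains with 'and'.
    struct Expr {
      enum Kind { Leaf, And1, Bin } K = Leaf;
      const char *Root = nullptr;            // Leaf: name of the shifted source
      uint64_t Shift = 0;                    // Leaf: shift amount (bit index)
      const Expr *L = nullptr, *R = nullptr; // And1 uses L; Bin uses L and R
    };

    struct MaskState {
      const char *Root = nullptr; // common source value, set by the first leaf
      uint64_t Mask = 0;          // one bit per matched shift amount
      bool FoundAnd1 = false;     // saw "and X, 1" somewhere in the chain
    };

    // Mirrors the 'and'-chain case of matchAndOrChain(): accept "and X, 1"
    // anywhere, recurse through binary nodes, and require that every leaf
    // shifts the same source value.
    bool walk(const Expr *E, MaskState &S) {
      if (E->K == Expr::And1) {
        S.FoundAnd1 = true;
        return walk(E->L, S);
      }
      if (E->K == Expr::Bin)
        return walk(E->L, S) && walk(E->R, S);
      if (!S.Root)
        S.Root = E->Root;
      S.Mask |= uint64_t(1) << E->Shift;
      return S.Root == E->Root;
    }

Walking the tree for "and (and (X >> 1), 1), (X >> 4)" sets FoundAnd1 and
accumulates Mask = 0x12, matching the doc comment on matchAndOrChain(). A bare
leaf with Shift == 0 plays the role of the "Candidate = V" fallback in the
real matcher.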
---- tests: AggressiveInstCombine any/all-bits-set folds ----

   ret i32 %r
 }
 
-; TODO: Recognize the 'and' sibling pattern. The 'and 1' may not be at the end.
+; Recognize the 'and' sibling pattern (all-bits-set). The 'and 1' may not be at the end.
+
+define i32 @allset_two_bit_mask(i32 %x) {
+; CHECK-LABEL: @allset_two_bit_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[X:%.*]], 129
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 129
+; CHECK-NEXT: [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; CHECK-NEXT: ret i32 [[TMP3]]
+;
+  %s = lshr i32 %x, 7
+  %o = and i32 %s, %x
+  %r = and i32 %o, 1
+  ret i32 %r
+}
 
 define i64 @allset_four_bit_mask(i64 %x) {
 ; CHECK-LABEL: @allset_four_bit_mask(
-; CHECK-NEXT: [[T1:%.*]] = lshr i64 [[X:%.*]], 1
-; CHECK-NEXT: [[T2:%.*]] = lshr i64 [[X]], 2
-; CHECK-NEXT: [[T3:%.*]] = lshr i64 [[X]], 3
-; CHECK-NEXT: [[T4:%.*]] = lshr i64 [[X]], 4
-; CHECK-NEXT: [[A1:%.*]] = and i64 [[T4]], 1
-; CHECK-NEXT: [[A2:%.*]] = and i64 [[T2]], [[A1]]
-; CHECK-NEXT: [[A3:%.*]] = and i64 [[A2]], [[T1]]
-; CHECK-NEXT: [[R:%.*]] = and i64 [[A3]], [[T3]]
-; CHECK-NEXT: ret i64 [[R]]
+; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[X:%.*]], 30
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 30
+; CHECK-NEXT: [[TMP3:%.*]] = zext i1 [[TMP2]] to i64
+; CHECK-NEXT: ret i64 [[TMP3]]
 ;
   %t1 = lshr i64 %x, 1
   %t2 = lshr i64 %x, 2
   %t3 = lshr i64 %x, 3
   %t4 = lshr i64 %x, 4
   %a1 = and i64 %t4, 1
   %a2 = and i64 %t2, %a1
   %a3 = and i64 %a2, %t1
   %r = and i64 %a3, %t3
   ret i64 %r
 }
 
+declare void @use(i32)
+
+; negative test - extra use means the transform would increase instruction count
+
+define i32 @allset_two_bit_mask_multiuse(i32 %x) {
+; CHECK-LABEL: @allset_two_bit_mask_multiuse(
+; CHECK-NEXT: [[S:%.*]] = lshr i32 [[X:%.*]], 7
+; CHECK-NEXT: [[O:%.*]] = and i32 [[S]], [[X]]
+; CHECK-NEXT: [[R:%.*]] = and i32 [[O]], 1
+; CHECK-NEXT: call void @use(i32 [[O]])
+; CHECK-NEXT: ret i32 [[R]]
+;
+  %s = lshr i32 %x, 7
+  %o = and i32 %s, %x
+  %r = and i32 %o, 1
+  call void @use(i32 %o)
+  ret i32 %r
+}
+
+; negative test - missing 'and 1' mask, so more than the low bit is used here
+
+define i8 @allset_three_bit_mask_no_and1(i8 %x) {
+; CHECK-LABEL: @allset_three_bit_mask_no_and1(
+; CHECK-NEXT: [[T1:%.*]] = lshr i8 [[X:%.*]], 1
+; CHECK-NEXT: [[T2:%.*]] = lshr i8 [[X]], 2
+; CHECK-NEXT: [[T3:%.*]] = lshr i8 [[X]], 3
+; CHECK-NEXT: [[A2:%.*]] = and i8 [[T1]], [[T2]]
+; CHECK-NEXT: [[R:%.*]] = and i8 [[A2]], [[T3]]
+; CHECK-NEXT: ret i8 [[R]]
+;
+  %t1 = lshr i8 %x, 1
+  %t2 = lshr i8 %x, 2
+  %t3 = lshr i8 %x, 3
+  %a2 = and i8 %t1, %t2
+  %r = and i8 %a2, %t3
+  ret i8 %r
+}
+
+; This test demonstrates that the transform can be large. If the implementation
+; is slow or explosive (stack overflow due to recursion), it should be made efficient.
+
+define i64 @allset_40_bit_mask(i64 %x) {
+; CHECK-LABEL: @allset_40_bit_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[X:%.*]], 2199023255550
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 2199023255550
+; CHECK-NEXT: [[TMP3:%.*]] = zext i1 [[TMP2]] to i64
+; CHECK-NEXT: ret i64 [[TMP3]]
+;
+  %t1 = lshr i64 %x, 1
+  %t2 = lshr i64 %x, 2
+  %t3 = lshr i64 %x, 3
+  %t4 = lshr i64 %x, 4
+  %t5 = lshr i64 %x, 5
+  %t6 = lshr i64 %x, 6
+  %t7 = lshr i64 %x, 7
+  %t8 = lshr i64 %x, 8
+  %t9 = lshr i64 %x, 9
+  %t10 = lshr i64 %x, 10
+  %t11 = lshr i64 %x, 11
+  %t12 = lshr i64 %x, 12
+  %t13 = lshr i64 %x, 13
+  %t14 = lshr i64 %x, 14
+  %t15 = lshr i64 %x, 15
+  %t16 = lshr i64 %x, 16
+  %t17 = lshr i64 %x, 17
+  %t18 = lshr i64 %x, 18
+  %t19 = lshr i64 %x, 19
+  %t20 = lshr i64 %x, 20
+  %t21 = lshr i64 %x, 21
+  %t22 = lshr i64 %x, 22
+  %t23 = lshr i64 %x, 23
+  %t24 = lshr i64 %x, 24
+  %t25 = lshr i64 %x, 25
+  %t26 = lshr i64 %x, 26
+  %t27 = lshr i64 %x, 27
+  %t28 = lshr i64 %x, 28
+  %t29 = lshr i64 %x, 29
+  %t30 = lshr i64 %x, 30
+  %t31 = lshr i64 %x, 31
+  %t32 = lshr i64 %x, 32
+  %t33 = lshr i64 %x, 33
+  %t34 = lshr i64 %x, 34
+  %t35 = lshr i64 %x, 35
+  %t36 = lshr i64 %x, 36
+  %t37 = lshr i64 %x, 37
+  %t38 = lshr i64 %x, 38
+  %t39 = lshr i64 %x, 39
+  %t40 = lshr i64 %x, 40
+
+  %a1 = and i64 %t1, 1
+  %a2 = and i64 %t2, %a1
+  %a3 = and i64 %t3, %a2
+  %a4 = and i64 %t4, %a3
+  %a5 = and i64 %t5, %a4
+  %a6 = and i64 %t6, %a5
+  %a7 = and i64 %t7, %a6
+  %a8 = and i64 %t8, %a7
+  %a9 = and i64 %t9, %a8
+  %a10 = and i64 %t10, %a9
+  %a11 = and i64 %t11, %a10
+  %a12 = and i64 %t12, %a11
+  %a13 = and i64 %t13, %a12
+  %a14 = and i64 %t14, %a13
+  %a15 = and i64 %t15, %a14
+  %a16 = and i64 %t16, %a15
+  %a17 = and i64 %t17, %a16
+  %a18 = and i64 %t18, %a17
+  %a19 = and i64 %t19, %a18
+  %a20 = and i64 %t20, %a19
+  %a21 = and i64 %t21, %a20
+  %a22 = and i64 %t22, %a21
+  %a23 = and i64 %t23, %a22
+  %a24 = and i64 %t24, %a23
+  %a25 = and i64 %t25, %a24
+  %a26 = and i64 %t26, %a25
+  %a27 = and i64 %t27, %a26
+  %a28 = and i64 %t28, %a27
+  %a29 = and i64 %t29, %a28
+  %a30 = and i64 %t30, %a29
+  %a31 = and i64 %t31, %a30
+  %a32 = and i64 %t32, %a31
+  %a33 = and i64 %t33, %a32
+  %a34 = and i64 %t34, %a33
+  %a35 = and i64 %t35, %a34
+  %a36 = and i64 %t36, %a35
+  %a37 = and i64 %t37, %a36
+  %a38 = and i64 %t38, %a37
+  %a39 = and i64 %t39, %a38
+  %a40 = and i64 %t40, %a39
+
+  ret i64 %a40
+}
+
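A quick check of the masks in these tests: @allset_two_bit_mask combines the
bare %x (bit 0) with a shift by 7, so the mask is (1 << 0) | (1 << 7) = 0x81 =
129. @allset_four_bit_mask covers bits 1 through 4, so the mask is 0b11110 =
30. @allset_40_bit_mask covers bits 1 through 40, so the mask is 2^41 - 2 =
2199023255550.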
---- tests: bitfield bit-tests (PR37098) ----

 
 define i32 @allset(i32 %a) {
 ; CHECK-LABEL: @allset(
-; CHECK-NEXT: [[BF_LSHR:%.*]] = lshr i32 [[A:%.*]], 1
-; CHECK-NEXT: [[BF_LSHR5:%.*]] = lshr i32 [[A]], 2
-; CHECK-NEXT: [[BF_LSHR10:%.*]] = lshr i32 [[A]], 3
-; CHECK-NEXT: [[BF_CLEAR2:%.*]] = and i32 [[A]], 1
-; CHECK-NEXT: [[AND:%.*]] = and i32 [[BF_CLEAR2]], [[BF_LSHR]]
-; CHECK-NEXT: [[AND8:%.*]] = and i32 [[AND]], [[BF_LSHR5]]
-; CHECK-NEXT: [[AND13:%.*]] = and i32 [[AND8]], [[BF_LSHR10]]
-; CHECK-NEXT: ret i32 [[AND13]]
+; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[A:%.*]], 15
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 15
+; CHECK-NEXT: [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; CHECK-NEXT: ret i32 [[TMP3]]
 ;
   %a.sroa.0.0.trunc = trunc i32 %a to i8
   %a.sroa.5.0.shift = lshr i32 %a, 8
...
 
 define i32 @anyclear(i32 %a) {
 ; CHECK-LABEL: @anyclear(
-; CHECK-NEXT: [[BF_LSHR:%.*]] = lshr i32 [[A:%.*]], 1
-; CHECK-NEXT: [[BF_LSHR5:%.*]] = lshr i32 [[A]], 2
-; CHECK-NEXT: [[BF_LSHR10:%.*]] = lshr i32 [[A]], 3
-; CHECK-NEXT: [[BF_CLEAR2:%.*]] = and i32 [[A]], 1
-; CHECK-NEXT: [[AND:%.*]] = and i32 [[BF_CLEAR2]], [[BF_LSHR]]
-; CHECK-NEXT: [[AND8:%.*]] = and i32 [[AND]], [[BF_LSHR5]]
-; CHECK-NEXT: [[AND13:%.*]] = and i32 [[AND8]], [[BF_LSHR10]]
-; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[AND13]], 1
-; CHECK-NEXT: ret i32 [[TMP1]]
+; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[A:%.*]], 15
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 15
+; CHECK-NEXT: [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; CHECK-NEXT: ret i32 [[TMP3]]
 ;
   %a.sroa.0.0.trunc = trunc i32 %a to i8
   %a.sroa.5.0.shift = lshr i32 %a, 8
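In @anyclear, the bitfield test is the logical 'not' of @allset, which
previously surfaced as a trailing 'xor i32 ..., 1' after the chain of 'and'
ops. With the new fold, the expected output is the same masked compare with
the predicate inverted (icmp ne against 15), exactly as anticipated by the
"invert predicate" comment in foldAnyOrAllBitsSet().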