llvm.org GIT mirror llvm / c22ac1d
Use trigrams to speed up SpecialCaseList. Summary: it's often the case that the rules in the SpecialCaseList are of the form hel.o*bar. That gives us a chance to build a trigram index to quickly discard 99% of inputs without running a full regex. A similar idea was used in Google Code Search as described in the blog post: https://swtch.com/~rsc/regexp/regexp4.html The check is defeated if there's at least one regex more complicated than that. In this case, all inputs will go through the regex. That said, the real-world rules are often simple or can be simplified. That considerably speeds up compiling Chromium with CFI and UBSan. As measured on Chromium's content_message_generator.cc: before, CFI: 44 s after, CFI: 23 s after, CFI, no blacklist: 23 s (~1% slower, but 3 runs were unable to show the difference) after, regular compilation to bitcode: 23 s Reviewers: pcc Subscribers: mgorny, llvm-commits Differential Revision: https://reviews.llvm.org/D27188 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@288303 91177308-0d34-0410-b5e6-96231b3b80d8 Ivan Krasin 4 years ago
7 changed file(s) with 336 addition(s) and 2 deletion(s). Raw diff Collapse all Expand all
0 //===-- TrigramIndex.h - a heuristic for SpecialCaseList --------*- C++ -*-===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //===----------------------------------------------------------------------===//
7 //
8 // TrigramIndex implements a heuristic for SpecialCaseList that allows to
9 // filter out ~99% incoming queries when all regular expressions in the
10 // SpecialCaseList are simple wildcards with '*' and '.'. If rules are more
11 // complicated, the check is defeated and it will always pass the queries to a
12 // full regex.
13 //
14 // The basic idea is that in order for a wildcard to match a query, the query
15 // needs to have all trigrams which occur in the wildcard. We create a trigram
16 // index (trigram -> list of rules with it) and then count trigrams in the query
17 // for each rule. If the count for one of the rules reaches the expected value,
18 // the check passes the query to a regex. If none of the rules got enough
19 // trigrams, the check tells that the query is definitely not matched by any
20 // of the rules, and no regex matching is needed.
21 // A similar idea was used in Google Code Search as described in the blog post:
22 // https://swtch.com/~rsc/regexp/regexp4.html
23 //
24 //===----------------------------------------------------------------------===//
25
26 #ifndef LLVM_SUPPORT_TRIGRAMINDEX_H
27 #define LLVM_SUPPORT_TRIGRAMINDEX_H
28
29 #include "llvm/ADT/SmallVector.h"
30 #include "llvm/ADT/StringMap.h"
31
32 #include
33 #include
34 #include
35
36 namespace llvm {
37 class StringRef;
38
39 class TrigramIndex {
40 public:
41 /// Inserts a new Regex into the index.
42 void insert(std::string Regex);
43
44 /// Returns true, if special case list definitely does not have a line
45 /// that matches the query. Returns false, if it's not sure.
46 bool isDefinitelyOut(StringRef Query) const;
47
48 /// Returned true, iff the heuristic is defeated and not useful.
49 /// In this case isDefinitelyOut always returns false.
50 bool isDefeated() { return Defeated; }
51 private:
52 // If true, the rules are too complicated for the check to work, and full
53 // regex matching is needed for every rule.
54 bool Defeated = false;
55 // The minimum number of trigrams which should match for a rule to have a
56 // chance to match the query. The number of elements equals the number of
57 // regex rules in the SpecialCaseList.
58 std::vector Counts;
59 // Index holds a list of rules indices for each trigram. The same indices
60 // are used in Counts to store per-rule limits.
61 // If a trigram is too common (>4 rules with it), we stop tracking it,
62 // which increases the probability for a need to match using regex, but
63 // decreases the costs in the regular case.
64 std::unordered_map> Index{256};
65 };
66
67 } // namespace llvm
68
69 #endif // LLVM_SUPPORT_TRIGRAMINDEX_H
9393 ThreadPool.cpp
9494 Timer.cpp
9595 ToolOutputFile.cpp
96 TrigramIndex.cpp
9697 Triple.cpp
9798 Twine.cpp
9899 Unicode.cpp
1414 //===----------------------------------------------------------------------===//
1515
1616 #include "llvm/Support/SpecialCaseList.h"
17 #include "llvm/Support/TrigramIndex.h"
1718 #include "llvm/ADT/SmallVector.h"
1819 #include "llvm/ADT/StringExtras.h"
1920 #include "llvm/ADT/StringSet.h"
3233 /// literal strings than Regex.
3334 struct SpecialCaseList::Entry {
3435 StringSet<> Strings;
36 TrigramIndex Trigrams;
3537 std::unique_ptr RegEx;
3638
3739 bool match(StringRef Query) const {
38 return Strings.count(Query) || (RegEx && RegEx->match(Query));
40 if (Strings.count(Query))
41 return true;
42 if (Trigrams.isDefinitelyOut(Query))
43 return false;
44 return RegEx && RegEx->match(Query);
3945 }
4046 };
4147
103109 StringRef Category = SplitRegexp.second;
104110
105111 // See if we can store Regexp in Strings.
112 auto &Entry = Entries[Prefix][Category];
106113 if (Regex::isLiteralERE(Regexp)) {
107 Entries[Prefix][Category].Strings.insert(Regexp);
114 Entry.Strings.insert(Regexp);
108115 continue;
109116 }
117 Entry.Trigrams.insert(Regexp);
110118
111119 // Replace * with .*
112120 for (size_t pos = 0; (pos = Regexp.find('*', pos)) != std::string::npos;
0 //===-- TrigramIndex.cpp - a heuristic for SpecialCaseList ----------------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // TrigramIndex implements a heuristic for SpecialCaseList that allows to
10 // filter out ~99% incoming queries when all regular expressions in the
11 // SpecialCaseList are simple wildcards with '*' and '.'. If rules are more
12 // complicated, the check is defeated and it will always pass the queries to a
13 // full regex.
14 //
15 //===----------------------------------------------------------------------===//
16
17 #include "llvm/Support/TrigramIndex.h"
18 #include "llvm/ADT/SmallVector.h"
19
20 #include
21 #include
22 #include
23
24 using namespace llvm;
25
26 static const char RegexAdvancedMetachars[] = "()^$|+?[]\\{}";
27
28 static bool isSimpleWildcard(StringRef Str) {
29 // Check for regex metacharacters other than '*' and '.'.
30 return Str.find_first_of(RegexAdvancedMetachars) == StringRef::npos;
31 }
32
33 void TrigramIndex::insert(std::string Regex) {
34 if (Defeated) return;
35 if (!isSimpleWildcard(Regex)) {
36 Defeated = true;
37 return;
38 }
39
40 std::set Was;
41 unsigned Cnt = 0;
42 unsigned Tri = 0;
43 unsigned Len = 0;
44 for (unsigned Char : Regex) {
45 if (Char == '.' || Char == '*') {
46 Tri = 0;
47 Len = 0;
48 continue;
49 }
50 Tri = ((Tri << 8) + Char) & 0xFFFFFF;
51 Len++;
52 if (Len < 3)
53 continue;
54 // We don't want the index to grow too much for the popular trigrams,
55 // as they are weak signals. It's ok to still require them for the
56 // rules we have already processed. It's just a small additional
57 // computational cost.
58 if (Index[Tri].size() >= 4)
59 continue;
60 Cnt++;
61 if (!Was.count(Tri)) {
62 // Adding the current rule to the index.
63 Index[Tri].push_back(Counts.size());
64 Was.insert(Tri);
65 }
66 }
67 if (!Cnt) {
68 // This rule does not have remarkable trigrams to rely on.
69 // We have to always call the full regex chain.
70 Defeated = true;
71 return;
72 }
73 Counts.push_back(Cnt);
74 }
75
76 bool TrigramIndex::isDefinitelyOut(StringRef Query) const {
77 if (Defeated)
78 return false;
79 std::vector CurCounts(Counts.size());
80 unsigned Tri = 0;
81 for (size_t I = 0; I < Query.size(); I++) {
82 Tri = ((Tri << 8) + Query[I]) & 0xFFFFFF;
83 if (I < 2)
84 continue;
85 const auto &II = Index.find(Tri);
86 if (II == Index.end())
87 continue;
88 for (size_t J : II->second) {
89 CurCounts[J]++;
90 // If we have reached a desired limit, we have to look at the query
91 // more closely by running a full regex.
92 if (CurCounts[J] >= Counts[J])
93 return false;
94 }
95 }
96 return true;
97 }
4747 TimerTest.cpp
4848 TypeNameTest.cpp
4949 TrailingObjectsTest.cpp
50 TrigramIndexTest.cpp
5051 UnicodeTest.cpp
5152 YAMLIOTest.cpp
5253 YAMLParserTest.cpp
133133 sys::fs::remove(Path);
134134 }
135135
136 TEST_F(SpecialCaseListTest, NoTrigramsInRules) {
137 std::unique_ptr SCL = makeSpecialCaseList("fun:b.r\n"
138 "fun:za*az\n");
139 EXPECT_TRUE(SCL->inSection("fun", "bar"));
140 EXPECT_FALSE(SCL->inSection("fun", "baz"));
141 EXPECT_TRUE(SCL->inSection("fun", "zakaz"));
142 EXPECT_FALSE(SCL->inSection("fun", "zaraza"));
136143 }
144
145 TEST_F(SpecialCaseListTest, NoTrigramsInARule) {
146 std::unique_ptr SCL = makeSpecialCaseList("fun:*bar*\n"
147 "fun:za*az\n");
148 EXPECT_TRUE(SCL->inSection("fun", "abara"));
149 EXPECT_FALSE(SCL->inSection("fun", "bor"));
150 EXPECT_TRUE(SCL->inSection("fun", "zakaz"));
151 EXPECT_FALSE(SCL->inSection("fun", "zaraza"));
152 }
153
154 TEST_F(SpecialCaseListTest, RepetitiveRule) {
155 std::unique_ptr SCL = makeSpecialCaseList("fun:*bar*bar*bar*bar*\n"
156 "fun:bar*\n");
157 EXPECT_TRUE(SCL->inSection("fun", "bara"));
158 EXPECT_FALSE(SCL->inSection("fun", "abara"));
159 EXPECT_TRUE(SCL->inSection("fun", "barbarbarbar"));
160 EXPECT_TRUE(SCL->inSection("fun", "abarbarbarbar"));
161 EXPECT_FALSE(SCL->inSection("fun", "abarbarbar"));
162 }
163
164 TEST_F(SpecialCaseListTest, SpecialSymbolRule) {
165 std::unique_ptr SCL = makeSpecialCaseList("src:*c\\+\\+abi*\n");
166 EXPECT_TRUE(SCL->inSection("src", "c++abi"));
167 EXPECT_FALSE(SCL->inSection("src", "c\\+\\+abi"));
168 }
169
170 TEST_F(SpecialCaseListTest, PopularTrigram) {
171 std::unique_ptr SCL = makeSpecialCaseList("fun:*aaaaaa*\n"
172 "fun:*aaaaa*\n"
173 "fun:*aaaa*\n"
174 "fun:*aaa*\n");
175 EXPECT_TRUE(SCL->inSection("fun", "aaa"));
176 EXPECT_TRUE(SCL->inSection("fun", "aaaa"));
177 EXPECT_TRUE(SCL->inSection("fun", "aaaabbbaaa"));
178 }
179
180 }
0 //===- TrigramIndexTest.cpp - Unit tests for TrigramIndex -----------------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8
9 #include "llvm/ADT/STLExtras.h"
10 #include "llvm/Support/TrigramIndex.h"
11 #include "gtest/gtest.h"
12
13 #include
14 #include
15
16 using namespace llvm;
17
18 namespace {
19
20 class TrigramIndexTest : public ::testing::Test {
21 protected:
22 std::unique_ptr makeTrigramIndex(
23 std::vector Rules) {
24 std::unique_ptr TI =
25 make_unique();
26 for (auto &Rule : Rules)
27 TI->insert(Rule);
28 return TI;
29 }
30 };
31
32 TEST_F(TrigramIndexTest, Empty) {
33 std::unique_ptr TI =
34 makeTrigramIndex({});
35 EXPECT_FALSE(TI->isDefeated());
36 EXPECT_TRUE(TI->isDefinitelyOut("foo"));
37 }
38
39 TEST_F(TrigramIndexTest, Basic) {
40 std::unique_ptr TI =
41 makeTrigramIndex({"*hello*", "*wor.d*"});
42 EXPECT_FALSE(TI->isDefeated());
43 EXPECT_TRUE(TI->isDefinitelyOut("foo"));
44 }
45
46 TEST_F(TrigramIndexTest, NoTrigramsInRules) {
47 std::unique_ptr TI =
48 makeTrigramIndex({"b.r", "za*az"});
49 EXPECT_TRUE(TI->isDefeated());
50 EXPECT_FALSE(TI->isDefinitelyOut("foo"));
51 EXPECT_FALSE(TI->isDefinitelyOut("bar"));
52 EXPECT_FALSE(TI->isDefinitelyOut("zakaz"));
53 }
54
55 TEST_F(TrigramIndexTest, NoTrigramsInARule) {
56 std::unique_ptr TI =
57 makeTrigramIndex({"*hello*", "*wo.ld*"});
58 EXPECT_TRUE(TI->isDefeated());
59 EXPECT_FALSE(TI->isDefinitelyOut("foo"));
60 }
61
62 TEST_F(TrigramIndexTest, RepetitiveRule) {
63 std::unique_ptr TI =
64 makeTrigramIndex({"*bar*bar*bar*bar*bar", "bar*bar"});
65 EXPECT_FALSE(TI->isDefeated());
66 EXPECT_TRUE(TI->isDefinitelyOut("foo"));
67 EXPECT_TRUE(TI->isDefinitelyOut("bar"));
68 EXPECT_FALSE(TI->isDefinitelyOut("barbara"));
69 EXPECT_FALSE(TI->isDefinitelyOut("bar+bar"));
70 }
71
72 TEST_F(TrigramIndexTest, PopularTrigram) {
73 std::unique_ptr TI =
74 makeTrigramIndex({"*aaa*", "*aaaa*", "*aaaaa*", "*aaaaa*", "*aaaaaa*"});
75 EXPECT_TRUE(TI->isDefeated());
76 }
77
78 TEST_F(TrigramIndexTest, PopularTrigram2) {
79 std::unique_ptr TI =
80 makeTrigramIndex({"class1.h", "class2.h", "class3.h", "class4.h", "class.h"});
81 EXPECT_TRUE(TI->isDefeated());
82 }
83
84 TEST_F(TrigramIndexTest, TooComplicatedRegex) {
85 std::unique_ptr TI =
86 makeTrigramIndex({"[0-9]+"});
87 EXPECT_TRUE(TI->isDefeated());
88 }
89
90 TEST_F(TrigramIndexTest, TooComplicatedRegex2) {
91 std::unique_ptr TI =
92 makeTrigramIndex({"foo|bar"});
93 EXPECT_TRUE(TI->isDefeated());
94 }
95
96 TEST_F(TrigramIndexTest, SpecialSymbol) {
97 std::unique_ptr TI =
98 makeTrigramIndex({"*c\\+\\+*"});
99 EXPECT_TRUE(TI->isDefeated());
100 }
101
102 TEST_F(TrigramIndexTest, Sequence) {
103 std::unique_ptr TI =
104 makeTrigramIndex({"class1.h", "class2.h", "class3.h", "class4.h"});
105 EXPECT_FALSE(TI->isDefeated());
106 EXPECT_FALSE(TI->isDefinitelyOut("class1"));
107 EXPECT_TRUE(TI->isDefinitelyOut("class.h"));
108 EXPECT_TRUE(TI->isDefinitelyOut("class"));
109 }
110
111 } // namespace