llvm.org GIT mirror llvm / 6b73148
Add backreference matching capabilities to Support/Regex, with appropriate unit tests. This change in itself is not expected to affect any functionality at this point, but it will serve as a stepping stone to improve FileCheck's variable matching capabilities. Luckily, our regex implementation already supports backreferences, although a bit of hacking is required to enable them. It supports both Basic Regular Expressions (BREs) and Extended Regular Expressions (EREs), without supporting backrefs for EREs, following POSIX strictly in this respect. And EREs are what we actually use (rightly). This is contrary to many implementations (including the default on Linux) of POSIX regexes that do allow backrefs in EREs. Adding backref support to our EREs is a very simple change in the regcomp parsing code. I fail to think of significant cases where it would clash with existing things, and it can bring more versatility to the regexes we write. There's always the danger of a backref in a specially crafted regex causing exponential matching times, but since we mainly use them for testing purposes I don't think it's a big problem. [It can also be placed behind a flag specific to FileCheck, if needed]. For more details, see: * http://lists.cs.uiuc.edu/pipermail/llvmdev/2012-November/055840.html * http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20121126/156878.html git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168802 91177308-0d34-0410-b5e6-96231b3b80d8 Eli Bendersky 7 years ago
4 changed file(s) with 64 addition(s) and 8 deletion(s). Raw diff Collapse all Expand all
66 //
77 //===----------------------------------------------------------------------===//
88 //
9 // This file implements a POSIX regular expression matcher.
9 // This file implements a POSIX regular expression matcher. Both Basic and
10 // Extended POSIX regular expressions (ERE) are supported. EREs were extended
11 // to support backreferences in matches.
12 // This implementation also supports matching strings with embedded NUL chars.
1013 //
1114 //===----------------------------------------------------------------------===//
1215
3235 /// null string after any newline in the string in addition to its normal
3336 /// function, and the $ anchor matches the null string before any
3437 /// newline in the string in addition to its normal function.
35 Newline=2
38 Newline=2,
39 /// By default, the POSIX extended regular expression (ERE) syntax is
40 /// assumed. Pass this flag to turn on basic regular expressions (BRE)
41 /// instead.
42 BasicRegex=4
3643 };
3744
38 /// Compiles the given POSIX Extended Regular Expression \p Regex.
39 /// This implementation supports regexes and matching strings with embedded
40 /// NUL characters.
45 /// Compiles the given regular expression \p Regex.
4146 Regex(StringRef Regex, unsigned Flags = NoFlags);
4247 ~Regex();
4348
2626 flags |= REG_ICASE;
2727 if (Flags & Newline)
2828 flags |= REG_NEWLINE;
29 error = llvm_regcomp(preg, regex.data(), flags|REG_EXTENDED|REG_PEND);
29 if (!(Flags & BasicRegex))
30 flags |= REG_EXTENDED;
31 error = llvm_regcomp(preg, regex.data(), flags|REG_PEND);
3032 }
3133
3234 Regex::~Regex() {
302302 sopno pos;
303303 int count;
304304 int count2;
305 int backrefnum;
305306 sopno subno;
306307 int wascaret = 0;
307308
369370 case '\\':
370371 REQUIRE(MORE(), REG_EESCAPE);
371372 c = GETNEXT();
372 ordinary(p, c);
373 if (c >= '1' && c <= '9') {
374 /* \[1-9] is taken to be a back-reference to a previously specified
375 * matching group. backrefnum will hold the number. The matching
376 * group must exist (i.e. if \4 is found there must have been at
377 * least 4 matching groups specified in the pattern previously).
378 */
379 backrefnum = c - '0';
380 if (p->pend[backrefnum] == 0) {
381 SETERROR(REG_ESUBREG);
382 break;
383 }
384
385 /* Make sure everything checks out and emit the sequence
386 * that marks a back-reference to the parse structure.
387 */
388 assert(backrefnum <= p->g->nsub);
389 EMIT(OBACK_, backrefnum);
390 assert(p->pbegin[backrefnum] != 0);
391 assert(OP(p->strip[p->pbegin[backrefnum]]) != OLPAREN);
392 assert(OP(p->strip[p->pend[backrefnum]]) != ORPAREN);
393 (void) dupl(p, p->pbegin[backrefnum]+1, p->pend[backrefnum]);
394 EMIT(O_BACK, backrefnum);
395 p->g->backrefs = 1;
396 } else {
397 /* Other chars are simply themselves when escaped with a backslash.
398 */
399 ordinary(p, c);
400 }
373401 break;
374402 case '{': /* okay as ordinary except if digit follows */
375403 REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
5050 EXPECT_EQ(1u, Matches.size());
5151 EXPECT_EQ(String, Matches[0].str());
5252
53
5453 std::string NulPattern="X[0-9]+X([a-f])?:([0-9]+)";
5554 String="YX99a:513b";
5655 NulPattern[7] = '\0';
5958 EXPECT_FALSE(r5.match("X9"));
6059 String[3]='\0';
6160 EXPECT_TRUE(r5.match(String));
61 }
62
63 TEST_F(RegexTest, Backreferences) {
64 Regex r1("([a-z]+)_\\1");
65 SmallVector<StringRef, 4> Matches;
66 EXPECT_TRUE(r1.match("abc_abc", &Matches));
67 EXPECT_EQ(2u, Matches.size());
68 EXPECT_FALSE(r1.match("abc_ab", &Matches));
69
70 Regex r2("a([0-9])b\\1c\\1");
71 EXPECT_TRUE(r2.match("a4b4c4", &Matches));
72 EXPECT_EQ(2u, Matches.size());
73 EXPECT_EQ("4", Matches[1].str());
74 EXPECT_FALSE(r2.match("a2b2c3"));
75
76 Regex r3("a([0-9])([a-z])b\\1\\2");
77 EXPECT_TRUE(r3.match("a6zb6z", &Matches));
78 EXPECT_EQ(3u, Matches.size());
79 EXPECT_EQ("6", Matches[1].str());
80 EXPECT_EQ("z", Matches[2].str());
81 EXPECT_FALSE(r3.match("a6zb6y"));
82 EXPECT_FALSE(r3.match("a6zb7z"));
6283 }
6384
6485 TEST_F(RegexTest, Substitution) {